In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)


import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (15, 20)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler, Normalizer

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.linear_model import SGDOneClassSVM

import math
import scipy.stats as ss

In [2]:
# Load the competition CSV into `data`.
# NOTE(review): `path` is an absolute, machine-specific directory; the notebook
# will not run elsewhere until it is pointed at a local copy of the data.
path = "/Users/larst/OneDrive/Documents/GitHub/KaggleS3E21/data/"
csv_file = path + "sample_submission.csv"
data = pd.read_csv(csv_file)

In [3]:
# Regressor shared by every cross-validation run and the final fit below.
# Seeded so repeated runs build the same forest; n_jobs=-1 uses all cores.
rf = RandomForestRegressor(n_estimators=1000, max_depth=7, random_state=42, n_jobs=-1)



In [4]:
# Baseline: 10-fold CV of the forest on the data as loaded.
# Every column except `target` is used as a feature.
y = data.target
X = data.drop(columns='target')

# The scorer returns *negated* RMSE, so the printed mean is negative
# (closer to zero is better).
scores = cross_val_score(
    rf, X, y,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

print(f'RMSE Mean (BASELINE): {scores.mean()}, Standard Deviation: {scores.std()}')



RMSE Mean (BASELINE): -1.3770624373999154, Standard Deviation: 0.6399666156079411


In [5]:
# Clip the target into [7, 20]: values outside the range are replaced by the
# nearest bound. This overwrites the `target` column of `data` in place.
data['target'] = data['target'].clip(lower=7, upper=20)



In [6]:
# Outlier filter, stage 1: one-class SVM trained with SGD.
# The detector is fitted on every column except `id`, so `target` takes part
# in the outlier decision as well.
# random_state is pinned because the SGD fit is stochastic: without a seed the
# set of flagged rows (and every frame derived from it) can change between runs.
svm = SGDOneClassSVM(nu=0.55, random_state=0)
yhat = svm.fit_predict(data.drop(columns='id'))

n_outliers = np.count_nonzero(yhat == -1)
print('Outliers detected (SGD One Class SVM): {}'.format(n_outliers))

# fit_predict marks outliers with -1; keep every other row.
mask = yhat != -1
data_SVM = data.loc[mask, :].reset_index(drop=True)

X_SVM = data_SVM.drop(columns='target')
Y_SVM = data_SVM.target

# Negated RMSE (see scorer name): values closer to zero are better.
scores = cross_val_score(
    rf, X_SVM, Y_SVM,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

print(f'[{svm.__class__.__name__}] --> RMSE Mean: {scores.mean()}, Standard Deviation: {scores.std()}')

Outliers detected (SGD One Class SVM): 29
[SGDOneClassSVM] --> RMSE Mean: -0.9854915037766169, Standard Deviation: 0.101814869695318


In [None]:
# Outlier filter, stage 2: Isolation Forest on the rows kept by the SVM stage.
# As in stage 1, only `id` is excluded from the fit (`target` is included).
iso = IsolationForest(random_state=0)
yhat = iso.fit_predict(data_SVM.drop(columns='id'))

n_outliers = np.count_nonzero(yhat == -1)
print('Outliers detected (Isolation Forest): {}'.format(n_outliers))

# fit_predict marks outliers with -1; keep every other row.
mask = yhat != -1
data_ISO = data_SVM.loc[mask, :].reset_index(drop=True)

X_ISO = data_ISO.drop(columns="target")
Y_ISO = data_ISO.target

# Negated RMSE (see scorer name): values closer to zero are better.
scores = cross_val_score(
    rf, X_ISO, Y_ISO,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

print(f'[{iso.__class__.__name__}] --> RMSE Mean: {scores.mean()}, Standard Deviation: {scores.std()}')

Outliers detected (Isolation Forest): 72


In [None]:
# Outlier filter, stage 3: Local Outlier Factor on the Isolation Forest survivors.
# Unlike the two earlier stages, both `id` and `target` are excluded from the fit.
lof = LocalOutlierFactor(n_neighbors=3, contamination=0.12)
yhat = lof.fit_predict(data_ISO.drop(columns=['id', 'target']))

n_outliers = np.count_nonzero(yhat == -1)
print('Outliers detected (Local Outlier Factor): {}'.format(n_outliers))

# fit_predict marks outliers with -1; keep every other row.
mask = yhat != -1
data_final = data_ISO.loc[mask, :].reset_index(drop=True)

X_LOF = data_final.drop(columns='target')
Y_LOF = data_final.target

# Negated RMSE (see scorer name): values closer to zero are better.
scores = cross_val_score(
    rf, X_LOF, Y_LOF,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

print(f'[{lof.__class__.__name__}] --> RMSE Mean: {scores.mean()}, Standard Deviation: {scores.std()}')

In [None]:
# Feature matrix (everything but `target`) and target vector of the filtered data.
X_final = data_final.drop(columns='target')
y_final = data_final['target']



In [None]:
# Fit the forest on the filtered data, then tabulate its feature importances,
# largest first, indexed by column name.
rf.fit(X_final, y_final)

importance_by_feature = {'importances': rf.feature_importances_}
imp = pd.DataFrame(importance_by_feature, index=X_final.columns)
imp = imp.sort_values(by='importances', ascending=False)

In [None]:
# Columns kept as model features, in the order they are passed to the model.
# Candidates currently disabled: 'NH4_5', 'O2_7', 'O2_6'.
selected_features = ['O2_1', 'O2_2', 'BOD5_5', 'NO2_2', 'O2_4', 'NH4_6', 'NH4_1']

In [None]:
# 10-fold CV of the forest restricted to the selected feature columns.
# Alternative splitter (currently disabled):
#   kf = RepeatedKFold(n_splits = 10, n_repeats = 5, random_state = 0)
# Scores are negated RMSE (see scorer name): closer to zero is better.
X_selected = X_final[selected_features]
scores = cross_val_score(
    rf, X_selected, y_final,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

print(f'[ENSEMBLE + FS + CLIPPING_7_20] --> RMSE Mean: {scores.mean()}, Standard Deviation: {scores.std()}')

In [None]:
# Trick from https://www.kaggle.com/competitions/playground-series-s3e21/discussion/434519
# Remove the rows whose `id` is on the list below.
# This must run before the cell that zeroes the unselected columns, because
# that cell overwrites `id` as well.
bad_labels = [2365, 1089, 1936, 1680, 211, 2294, 448, 437, 309, 1684]

is_bad = data_final['id'].isin(bad_labels)
data_final = data_final[~is_bad]

In [None]:
# Overwrite every column of X that is not a selected feature with the constant 0.
# X was built by dropping only `target`, so `id` is zeroed here too.
#
# `assign` builds a new frame in one step instead of writing column by column
# into `data_final`, which at this point is the result of a boolean filter and
# would raise pandas' SettingWithCopyWarning on item assignment.
unselected_columns = [column for column in X.columns if column not in selected_features]
data_final = data_final.assign(**dict.fromkeys(unselected_columns, 0))



In [None]:
# Randomly remove up to 15 rows whose target lies strictly between 8.5 and 8.7
# (presumably to flatten a spike in the target distribution; confirm against
# the density plot in the next cell).
# NOTE: this cell is not idempotent; each re-run removes further rows.
in_band = (data_final['target'] > 8.5) & (data_final['target'] < 8.7)
df_test = data_final[in_band]

# Same band taken from `data`; not used by the drop below.
df_test_data = data[(data['target'] > 8.5) & (data['target'] < 8.7)]

# A seeded generator makes the dropped rows, and therefore the submission,
# reproducible. Capping at the band size avoids a ValueError from sampling
# without replacement when fewer than 15 rows are available.
n_drop = min(15, len(df_test))
drop_index = np.random.default_rng(0).choice(
    df_test.index.to_numpy(), size=n_drop, replace=False
)
data_final = data_final.drop(drop_index)

data_final.shape

In [None]:
# Density of target values below 10: processed frame first, then `data`.
# The legend labels are applied in drawing order.
low_target_final = data_final[data_final['target'] < 10]
low_target_data = data[data['target'] < 10]

sns.kdeplot(data=low_target_final, x="target")
sns.kdeplot(data=low_target_data, x="target")
plt.legend(['data', 'original_df'])
plt.title('target distribution')



In [None]:
# Build the frame that is written out as the submission: renumber the rows
# from 0 and take an independent copy of the result.
submission = data_final.reset_index(drop=True)
submission = submission.copy()

submission.head()



In [None]:
# Display (rows, columns) of the submission frame as a final sanity check.
submission.shape

In [None]:
# Write the submission to the working directory, without the row index.
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# NOTE(review): these are the same statements as in the Local Outlier Factor
# cell. Running them rebinds `data_final` to the frame filtered by the current
# `mask`, discarding the later row drops and column zeroing. Confirm whether
# this cell is still needed.
data_final = data_ISO[mask].reset_index(drop=True)

X_LOF = data_final.drop(columns='target')
Y_LOF = data_final['target']