In [1]:
import math
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as ss

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler, Normalizer

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.linear_model import SGDOneClassSVM

pd.set_option('display.max_columns', None)
plt.rcParams['figure.figsize'] = (15, 20)

In [2]:
# Reading files
# The data directory can be overridden with the KAGGLE_S3E21_DATA_DIR
# environment variable so the notebook is not tied to a single machine.
# When the variable is unset, the original local directory is used.
path = os.environ.get(
    "KAGGLE_S3E21_DATA_DIR",
    "/Users/larst/OneDrive/Documents/GitHub/KaggleS3E21/data/",
)
# Guarantee a trailing separator so `path + "<file name>"` keeps working
# even when the override is given without one.
path = os.path.join(path, "")
data = pd.read_csv(path + "sample_submission.csv")

In [3]:
# Random forest reused by every cross-validation run in this notebook.
# random_state is fixed so repeated fits on the same data give the same model.
rf_params = {
    'n_estimators': 1000,
    'max_depth': 7,
    'n_jobs': -1,
    'random_state': 42,
}
rf = RandomForestRegressor(**rf_params)



In [4]:
# Baseline: 10-fold cross-validation of the forest on `data` as it stands
# at this point. Only 'target' is removed from the predictors; every other
# column of `data` stays in X.
X = data.drop(columns='target')
y = data['target']

# The scorer is the *negated* RMSE, so the printed mean is negative and
# values closer to zero are better.
scores = cross_val_score(
    estimator=rf,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

print(f'RMSE Mean (BASELINE): {scores.mean()}, Standard Deviation: {scores.std()}')



RMSE Mean (BASELINE): -1.3770624373999154, Standard Deviation: 0.639966615607941


In [5]:
# Clipping: target values below 7 become 7 and values above 20 become 20.
# The 'target' column of `data` is overwritten with the clipped values.
data['target'] = data['target'].clip(lower=7, upper=20)



In [6]:
# Outlier stage 1: linear One-Class SVM trained with SGD.
# Only 'id' is dropped, so the detector sees 'target' together with the
# remaining columns.
# random_state is pinned because the SGD solver shuffles the training data
# by default; without a seed the set of flagged rows (and therefore every
# later stage that builds on data_SVM) can change from one run to the next.
svm = SGDOneClassSVM(nu=0.55, random_state=0)
yhat = svm.fit_predict(data.drop(columns='id'))

# fit_predict labels rows flagged as outliers with -1.
is_outlier = yhat == -1
print('Outliers detected (SGD One Class SVM): {}'.format(np.count_nonzero(is_outlier)))

# Keep only the rows that were not flagged and renumber the index.
mask = ~is_outlier
data_SVM = data.loc[mask, :].reset_index(drop=True)

X_SVM = data_SVM.drop(columns='target')
Y_SVM = data_SVM['target']

# Negated RMSE: closer to zero is better.
scores = cross_val_score(
    estimator=rf,
    X=X_SVM,
    y=Y_SVM,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

print(f'[{svm.__class__.__name__}] --> RMSE Mean: {scores.mean()}, Standard Deviation: {scores.std()}')

Outliers detected (SGD One Class SVM): 29
[SGDOneClassSVM] --> RMSE Mean: -0.9854915037766169, Standard Deviation: 0.10181486969531803


In [7]:
# Outlier stage 2: Isolation Forest on the rows kept in data_SVM.
# Only 'id' is dropped, so 'target' is part of the detector input.
iso = IsolationForest(random_state=0)
yhat = iso.fit_predict(data_SVM.drop(columns='id'))

# fit_predict labels rows flagged as outliers with -1.
is_outlier = yhat == -1
print('Outliers detected (Isolation Forest): {}'.format(np.count_nonzero(is_outlier)))

# Keep only the rows that were not flagged and renumber the index.
mask = ~is_outlier
data_ISO = data_SVM.loc[mask, :].reset_index(drop=True)

X_ISO = data_ISO.drop(columns="target")
Y_ISO = data_ISO["target"]

# Negated RMSE: closer to zero is better.
scores = cross_val_score(
    estimator=rf,
    X=X_ISO,
    y=Y_ISO,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

print(f'[{iso.__class__.__name__}] --> RMSE Mean: {scores.mean()}, Standard Deviation: {scores.std()}')

Outliers detected (Isolation Forest): 72
[IsolationForest] --> RMSE Mean: -0.9768844283538188, Standard Deviation: 0.10080394581725005


In [8]:
# Outlier stage 3: Local Outlier Factor on the rows kept in data_ISO.
# Both 'id' and 'target' are dropped here, so this detector only sees the
# remaining feature columns.
lof = LocalOutlierFactor(n_neighbors=3, contamination=0.1)
yhat = lof.fit_predict(data_ISO.drop(columns=['id', 'target']))

# fit_predict labels rows flagged as outliers with -1.
is_outlier = yhat == -1
print('Outliers detected (Local Outlier Factor): {}'.format(np.count_nonzero(is_outlier)))

# Keep only the rows that were not flagged and renumber the index.
mask = ~is_outlier
data_final = data_ISO.loc[mask, :].reset_index(drop=True)

X_LOF = data_final.drop(columns='target')
Y_LOF = data_final['target']

# Negated RMSE: closer to zero is better.
scores = cross_val_score(
    estimator=rf,
    X=X_LOF,
    y=Y_LOF,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

print(f'[{lof.__class__.__name__}] --> RMSE Mean: {scores.mean()}, Standard Deviation: {scores.std()}')

Outliers detected (Local Outlier Factor): 340
[LocalOutlierFactor] --> RMSE Mean: -0.9284264065835146, Standard Deviation: 0.0823662722981291


In [9]:
# Predictors and target of the data left after the outlier stages.
# Only 'target' is removed, so 'id' is still a column of X_final.
X_final = data_final.drop(columns='target')
y_final = data_final['target']



In [10]:
# Fit the forest on the filtered data, then tabulate its feature_importances_
# per predictor, sorted from most to least important.
rf.fit(X_final, y_final)

imp = pd.DataFrame(
    rf.feature_importances_,
    index=X_final.columns,
    columns=['importances'],
).sort_values('importances', ascending=False)

In [11]:
# Feature subset used for the final model. The order of the list is the
# column order handed to the forest.
# Not included: 'NH4_1', 'NH4_5'.
selected_features = [
    'O2_1',
    'O2_2',
    'BOD5_5',
    'NO2_2',
    'O2_4',
    'NH4_6',
    'O2_7',
    'O2_6',
]

# Plain 10-fold CV is used here. Alternative splitter (not used):
# RepeatedKFold(n_splits = 10, n_repeats = 5, random_state = 0)
# The scorer is the negated RMSE: closer to zero is better.
scores = cross_val_score(
    estimator=rf,
    X=X_final[selected_features],
    y=y_final,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

print(f'[ENSEMBLE + FS + CLIPPING_7_20] --> RMSE Mean: {scores.mean()}, Standard Deviation: {scores.std()}')

[ENSEMBLE + FS + CLIPPING_7_20] --> RMSE Mean: -0.9197637551195701, Standard Deviation: 0.08342942332978054


In [12]:
# Trick from https://www.kaggle.com/competitions/playground-series-s3e21/discussion/434519
# Remove the rows whose 'id' appears in bad_labels.
bad_labels = [2365, 1089, 1936, 1680, 211]

# .copy() makes the filtered frame an independent object. data_final gets
# whole columns assigned to it later in the notebook; doing that on the bare
# result of a boolean filter is the pattern pandas reports with
# SettingWithCopyWarning.
data_final = data_final.loc[~data_final['id'].isin(bad_labels)].copy()

In [13]:
# Overwrite every predictor column that is not in selected_features with 0.
# The column names come from X (all columns of `data` except 'target'), so
# 'id' is zeroed as well; 'target' is never touched.
unselected_columns = [name for name in X.columns if name not in selected_features]
for name in unselected_columns:
    data_final[name] = 0



In [14]:
# Build the frame that will be written out: an independent copy of
# data_final with a fresh 0..n-1 index.
submission = data_final.copy().reset_index(drop=True)

# Show the first rows as the cell output.
submission.head()



Unnamed: 0,id,target,O2_1,O2_2,O2_3,O2_4,O2_5,O2_6,O2_7,NH4_1,NH4_2,NH4_3,NH4_4,NH4_5,NH4_6,NH4_7,NO2_1,NO2_2,NO2_3,NO2_4,NO2_5,NO2_6,NO2_7,NO3_1,NO3_2,NO3_3,NO3_4,NO3_5,NO3_6,NO3_7,BOD5_1,BOD5_2,BOD5_3,BOD5_4,BOD5_5,BOD5_6,BOD5_7
0,0,8.59,7.5,9.0,0,9.265,0,8.43,7.15,0,0,0,0,0,1.285,0,0,0.05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16.645,0,0
1,0,9.1,13.533,40.9,0,9.265,0,10.07,7.15,0,0,0,0,0,0.28,0,0,1.36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.725,0,0
2,0,8.21,3.71,5.42,0,9.265,0,10.07,7.15,0,0,0,0,0,0.38,0,0,0.05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6.75,0,0
3,0,8.39,8.7,8.1,0,9.2,0,8.67,6.67,0,0,0,0,0,1.48,0,0,0.05,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.67,0,0
4,0,8.07,8.05,8.65,0,9.265,0,10.07,7.15,0,0,0,0,0,0.28,0,0,0.115,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.4,0,0


In [15]:
# Write the submission to 'submission.csv' in the current working directory.
# index=False keeps the row index out of the file.
submission.to_csv(path_or_buf='submission.csv', index=False)