In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest, chi2, RFECV, RFE, SelectFromModel
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

# 1. Load training data and test data

## 1.1 Read the csv files and get an overview of the data

In [None]:
# Load the raw CSVs: training features, training targets and test features
# ("df" is short for DataFrame).
X_df = pd.read_csv('X_train.csv')
y_df = pd.read_csv('y_train.csv')
X_test_df = pd.read_csv('X_test.csv')

# Overview: dimensions of the training frames, followed by their last rows.
print(f"Dimension of X: {X_df.shape}")
print(f"Dimension of y: {y_df.shape}")
print(X_df.tail(), y_df.tail(), sep="\n")

Dimension of X: (1212, 833)
Dimension of y: (1212, 2)
          id        x0             x1            x2           x3  \
1207  1207.0  9.895208  832442.883343  20585.510774  1082.657477   
1208  1208.0  9.838322  832442.816133  20585.530631  1020.816077   
1209  1209.0  9.267322  832442.815687  20585.525811  1076.663108   
1210  1210.0       NaN  832442.826707  20585.564056  1076.359217   
1211  1211.0  9.070810  832442.841694  20585.531908  1077.977276   

                x4         x5             x6            x7            x8  ...  \
1207  1.107019e+06  10.915545  597900.459438           NaN  1.020097e+06  ...   
1208  9.834291e+05        NaN  597900.428561  10842.043309  1.110318e+06  ...   
1209  7.651645e+05   9.579855            NaN           NaN  1.092330e+06  ...   
1210  9.332198e+05   8.863739  597900.402740  13121.042555  8.245987e+05  ...   
1211  9.568525e+05  10.894921  597900.446646   9947.064942  1.010069e+06  ...   

              x822         x823          x824     

## 1.2 Get rid of the id column

In [None]:
# Column 0 of each frame holds the sample id; keep only columns 1 onward.
X_data, y_data, X_test = (frame.iloc[:, 1:] for frame in (X_df, y_df, X_test_df))

print(f"X_data shape: {X_data.shape}")
print(f"y_data shape: {y_data.shape}")
print(f"X_test shape: {X_test.shape}")

X_data shape: (1212, 832)
y_data shape: (1212, 1)
X_test shape: (776, 832)


# 2. Fill the missing values

## 2.1 Get to know how many missing values there are

In [None]:
# Count NaNs down each column, i.e. per feature.
feature_missing_values = X_data.isnull().sum(axis="index")
print("Number of missing values for each feature:", feature_missing_values, sep="\n ")

# Count NaNs across each row, i.e. per record.
record_missing_values = X_data.isnull().sum(axis="columns")
print("Number of missing values for each record:", record_missing_values, sep="\n ")

Number of missing values for each feature:
 x0       94
x1       98
x2       95
x3      106
x4       95
       ... 
x827    103
x828     97
x829    100
x830     88
x831    121
Length: 832, dtype: int64
Number of missing values for each record:
 0       59
1       67
2       61
3       57
4       62
        ..
1207    55
1208    76
1209    60
1210    51
1211    69
Length: 1212, dtype: int64


## 2.2 Using median

In [None]:
# X_data_1 = X_data.fillna(X_data.median())
# X_test_1 = X_test.fillna(X_test.median())
# missing_values = X_data_1.isna().any() # any(): Return whether any element is True, potentially over an axis.
# print ("After filling, number of missing values for each feature:\n", missing_values)

After filling, number of missing values for each feature:
 x0      False
x1      False
x2      False
x3      False
x4      False
        ...  
x827    False
x828    False
x829    False
x830    False
x831    False
Length: 832, dtype: bool


## 2.3 Using KNN

In [None]:
# KNNImputer picks neighbours by distance, so the features are first brought to
# comparable scales. RobustScaler disregards NaNs when fitting and leaves them
# in place on transform, so the imputer still sees the missing entries.
scaler = RobustScaler(quantile_range=(10, 90))
X_data_sc = pd.DataFrame(scaler.fit_transform(X_data), columns=X_data.columns)
# Apply the statistics learned on the training set. Refitting on the test set
# would scale train and test differently and make the downstream model see
# test features in a different space from the one it was trained on.
X_test_sc = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

imputer = KNNImputer(n_neighbors=5)
X_data_1 = pd.DataFrame(imputer.fit_transform(X_data_sc), columns=X_data.columns)
# Same reasoning: fill test gaps from training-set neighbours rather than
# refitting the imputer on the test set.
X_test_1 = pd.DataFrame(imputer.transform(X_test_sc), columns=X_test.columns)

print("Missing values", X_data_1.isna().any())

# 3. Outlier detection

## 3.1 Using IsolationForest

In [None]:
# Fit an isolation forest on the imputed training features and keep only the
# samples it labels as inliers (predict() returns +1 for inliers, -1 for outliers).
# random_state is fixed so the set of dropped rows is the same on every run.
iso_forest = IsolationForest(n_estimators=500, contamination=0.05, random_state=1)
iso_forest = iso_forest.fit(X_data_1)
isof_outliers = iso_forest.predict(X_data_1)
inlier_mask = isof_outliers == 1
X_data_2 = X_data_1.iloc[inlier_mask, :]
# Rebuild the targets from the untouched y_df (id column dropped) instead of
# filtering y_data in place: the original overwrote y_data with an ndarray, so
# re-running this cell failed on `.iloc`.
y_data = y_df.iloc[:, 1:].iloc[inlier_mask].values
print(X_data_2.shape)

# No rows are removed from the test set; it is passed through unchanged.
X_test_2 = X_test_1

# # Delete nothing
# X_data_2 = X_data_1
# X_test_2 = X_test_1
# y_data = y_data.values

(1151, 832)


# 4. Feature selection

## 4.1 Using SelectKBest

In [None]:
# selector = SelectKBest(score_func=chi2, k=85)
# selector = selector.fit(X_data_2.abs(), y = y_data)
# X_data_3 = selector.transform(X_data_2)
# X_test_3 = selector.transform(X_test_2)
# print("X_data shape", X_data_3.shape)

X_data shape (1151, 85)


## 4.2 Using SelectFromModel

In [None]:
# For RFR
# estimator2 = RandomForestRegressor()
# selector2 = SelectFromModel(estimator2, max_features=150, threshold="mean")
# selector2 = selector2.fit(X_data_2, y_data)
# X_data_3 = selector2.transform(X_data_2)
# X_test_3 = selector2.transform(X_test_2)

# For SVR
# estimator2 = KernelRidge()
# selector2 = SelectFromModel(estimator2, max_features=150, threshold="mean", importance_getter='dual_coef_')
# selector2 = selector2.fit(X_data_2, y_data.ravel())
# X_data_3 = selector2.transform(X_data_2)
# X_test_3 = selector2.transform(X_test_2)

## 4.3 Using RFE

In [None]:
# Make RFE faster: pre-filter to the 300 highest-scoring features before running
# the (expensive) recursive elimination.
# chi2 requires non-negative inputs, hence the .abs() when fitting; transform()
# only selects columns, so the signed values are kept in the output.
# NOTE(review): chi2 is a classification score that treats every distinct target
# value as a class; f_regression / mutual_info_regression are the usual choices
# for a regression target - confirm this is intended.
y_flat = y_data.ravel()  # 1-D target, consistent with the RFE fit below
selector = SelectKBest(score_func=chi2, k=300)
selector = selector.fit(X_data_2.abs(), y=y_flat)
X_data_2_c = selector.transform(X_data_2)
X_test_2_c = selector.transform(X_test_2)

# Recursive feature elimination with a random forest: drop 5 features per
# iteration until 85 remain.
estimator = RandomForestRegressor(n_estimators=320, min_samples_split=3)
selector = RFE(estimator, step=5, n_features_to_select=85, verbose=2)
selector = selector.fit(X_data_2_c, y_flat)
X_data_3 = selector.transform(X_data_2_c)
X_test_3 = selector.transform(X_test_2_c)

In [None]:
# estimator = RandomForestRegressor(n_estimators=200, min_samples_split=3)
# selector = RFE(estimator, step=5, n_features_to_select=85, verbose=2)
# selector = selector.fit(X_data_2, y_data.ravel())
# X_data_3 = selector.transform(X_data_2)
# X_test_3 = selector.transform(X_test_2)

## 4.4 Using RFECV

In [None]:
# Pre-filter to the 400 highest-scoring features so the cross-validated
# elimination below stays tractable. chi2 requires non-negative inputs, hence
# the .abs() when fitting; transform() only selects columns.
y_flat = y_data.ravel()  # 1-D target, consistent with the RFECV fit below
selector = SelectKBest(score_func=chi2, k=400)
selector = selector.fit(X_data_2.abs(), y=y_flat)
X_data_2_c = selector.transform(X_data_2)
X_test_2_c = selector.transform(X_test_2)

# Cross-validated recursive elimination: remove 10 features per iteration and
# let 5-fold CV choose the final count. RFECV has no `n_features_to_select`
# argument (passing it raises TypeError); `min_features_to_select` sets the
# lower bound, so at least 85 features are kept.
estimator = RandomForestRegressor(n_estimators=320, min_samples_split=3)
selector = RFECV(estimator, step=10, cv=5, min_features_to_select=85, verbose=2)
selector = selector.fit(X_data_2_c, y_flat)
X_data_3 = selector.transform(X_data_2_c)
X_test_3 = selector.transform(X_test_2_c)

# 5. Model Selection

In [None]:
# Hold out 20% of the selected-feature samples for validation; the fixed seed
# keeps the split identical between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_data_3,
    y_data,
    test_size=0.2,
    random_state=1,
)
print(f"X_train shape:  {X_train.shape}")

X_train shape:  (920, 85)


## 5.1 KernelRidgeRegression
Kernel ridge regression performed poorly: the best R² score was only about 0.2.

In [None]:
# model = KernelRidge()
# parameters = {'alpha':[0.01, 0.05, 0.1, 0.2, 0.5, 0.8, 1.0, 3.0, 5.0], 'kernel':['chi2', 'rbf', 'sigmoid', 'laplacian'], 'gamma':[0.1, 1, 5, 10]}
# grid = GridSearchCV(model, parameters, scoring='r2', cv=5, verbose=3, n_jobs=-1)
# grid.fit(X_train, y_train.ravel())
# print(grid.best_estimator_)
# print(grid.best_score_)

# y_pred = grid.predict(X_valid)
# r_score = r2_score(y_valid, y_pred)
# print("Validation score:", r_score)

## 5.2 ExtraTreesRegression

In [None]:
# model = ExtraTreesRegressor()
# parameters = {'n_estimators':[100, 125, 150, 175, 200, 300, 325, 350,400], 'min_samples_split':[2, 3, 4]}
# grid = GridSearchCV(model, parameters, scoring='r2', cv=5, verbose=3, n_jobs=-1)
# grid.fit(X_train, y_train)
# print(grid.best_estimator_)
# print(grid.best_score_)

# y_pred = grid.predict(X_valid)
# r_score = r2_score(y_valid, y_pred)
# print("Validation score:", r_score)

## 5.3 RandomForestRegression

In [None]:
# Exhaustive 5-fold grid search over forest size and minimum split size,
# scored by R^2 and run on all available cores.
parameters = {
    'n_estimators': [200, 300, 325, 350, 400, 430, 450, 500],
    'min_samples_split': [2, 3, 4, 5],
}
model = RandomForestRegressor()
grid = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train.ravel())
print(grid.best_estimator_, grid.best_score_, sep="\n")

# Score the refit best model on the held-out validation split.
y_pred = grid.predict(X_valid)
r_score = r2_score(y_valid, y_pred)
print("Validation score:", r_score)



Fitting 5 folds for each of 32 candidates, totalling 160 fits
RandomForestRegressor(n_estimators=200)
0.4998341358542165
Validation score: 0.4525354011722176


In [None]:
# estimator2 = grid.best_estimator_
# selector2 = SelectFromModel(estimator2, max_features=150, threshold="mean")
# selector2 = selector2.fit(X_data_2, y_data.ravel())
# X_data_3 = selector2.transform(X_data_2)
# X_test_3 = selector2.transform(X_test_2)

# X_train, X_valid, y_train, y_valid = train_test_split(X_data_3, y_data, test_size=0.2, random_state=1)
# print("X_train shape: ", X_train.shape)

# rfr = estimator2.fit(X_train, y_train.ravel())
# y_pred = rfr.predict(X_valid)
# r_score = r2_score(y_valid,y_pred)
# print("Validation score:", r_score)


In [None]:
# Fit `estimator` (the RandomForestRegressor configured in the feature-selection
# section) on the training split and score it on the validation split.
# y_train is raveled to 1-D, matching every other fit call in this notebook and
# avoiding the column-vector conversion warning.
rfr = estimator.fit(X_train, y_train.ravel())
y_pred = rfr.predict(X_valid)
r_score = r2_score(y_valid, y_pred)
print("Validation score:", r_score)

## 5.4 SupportVectorRegression
Support vector regression reached an R² score of about 0.5.

In [None]:
# model = SVR()
# parameters = {'kernel':['sigmoid', 'rbf', 'poly'], 'degree':[3, 4, 5, 7, 9], 'gamma':[0.005, 0.01, 0.05, 0.1, 1], 'C':[0.01, 0.05, 0.1, 1, 2]}
# grid = GridSearchCV(model, parameters, scoring='r2', cv=5, verbose=1, n_jobs=8)
# grid.fit(X_train, y_train.ravel())
# print(grid.best_estimator_)
# print(grid.best_score_)

# y_pred = grid.predict(X_valid)
# r_score = r2_score(y_valid, y_pred)
# print("Validation score:", r_score)

# 6. Export the prediction

In [None]:
# rfr = grid.best_estimator_
# Refit the chosen model on all (outlier-filtered) training samples before
# predicting the test set.
rfr = rfr.fit(X_data_3, y_data.ravel())

y_test = rfr.predict(X_test_3)
# Take the ids from the TEST frame (column 0, the same id column dropped when
# building X_test). The original read X_df['id'] (training ids) and relied on
# zip() silently truncating them to the number of predictions.
data_id = X_test_df.iloc[:, 0]
assert len(data_id) == len(y_test), "one prediction is expected per test id"
result_table = pd.DataFrame({'id': data_id.to_numpy(), 'y': y_test})
result_table.to_csv('rfr7.csv', index = False)
# Last expression of the cell, so the preview is actually displayed.
result_table.tail()