In [25]:
from __future__ import print_function, division

In [49]:
# import libraries
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import scale
from sklearn.decomposition import KernelPCA
from sklearn.linear_model import Ridge, ElasticNet, BayesianRidge, RANSACRegressor, LinearRegression, HuberRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [50]:
# update your data path
DATA_PATH = "/home/aunagar/Personal/Study/Sem1/Advanced ML/projects/task1/Task1/"

In [51]:
# import data
train_X = pd.read_csv(DATA_PATH + "X_train.csv")
train_Y = pd.read_csv(DATA_PATH + "y_train.csv")
test_X = pd.read_csv(DATA_PATH + "X_test.csv")
sample_submission = pd.read_csv(DATA_PATH + "sample.csv")

In [52]:
# split data
train_ids = train_X.iloc[:, 0]
train_features = train_X.iloc[:, 1:]
test_ids = test_X.iloc[:, 0]
test_features = test_X.iloc[:, 1:]

In [53]:
######## missing value imputation ########
# train
train_features = train_features.fillna(train_features.mean())
# test
test_features = test_features.fillna(train_features.mean())

####### limiting feature using variance threshold (i.e. remove features with 0 variance) ######
train_features_mean, train_features_std = train_features.mean(), train_features.std()

train_features = train_features.iloc[:, np.where(train_features_std > 0.0)[0]]
test_features = test_features.iloc[:, np.where(train_features_std > 0.0)[0]]

############## Outlier removal ###############
# train
train_features[train_features > train_features_mean + 3.0*train_features_std] = np.nan
train_features[train_features < train_features_mean -3.0*train_features_std] = np.nan
train_features = train_features.fillna(train_features.mean())

# test
test_features[test_features > train_features_mean + 3.0*train_features_std] = np.nan
test_features[test_features < train_features_mean - 3.0*train_features_std] = np.nan
test_features = test_features.fillna(train_features.mean())

In [54]:
##### Normalization #####
# train
train_mean, train_std = train_features.mean(), train_features.std()
train_features = (train_features - train_mean)/train_std
# test 
test_features = (test_features - train_mean)/train_std

In [55]:
###### Correlated feature removal #########
# Create correlation matrix
corr_matrix = train_features.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# Find index of feature columns with correlation greater than 0.7
to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]

# train
train_features = train_features.drop(columns = to_drop)
# test
test_features = test_features.drop(columns = to_drop)

In [56]:
# ##### correlation with respect to Y #######
# corr_values = train_features.corrwith(train_Y['y'], axis = 0)

# # Find index of feature columns with correlation greater than 0.7
# to_drop = [column for column in corr_values.index if corr_values.loc[column] < 0.01]

In [57]:
###### linear model
lr = ElasticNet(alpha = 0.5, l1_ratio=0.5)
validation_score = cross_val_score(lr, train_features, train_Y.iloc[:, 1:], cv = 5, scoring = 'r2')
print(validation_score.mean())

# train model on whole train data
lr.fit(X = train_features, y = train_Y.iloc[:, 1])

0.42487022151743564


ElasticNet(alpha=0.5, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [58]:
# finding to which features to remove
non_zero_weights = np.where(lr.coef_ != 0.)[0]

# removing these features from training data
train_features = train_features.iloc[:, non_zero_weights]
test_features = test_features.iloc[:, non_zero_weights]

In [48]:
###### RANSAC
rfr = RandomForestRegressor(n_estimators=20, max_depth=10)
rsr = RANSACRegressor(base_estimator=rfr, min_samples=0.2)
#validation_score = cross_val_score(rsr, train_features, train_Y.iloc[:, 1:], cv = 5, scoring = 'r2')
#print(validation_score.mean())

# train model on whole train data
rsr.fit(X = train_features, y = train_Y.iloc[:, 1])

RANSACRegressor(base_estimator=RandomForestRegressor(bootstrap=True,
                                                     criterion='mse',
                                                     max_depth=10,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=20,
                                                     n_jobs=None,
                                                     oob_score=False,
                                                    

In [12]:
#### training more complex model on this cleared data
rfr = RandomForestRegressor(n_estimators=1000, max_depth=15)
validation_score = cross_val_score(rfr, train_features, train_Y.iloc[:, 1], cv = 5, scoring= 'r2')
print(validation_score.mean())

rfr.fit(X= train_features, y = train_Y.iloc[:, 1].values)

0.5027772868644284


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [56]:
#### training more complex model on this cleared data
gbr = GradientBoostingRegressor(learning_rate= 1e-03, n_estimators=5000, subsample=0.3, max_depth=10,
                                max_features='sqrt', n_iter_no_change=5, validation_fraction=0.3, tol = 1e-06)
validation_score = cross_val_score(gbr, train_features, train_Y.iloc[:, 1], cv = 5, scoring= 'r2')
print(validation_score.mean())

gbr.fit(X= train_features, y = train_Y.iloc[:, 1].values)

0.4812409612845605


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.001, loss='ls', max_depth=10,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=5000,
                          n_iter_no_change=5, presort='auto', random_state=None,
                          subsample=0.3, tol=1e-06, validation_fraction=0.3,
                          verbose=0, warm_start=False)

In [44]:
best_model = rfr

In [45]:
predicted = best_model.predict(test_features)
sample_submission['y'] = predicted

In [46]:
sample_submission.to_csv("submissions/Ajay_3rd_sub.csv", index = False)