In [53]:
from __future__ import print_function, division

In [54]:
import warnings
warnings.filterwarnings('ignore')

In [55]:
# import libraries
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import r2_score
import xgboost as xgb
from sklearn.preprocessing import scale
from sklearn.decomposition import KernelPCA
from sklearn.linear_model import Ridge, ElasticNet, BayesianRidge
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [56]:
# update your data path
DATA_PATH = "/home/aunagar/Personal/Study/Sem1/Advanced ML/projects/task1/Task1/"

In [57]:
# import data
train_X = pd.read_csv(DATA_PATH + "X_train.csv")
train_Y = pd.read_csv(DATA_PATH + "y_train.csv")
test_X = pd.read_csv(DATA_PATH + "X_test.csv")
sample_submission = pd.read_csv(DATA_PATH + "sample.csv")

In [58]:
# split data
train_ids = train_X.iloc[:, 0]
train_features = train_X.iloc[:, 1:]
test_ids = test_X.iloc[:, 0]
test_features = test_X.iloc[:, 1:]

In [59]:
######## missing value imputation (median) ########
# train
train_features = train_features.fillna(train_features.mean())
# test
test_features = test_features.fillna(train_features.mean())


####### limiting feature using variance threshold (i.e. remove features with 0 variance) ######
train_features_mean, train_features_std = train_features.mean(), train_features.std()

train_features = train_features.iloc[:, np.where(train_features_std > 0.0)[0]]
test_features = test_features.iloc[:, np.where(train_features_std > 0.0)[0]]

############## Outlier removal ###############
train_features_mean, train_features_std = train_features.mean(), train_features.std()
# train
train_features[train_features > train_features_mean + 2.*train_features_std] = np.nan
train_features[train_features < train_features_mean -2.*train_features_std] = np.nan
train_features = train_features.fillna(train_features.mean())

# test
test_features[test_features > train_features_mean + 2.*train_features_std] = np.nan
test_features[test_features < train_features_mean - 2.*train_features_std] = np.nan
test_features = test_features.fillna(train_features.mean())

In [60]:
##### Normalization #####
# train
train_mean, train_std = train_features.mean(), train_features.std()
train_features = (train_features - train_mean)/train_std
# test 
test_features = (test_features - train_mean)/train_std

In [61]:
###### Correlated feature removal #########
# Create correlation matrix
corr_matrix = train_features.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# Find index of feature columns with correlation greater than 0.7
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]

# # train
# train_features = train_features.drop(columns = to_drop)
# # test
# test_features = test_features.drop(columns = to_drop)

In [73]:
alpha = np.arange(0.1, 2., 0.4)
l1_ratio = np.arange(0.1,1.,0.4)
results = pd.DataFrame(columns=['alpha', 'l1_ratio', 'cv_score'])

In [74]:
###### linear model
for a in alpha:
    for l1 in l1_ratio:
        lr = ElasticNet(alpha = a, l1_ratio=l1)
        validation_score = cross_val_score(lr, train_features, train_Y.iloc[:, 1:], cv = 5, scoring = 'r2')
        print(a, ", ", l1, ", ", validation_score.mean(), "\n")

        # train model on whole train data
        lr.fit(X = train_features, y = train_Y.iloc[:, 1])
        # finding to which features to remove
        non_zero_weights = np.where(lr.coef_ != 0.)[0]

        # removing these features from training data
        train_features = train_features.iloc[:, non_zero_weights]
        test_features = test_features.iloc[:, non_zero_weights]
        
        #### training more complex model on this cleared data
        rfr = RandomForestRegressor(n_estimators=500, max_depth=5, verbose = False, n_jobs = -1)
        validation_score = cross_val_score(rfr, train_features, train_Y.iloc[:, 1], cv = 5, scoring= 'r2')
        print("Rfr validation score: ", validation_score.mean(), "\n")
        results = results.append({'alpha':a, 'l1_ratio':l1, 'cv_score':validation_score.mean()}, ignore_index = True)

0.1 ,  0.1 ,  0.46467041322348057 

Rfr validation score:  0.4856688936687591 

0.1 ,  0.5 ,  0.46410770011941815 

Rfr validation score:  0.4852579943996792 

0.1 ,  0.9 ,  0.4636486085630616 

Rfr validation score:  0.4843836982398395 

0.5 ,  0.1 ,  0.46449754018927586 

Rfr validation score:  0.485459171063605 

0.5 ,  0.5 ,  0.4620328366779666 

Rfr validation score:  0.48612076706730767 

0.5 ,  0.9 ,  0.4580077189508291 

Rfr validation score:  0.48624439605699166 

0.9 ,  0.1 ,  0.4596991601066714 

Rfr validation score:  0.48676272348825345 

0.9 ,  0.5 ,  0.4538195087384194 

Rfr validation score:  0.48330467003672295 

0.9 ,  0.9 ,  0.443850293027659 

Rfr validation score:  0.4869515021759798 

1.3000000000000003 ,  0.1 ,  0.4526003150049108 

Rfr validation score:  0.48629305200567535 

1.3000000000000003 ,  0.5 ,  0.4413911678543515 

Rfr validation score:  0.4866178286937153 

1.3000000000000003 ,  0.9 ,  0.42162898096640894 

Rfr validation score:  0.48664288066782735 


In [23]:
# finding to which features to remove
non_zero_weights = np.where(lr.coef_ != 0.)[0]

# removing these features from training data
train_features = train_features.iloc[:, non_zero_weights]
test_features = test_features.iloc[:, non_zero_weights]

In [28]:
xgbr = xgb.XGBRegressor(learning_rate=0.4, max_depth=5, n_estimators=500)
validation_score = cross_val_score(xgbr, train_features, train_Y.iloc[:, 1], cv = 5, scoring = 'r2')
print(validation_score.mean())

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


0.41871526873359244


### Model Prediction

In [31]:
best_model = rfr

In [32]:
predicted = best_model.predict(test_features)
sample_submission['y'] = predicted

In [33]:
sample_submission.to_csv("submissions/Ajay_4th_sub.csv", index = False)