In [1]:
from __future__ import print_function, division

In [2]:
# import libraries
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.preprocessing import scale
from sklearn.decomposition import KernelPCA
from sklearn.linear_model import Ridge, ElasticNet, BayesianRidge
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

In [63]:
# update your data path
DATA_PATH = "/home/aunagar/Personal/Study/Sem1/Advanced ML/projects/task1/Task1/"

In [64]:
# import data
train_X = pd.read_csv(DATA_PATH + "X_train.csv")
train_Y = pd.read_csv(DATA_PATH + "y_train.csv")
test_X = pd.read_csv(DATA_PATH + "X_test.csv")
sample_submission = pd.read_csv(DATA_PATH + "sample.csv")

In [65]:
# split data
train_ids = train_X.iloc[:, 0]
train_features = train_X.iloc[:, 1:]
test_ids = test_X.iloc[:, 0]
test_features = test_X.iloc[:, 1:]

### Normalization and Preprocessing

In [67]:
######## normalization ##########
# train
train_mean, train_std = train_features.mean(), train_features.std()
train_features = (train_features - train_mean)/train_std
# test 
test_features = (test_features - train_mean)/train_std

######## missing value imputation ########
# train
train_features = train_features.fillna(train_features.mean())
# test
test_features = test_features.fillna(train_features.mean())

# there are some columns which were 0.0 everywhere
null_columns = train_features.columns[train_features.loc[:,: ].isnull().sum() > 0]
# train
train_features = train_features.drop(columns = null_columns)
# test
test_features = test_features.drop(columns = null_columns)

############## Outlier removal ###############
# train
train_features[train_features > 3.0] = np.nan
train_features[train_features < -3.0] = np.nan
train_features = train_features.fillna(train_features.mean())

# test
test_features[test_features > 3.0] = np.nan
test_features[test_features < -3.0] = np.nan
test_features = test_features.fillna(train_features.mean())

### Dimensionality Reduction

In [68]:
###### Correlated feature removal #########
# Create correlation matrix
corr_matrix = train_features.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]

# train
train_features = train_features.drop(columns = to_drop)
# test
test_features = test_features.drop(columns = to_drop)

In [69]:
####### Decomposition ##########
transformer = KernelPCA(n_components=50, kernel = 'linear')
transformer.fit(train_features)

# train
train_features = transformer.transform(train_features)
# test
test_features = transformer.transform(test_features)

### Modeling and cross validation

In [70]:
###### linear model
lr = ElasticNet(alpha = 0.5, l1_ratio=0.5)
validation_score = cross_val_score(lr, train_features, train_Y.iloc[:, 1:], cv = 5, scoring = 'r2')
print(validation_score.mean())

# train model on whole train data
lr.fit(X = train_features, y = train_Y.iloc[:, 1])

0.3353383493686912


ElasticNet(alpha=0.5, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [73]:
#### BayesianRegression
br = BayesianRidge(n_iter=1000)
validation_score = cross_val_score(br, train_features, train_Y.iloc[:, 1:], cv = 5, scoring= 'r2')
print(validation_score.mean())

br.fit(X= train_features, y = train_Y.iloc[:, 1].values)

0.33119082995724075


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
              fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=1000,
              normalize=False, tol=0.001, verbose=False)

In [77]:
#### Random Forest Regression
rfr = RandomForestRegressor(n_estimators=200, criterion='mse', min_samples_split = 10, max_depth = 6)
validation_score = cross_val_score(rfr, train_features, train_Y.iloc[:, 1], cv = 5, scoring= 'r2')
print(validation_score.mean())

rfr.fit(X= train_features, y = train_Y.iloc[:, 1].values)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


0.32784461451863967


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=10,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

### Predictions

In [52]:
best_model = 

In [53]:
predicted = best_model.predict(test_features)

In [58]:
sample_submission['y'] = predicted

In [61]:
sample_submission.to_csv("submissions/Ajay_1st_sub.csv", index = False)