In [None]:
# Please make sure the data files are in the same directory as this notebook; adjust the paths below if they are stored elsewhere
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error

# Loading given data.
# DATA_DIR is the single place to change if the CSV files are stored
# somewhere other than this Colab Drive folder.
DATA_DIR = '/content/drive/MyDrive/ColabNotebooks'

# Training set: every column except 'target' is used as a feature.
df = pd.read_csv(f'{DATA_DIR}/internship_train.csv')
X = df.drop('target', axis=1)
Y = df['target']

# Hidden test set that predictions are generated for at the end of the notebook.
test_df = pd.read_csv(f'{DATA_DIR}/internship_hidden_test.csv')

In [None]:
# Preview the first five rows of the training data.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,target
0,236,488,16,221,382,97,-4.472136,0.107472,0,132,...,13.340874,0.870542,1.962937,7.466666,11.547794,8.822916,9.046424,7.895535,11.010677,20.107472
1,386,206,357,232,1,198,7.81025,0.763713,1,143,...,12.484882,7.16868,2.885415,12.413973,10.260494,10.091351,9.270888,3.173994,13.921871,61.763713
2,429,49,481,111,111,146,8.602325,0.651162,1,430,...,14.030257,0.39497,8.160625,12.592059,8.937577,2.265191,11.255721,12.794841,12.080951,74.651162
3,414,350,481,370,208,158,8.306624,0.424645,1,340,...,2.789577,6.416708,10.549814,11.456437,6.468099,2.519049,0.258284,9.317696,5.383098,69.424645
4,318,359,20,218,317,301,8.124038,0.767304,1,212,...,1.88656,1.919999,2.268203,0.149421,4.105907,10.416291,6.816217,8.58696,4.512419,66.767304


Looking at the data, we can see that the columns have very different value ranges, so they are clearly on different scales. Therefore, we should normalize the features. The function below first expands the features with degree-2 polynomial terms and then standardizes the result:

In [None]:
def transform_convert(X):
  """Expand X with degree-2 polynomial features, then standardize them.

  A new PolynomialFeatures and a new StandardScaler are fitted on X on every
  call, so the scaling statistics always come from the data that is passed
  in. Calling this separately on two datasets therefore scales each of them
  with its own statistics.

  Parameters
  ----------
  X : 2-D feature table (e.g. a DataFrame) with one row per sample.

  Returns
  -------
  The standardized polynomial feature matrix produced by
  StandardScaler.transform.
  """
  polynomial_features = PolynomialFeatures(degree=2).fit_transform(X)
  return StandardScaler().fit_transform(polynomial_features)

# Transform from the raw feature columns rather than from X itself, so that
# re-running this cell does not expand/scale already-transformed data.
X = transform_convert(df.drop('target', axis=1))

Now that the data is normalized, it is unclear (at least to me) which regularized regression we should use: ridge or lasso. That is why I decided to go with elastic net, which combines both penalties, after trying several other models.

In [8]:
# Fixed seed so the hold-out split (and therefore the reported RMSE) is
# reproducible across kernel restarts.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=SEED)

# Elastic net with a cross-validated choice of alpha and l1_ratio;
# l1_ratio = 1 corresponds to a pure lasso penalty.
model = ElasticNetCV(l1_ratio=[.1, .5, .7, .95, .99, 1], eps=0.001,
                     n_alphas=100, max_iter=1000)

# Might take 4-6 minutes
model.fit(X_train, y_train)
mod_pred = model.predict(X_test)

# Root mean squared error on the held-out 33% of the training data.
RMSE = np.sqrt(mean_squared_error(y_test, mod_pred))
RMSE

0.04316043996435657

Results are pretty satisfying :)

If we look at the `l1_ratio` selected by cross-validation (`model.l1_ratio_`), we can see that it corresponds to a pure lasso penalty, so there is no difference between Lasso and ElasticNet here.

In [9]:
# Code for making and saving predictions for test df.
# The model was trained on features whose polynomial expansion and scaling
# were fitted on the training data, so the same training-fitted transformers
# must be applied to the test set. Refitting them on the test set would scale
# it with different means/variances than the model saw during training.
train_features = df.drop('target', axis=1)
polynom_converter = PolynomialFeatures(degree=2).fit(train_features)
scaler = StandardScaler().fit(polynom_converter.transform(train_features))

# Drop a 'predictions' column left over from a previous run of this cell so
# the cell can be re-run. Assumes the remaining columns match the training
# feature columns -- TODO confirm against the hidden test file.
test_features = test_df.drop(columns=['predictions'], errors='ignore')
test_transformed = scaler.transform(polynom_converter.transform(test_features))

test_df['predictions'] = model.predict(test_transformed)
test_df.to_csv("predictions.csv")