In [377]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score, KFold

from sklearn.linear_model import LinearRegression,Ridge, Lasso
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import PolynomialFeatures,StandardScaler

from sklearn.metrics import r2_score

In [416]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
#used np.random.seed = 0 so it gives same random numbers
np.random.seed=0

In [379]:
#to check if data contains any duplicate
train_df.drop_duplicates()
train_df.shape
#it checks for columns which have null values
train_df.isna().sum()

Unnamed: 0,0
Feature1,9
Feature2,0
Feature3,0
Feature4,13
Label,0


In [380]:
#to calculate number of percentage of null values exist in column
null_value_percentages=(train_df.isna().sum()/train_df.shape[0])*100
null_value_percentages

#since in class it's mentioned that if null percentage value is less than < 5%
#we can remove rows,which contains null.

Unnamed: 0,0
Feature1,3.75
Feature2,0.0
Feature3,0.0
Feature4,5.416667
Label,0.0


In [381]:
#here we have removed for null percentage <7%, i.e removing all rows which has null values
rows_to_drop=null_value_percentages[null_value_percentages<7].sort_values(ascending=False)
rows_to_drop=rows_to_drop.keys()
for row in rows_to_drop:
	if(null_value_percentages[row]<7):
 		train_df.drop(labels=train_df.index[train_df[row].isna()],inplace=True)
train_df.shape

(218, 5)

In [217]:
# it's commented as we are not using it, but to test data if we replace null value with mean of the column(i.e try another method)
# train_df["Feature4"].fillna(value=train_df["Feature4"].mean(), inplace=True)

In [218]:
# it's commented as we are not using it, but to test data if we replace null value with mean of the column (i.e try another method)
#train_df['Feature1'].fillna(value = train_df['Feature1'].mean(), inplace=True)

In [382]:
train_df.describe()

Unnamed: 0,Feature1,Feature3,Feature4,Label
count,218.0,218.0,218.0,218.0
mean,30.266184,77.151164,59.677684,312.933527
std,11.288466,178.419988,46.19687,474.819509
min,10.432014,-765.458449,0.0,-32.466802
25%,20.967679,24.51813,14.190661,28.680499
50%,28.692677,56.419921,53.137483,141.023046
75%,40.171933,92.265208,108.10715,373.894458
max,49.487028,1260.447274,127.999842,3253.238626


In [383]:
# to convert "Feature2" column as int, for training and testing data model
train_df["Feature2"] = train_df["Feature2"].astype("int")
train_df.describe()

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Label
count,218.0,218.0,218.0,218.0,218.0
mean,30.266184,0.43578,77.151164,59.677684,312.933527
std,11.288466,0.497,178.419988,46.19687,474.819509
min,10.432014,0.0,-765.458449,0.0,-32.466802
25%,20.967679,0.0,24.51813,14.190661,28.680499
50%,28.692677,0.0,56.419921,53.137483,141.023046
75%,40.171933,1.0,92.265208,108.10715,373.894458
max,49.487028,1.0,1260.447274,127.999842,3253.238626


In [384]:
#to divide data into input(x) and output(y)
X = train_df.drop(columns='Label')
print(X)
Y = train_df['Label']
print(Y)

      Feature1  Feature2    Feature3    Feature4
0    26.303954         1   59.919036  126.853479
1    19.646076         0   34.504636  108.363933
2    36.983463         0   98.503396    5.085491
3    36.516512         0   38.418101    1.173449
4    28.734387         0   99.286184   29.123061
..         ...       ...         ...         ...
235  31.769066         0  189.379552    0.002523
236  28.015702         1   85.468516   61.646937
237  31.477720         0    0.360471   19.738584
238  27.198042         0   48.290593   18.924597
239  25.648317         0   14.710356    9.092079

[218 rows x 4 columns]
0       170.361411
1        29.897337
2       373.460027
3        26.696336
4       332.682539
          ...     
235    1277.121058
236     304.899057
237      26.351808
238     102.047530
239      24.894291
Name: Label, Length: 218, dtype: float64


In [385]:
#Remove any row where at least one column has a Z-score greater than 2 (or the specified threshold), meaning that row contains an outlier
def remove_outliers(df, z_thresh=2):
    z_scores = np.abs((df - df.mean()) / df.std())
    return df[(z_scores < z_thresh).all(axis=1)]

In [386]:
#to remove outliers from X and make changes to y also
X = remove_outliers(X)
Y = Y.loc[X.index]

In [387]:
#diving the data to train and test the data
X_train, X_validate, y_train, y_validate = train_test_split(X,Y,test_size = 0.25, random_state = 67)

In [388]:
#standardardizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_validation_scaled = scaler.transform(X_validate)


In [392]:
#function to perform different regularization models so that to get the most efficient
def evaluate_models(degree,alpha):
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train_scaled)
    X_validate_poly = poly.transform(X_validation_scaled)

    models = {
        'Polynomial Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=alpha),
        'Lasso Regression': Lasso(alpha=alpha)
    }
    bname = ''
    bmse = float('inf')
    bmodel = None

    for name, model in models.items():
        model.fit(X_train_poly, y_train)
        y_validate_pred = model.predict(X_validate_poly)
        value_mse = mean_squared_error(y_validate, y_validate_pred)
        print(f'{name} (Degree {degree})  Validation MSE: {value_mse}')

        if value_mse < bmse:
            bname = name
            bmodel = model
            bmse = value_mse


    return bmodel, poly, bmse, bname

In [409]:
#this checks for different polynomial degrees and alphas for regularization
#and provides with the perfect training model and it details
degrees = [1,2, 3, 4, 5]
alphas = [0.01,0.05,0.1,0.15,0.3,0.5,0.9,1,1.1,1.3]
best_name = ''
best_model = None
best_poly = None
best_mse = float('inf')
best_degree = 0
best_alpha = 0

for degree in degrees:
  for alpha in alphas:
    model, poly, mse, name = evaluate_models(degree,alpha)
    if mse < best_mse:
        best_model = model
        best_poly = poly
        best_mse = mse
        best_name = name
        best_degree = degree
        best_alpha = alpha


Polynomial Regression (Degree 1)  Validation MSE: 31932.97617008545
Ridge Regression (Degree 1)  Validation MSE: 31934.11366925672
Lasso Regression (Degree 1)  Validation MSE: 31932.211927323646
Polynomial Regression (Degree 1)  Validation MSE: 31932.97617008545
Ridge Regression (Degree 1)  Validation MSE: 31938.674938228643
Lasso Regression (Degree 1)  Validation MSE: 31929.313244209385
Polynomial Regression (Degree 1)  Validation MSE: 31932.97617008545
Ridge Regression (Degree 1)  Validation MSE: 31944.40182068823
Lasso Regression (Degree 1)  Validation MSE: 31925.39849492182
Polynomial Regression (Degree 1)  Validation MSE: 31932.97617008545
Ridge Regression (Degree 1)  Validation MSE: 31950.156707117574
Lasso Regression (Degree 1)  Validation MSE: 31921.497477407713
Polynomial Regression (Degree 1)  Validation MSE: 31932.97617008545
Ridge Regression (Degree 1)  Validation MSE: 31967.588293606746
Lasso Regression (Degree 1)  Validation MSE: 31909.87668659356
Polynomial Regression (D

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso Regression (Degree 3)  Validation MSE: 2371.4718487432506
Polynomial Regression (Degree 3)  Validation MSE: 2754.355617178716
Ridge Regression (Degree 3)  Validation MSE: 2642.344832355615
Lasso Regression (Degree 3)  Validation MSE: 2343.387412469019
Polynomial Regression (Degree 3)  Validation MSE: 2754.355617178716
Ridge Regression (Degree 3)  Validation MSE: 2634.283071240141
Lasso Regression (Degree 3)  Validation MSE: 2317.6213671516753
Polynomial Regression (Degree 3)  Validation MSE: 2754.355617178716
Ridge Regression (Degree 3)  Validation MSE: 2619.538358642238
Lasso Regression (Degree 3)  Validation MSE: 2265.9711090101523
Polynomial Regression (Degree 4)  Validation MSE: 5554.3189366019815
Ridge Regression (Degree 4)  Validation MSE: 5547.231960941549
Lasso Regression (Degree 4)  Validation MSE: 5555.3666345357315
Polynomial Regression (Degree 4)  Validation MSE: 5554.3189366019815
Ridge Regression (Degree 4)  Validation MSE: 5519.710700246948
Lasso Regression (Degree

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso Regression (Degree 4)  Validation MSE: 4585.867999574303
Polynomial Regression (Degree 4)  Validation MSE: 5554.3189366019815
Ridge Regression (Degree 4)  Validation MSE: 5277.765152922078
Lasso Regression (Degree 4)  Validation MSE: 4329.9583708271575
Polynomial Regression (Degree 4)  Validation MSE: 5554.3189366019815
Ridge Regression (Degree 4)  Validation MSE: 5127.623469701876
Lasso Regression (Degree 4)  Validation MSE: 3902.467191411439
Polynomial Regression (Degree 4)  Validation MSE: 5554.3189366019815
Ridge Regression (Degree 4)  Validation MSE: 5095.986797568178
Lasso Regression (Degree 4)  Validation MSE: 3795.603409836908
Polynomial Regression (Degree 4)  Validation MSE: 5554.3189366019815
Ridge Regression (Degree 4)  Validation MSE: 5066.173756233313
Lasso Regression (Degree 4)  Validation MSE: 3768.3772567091687
Polynomial Regression (Degree 4)  Validation MSE: 5554.3189366019815
Ridge Regression (Degree 4)  Validation MSE: 5011.263197328783
Lasso Regression (Degre

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Polynomial Regression (Degree 5)  Validation MSE: 41621.56883063586
Ridge Regression (Degree 5)  Validation MSE: 18197.038569778306
Lasso Regression (Degree 5)  Validation MSE: 9338.01808133205
Polynomial Regression (Degree 5)  Validation MSE: 41621.56883063586
Ridge Regression (Degree 5)  Validation MSE: 16110.706244845378
Lasso Regression (Degree 5)  Validation MSE: 8303.91012046615
Polynomial Regression (Degree 5)  Validation MSE: 41621.56883063586
Ridge Regression (Degree 5)  Validation MSE: 14739.964185761173
Lasso Regression (Degree 5)  Validation MSE: 7391.612363909666
Polynomial Regression (Degree 5)  Validation MSE: 41621.56883063586
Ridge Regression (Degree 5)  Validation MSE: 12637.02067482332
Lasso Regression (Degree 5)  Validation MSE: 6823.969755658343
Polynomial Regression (Degree 5)  Validation MSE: 41621.56883063586
Ridge Regression (Degree 5)  Validation MSE: 11566.987865429846
Lasso Regression (Degree 5)  Validation MSE: 7386.481307992569
Polynomial Regression (Degre

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Ridge Regression (Degree 5)  Validation MSE: 10852.667433661747
Lasso Regression (Degree 5)  Validation MSE: 7674.600214732541
Polynomial Regression (Degree 5)  Validation MSE: 41621.56883063586
Ridge Regression (Degree 5)  Validation MSE: 10767.332186170037
Lasso Regression (Degree 5)  Validation MSE: 7666.381367310388
Polynomial Regression (Degree 5)  Validation MSE: 41621.56883063586
Ridge Regression (Degree 5)  Validation MSE: 10698.19557789906
Lasso Regression (Degree 5)  Validation MSE: 7719.053249212128
Polynomial Regression (Degree 5)  Validation MSE: 41621.56883063586
Ridge Regression (Degree 5)  Validation MSE: 10592.71273279684
Lasso Regression (Degree 5)  Validation MSE: 7952.340451520302


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [410]:
#this provides with the best model and it's details with ,mse
print(f'Best Model: {best_name} (degree {best_degree} ,alpha {best_alpha})')
print(f'Best Model Validation MSE: {best_mse}')

#it's observed that on increasing value of alpha , leads to overfitting on data model



Best Model: Lasso Regression (degree 2 ,alpha 1.3)
Best Model Validation MSE: 2141.9132671664315


In [417]:
test_df

Unnamed: 0,id,Feature1,Feature2,Feature3,Feature4
0,0,26.520753,False,2.276917,30.081571
1,1,19.260327,False,19.963927,32.54003
2,2,35.662216,False,114.665257,111.156286
3,3,17.693725,False,62.135788,11.251143
4,4,44.396224,False,167.567429,127.518999
5,5,39.486321,False,66.371886,117.147907
6,6,12.166547,False,32.204082,9.238612
7,7,40.238035,True,53.926449,29.840928
8,8,31.62961,True,58.782199,3.546138
9,9,46.008271,True,164.609236,22.277235


In [418]:

#it is used to test the data with the test data given,
# to remove index from testing data(save it as label_add used in last)
label_add = test_df[['id']].astype("int");
test_df = test_df.drop(columns=['id'])
test_scaled = scaler.transform(test_df)
test_poly = best_poly.transform(test_scaled)
poly_pred_test = best_model.predict(test_poly)



In [419]:
#to make a file for output label, as required
result = np.column_stack((label_add, poly_pred_test))

# Create a DataFrame with column names
result = pd.DataFrame(result, columns=['id', 'Label'])
result["id"] = result["id"].astype("int")
result = result.reset_index(drop=True)
result.to_csv('output.csv', index=False)



In [420]:
print("Alpha:", best_model.alpha)
print("L1 Ratio:", best_model.l1_ratio)
print("Coefficients:", best_model.coef_)
print("Intercept:", best_model.intercept_)
print("Number of iterations:", best_model.n_iter_)

Alpha: 1.3
L1 Ratio: 1.0
Coefficients: [  0.           4.29447277   2.12433755 303.78068767  -2.66062201
  -1.89358525  -6.23794056   8.08730934  -6.34396972   0.
   2.27521037   0.         123.19991553   1.61345887  -0.        ]
Intercept: 193.44428597137272
Number of iterations: 20
