##  Configuring Environment

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

## Exploratory Data Analysis ( EDA )

In [8]:
print("Train Data Overview:")
print(train_data.info())
print(train_data.describe(),end="\n\n\n")


print("Test Data Overview:")
print(test_data.info())
print(test_data.describe())

Train Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Feature1  231 non-null    float64
 1   Feature2  240 non-null    bool   
 2   Feature3  240 non-null    float64
 3   Feature4  227 non-null    float64
 4   Label     240 non-null    float64
dtypes: bool(1), float64(4)
memory usage: 7.9 KB
None
         Feature1     Feature3    Feature4        Label
count  231.000000   240.000000  227.000000   240.000000
mean    30.277218    69.706130   59.584543   308.476235
std     11.176810   187.923364   45.546105   464.394364
min     10.432014 -1129.060001    0.000000   -32.466802
25%     20.973077    23.892696   15.156651    28.887374
50%     29.870713    54.468361   53.978447   140.441039
75%     39.987688    91.713817  105.722662   373.604837
max     49.487028  1260.447274  127.999842  3253.238626


Test Data Overview:
<class 'pandas.core.fr

In [5]:
f1=train_data["Feature1"]
f2=train_data["Feature2"]
f3=train_data["Feature3"]
f4=train_data["Feature4"]
f1

0      26.303954
1      19.646076
2      36.983463
3      36.516512
4      28.734387
         ...    
235    31.769066
236    28.015702
237    31.477720
238    27.198042
239    25.648317
Name: Feature1, Length: 240, dtype: float64

In [9]:
train_data_x = train_data.drop("Label",axis=1)
train_data_x

Unnamed: 0,Feature1,Feature2,Feature3,Feature4
0,26.303954,True,59.919036,126.853479
1,19.646076,False,34.504636,108.363933
2,36.983463,False,98.503396,5.085491
3,36.516512,False,38.418101,1.173449
4,28.734387,False,99.286184,29.123061
...,...,...,...,...
235,31.769066,False,189.379552,0.002523
236,28.015702,True,85.468516,61.646937
237,31.477720,False,0.360471,19.738584
238,27.198042,False,48.290593,18.924597


In [13]:
train_data_label=train_data["Label"]
train_data_label

0       170.361411
1        29.897337
2       373.460027
3        26.696336
4       332.682539
          ...     
235    1277.121058
236     304.899057
237      26.351808
238     102.047530
239      24.894291
Name: Label, Length: 240, dtype: float64

# Data Preprocessing

1. **Removing Null values**: 
   - All rows with missing values were dropped. 

2. **Converting Bool to 0 or 1**:
   - `Feature2` was converted to numeric (0 or 1) for model compatibility.

3. **Removing Outlier**: 
   - Any entry with a Z score more than a certain threshold ( = 2 ) was conisdered an outlier and was dropped.

4. **Feature Scaling**:
   - Features were scaled using `StandardScaler` to ensure proper model performance.

5. **Splitting train data into train and validation partition**:
   - `train_data_x`   split into `train_data_x` and `validation_data_x` respectively

## 1. Removing Null values

In [15]:
train_data_x_no_null = train_data_x.dropna()
train_data_label_no_nulll = train_data_label.loc[train_data_x_no_null.index]
train_data_x_no_null

Unnamed: 0,Feature1,Feature2,Feature3,Feature4
0,26.303954,True,59.919036,126.853479
1,19.646076,False,34.504636,108.363933
2,36.983463,False,98.503396,5.085491
3,36.516512,False,38.418101,1.173449
4,28.734387,False,99.286184,29.123061
...,...,...,...,...
235,31.769066,False,189.379552,0.002523
236,28.015702,True,85.468516,61.646937
237,31.477720,False,0.360471,19.738584
238,27.198042,False,48.290593,18.924597


## 2. Removing Outliers

In [16]:
remove_outliers = lambda df, z_thresh=2: df[(np.abs((df - df.mean()) / df.std()) < z_thresh).all(axis=1)]
train_data_x_clean = remove_outliers(train_data_x_no_null)
train_data_label_clean = train_data_label_no_nulll.loc[train_data_x_clean.index]
train_data_label_clean

0       170.361411
1        29.897337
2       373.460027
3        26.696336
4       332.682539
          ...     
235    1277.121058
236     304.899057
237      26.351808
238     102.047530
239      24.894291
Name: Label, Length: 212, dtype: float64

## 3. Type Casting Feature2 to int

In [17]:
train_data_x_clean['Feature2'] = train_data_x_clean['Feature2'].astype(int)
test_data['Feature2']          = test_data['Feature2'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_x_clean['Feature2'] = train_data_x_clean['Feature2'].astype(int)


In [20]:
train_data_x_clean

Unnamed: 0,Feature1,Feature2,Feature3,Feature4
0,26.303954,1,59.919036,126.853479
1,19.646076,0,34.504636,108.363933
2,36.983463,0,98.503396,5.085491
3,36.516512,0,38.418101,1.173449
4,28.734387,0,99.286184,29.123061
...,...,...,...,...
235,31.769066,0,189.379552,0.002523
236,28.015702,1,85.468516,61.646937
237,31.477720,0,0.360471,19.738584
238,27.198042,0,48.290593,18.924597


In [26]:
X = train_data_x_clean
y = train_data_label_clean
X_test = test_data[['Feature1', 'Feature2', 'Feature3', 'Feature4']]

## 5. Splitting train data into train and validation partition

In [38]:
X,validation_X, y,validation_y = train_test_split(X,y, test_size=0.2, random_state=543)
validation_X

Unnamed: 0,Feature1,Feature2,Feature3,Feature4
126,16.589071,0,47.634097,78.085552
72,15.677264,0,60.430686,48.669945
4,28.734387,0,99.286184,29.123061
173,10.804856,0,14.13166,65.358907
61,28.596068,1,43.866911,32.787804
76,31.098075,0,45.899261,101.585255
205,29.870713,1,12.049476,57.094771
32,34.76525,1,36.943107,74.44232
57,18.318807,0,35.7337,14.324672
2,36.983463,0,98.503396,5.085491


# Model Training

## 4. Feature Scaling

In [None]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import mean_squared_error

In [None]:
linearReg=LinearRegression()
lassoReg=Lasso(alpha=0.01)
ridgeReg=Ridge(alpha=0.01)

In [41]:
scaler = StandardScaler()
X_poly_scaled      = scaler.fit_transform(X)
X_test_poly_scaled = scaler.fit_transform(validation_X)

In [42]:
def evaluate_models(degree):
   poly = PolynomialFeatures(degree=2, include_bias=False)
   X_poly = poly.fit_transform(X)
   validation_X_poly = poly.transform(validation_X)
    
    #for each degree 3 cases -> normal, ridge and lasso
   models = {
        'Polynomial Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0),
        'Lasso Regression': Lasso(alpha=0.1)
    }
    
   best_model = None
   best_mse = float('inf')
   best_name = ''
    
    #for all 3 cases, we calculate mse whichever is the lowest MSE, we return the best model
   for name, model in models.items():
       model.fit(X_poly, y)
       y_val_pred = model.predict(validation_X_poly)
       val_mse = mean_squared_error(validation_y, y_val_pred)
       print(f'{name} (degree {degree}) Validation MSE: {val_mse}')
        
       if val_mse < best_mse:
            best_model = model
            best_mse = val_mse
            best_name = name
    
   return best_model, poly, best_mse, best_name

In [45]:
degrees = [1,2, 3, 4, 5,6,7,8,9,10]
best_overall_model = None
best_overall_poly = None
best_overall_mse = float('inf')
best_overall_name = ''
best_degree = 0

#iterate over all degrees from 1 to 5 and then among the best models for that degree, we find the best model (again with overall lowest mse)
for degree in degrees:
    best_model, best_poly, best_mse, best_name = evaluate_models(degree)
    if best_mse < best_overall_mse:
        best_overall_model = best_model
        best_overall_poly = best_poly
        best_overall_mse = best_mse
        best_overall_name = best_name
        best_degree = degree

Polynomial Regression (degree 1) Validation MSE: 1604.3146770072665
Ridge Regression (degree 1) Validation MSE: 1599.6487921338046
Lasso Regression (degree 1) Validation MSE: 1600.9299529995935
Polynomial Regression (degree 2) Validation MSE: 1604.3146770072665
Ridge Regression (degree 2) Validation MSE: 1599.6487921338046
Lasso Regression (degree 2) Validation MSE: 1600.9299529995935
Polynomial Regression (degree 3) Validation MSE: 1604.3146770072665
Ridge Regression (degree 3) Validation MSE: 1599.6487921338046
Lasso Regression (degree 3) Validation MSE: 1600.9299529995935
Polynomial Regression (degree 4) Validation MSE: 1604.3146770072665
Ridge Regression (degree 4) Validation MSE: 1599.6487921338046
Lasso Regression (degree 4) Validation MSE: 1600.9299529995935
Polynomial Regression (degree 5) Validation MSE: 1604.3146770072665
Ridge Regression (degree 5) Validation MSE: 1599.6487921338046
Lasso Regression (degree 5) Validation MSE: 1600.9299529995935
Polynomial Regression (degree 

In [46]:
print(f'Best Model: {best_overall_name} (degree {best_degree})')
print(f'Best Model Validation MSE: {best_overall_mse}')

test_dataset = pd.read_csv("test.csv")
test_dataset_x = test_dataset.drop("id",axis=1)
test_X = test_dataset_x.dropna()
X_test_standardized = scaler.fit_transform(X=test_X)

# Transform the test data using the best polynomial model
test_poly = best_overall_poly.transform(X_test_standardized)

# Predict on the test set
test_predictions = best_overall_model.predict(test_poly)

Best Model: Ridge Regression (degree 1)
Best Model Validation MSE: 1599.6487921338046




In [48]:
#putting predictions in file as per expected format 
ids = [i for  i in range(len(test_dataset)) ]
submission = pd.DataFrame({
    'id': ids,
    'Label': test_predictions
})
submission.to_csv('IMT2022543_submission9.csv',index=False) 