# Setting Up the Environment

In [214]:
import numpy as np
import pandas as pd
from   sklearn.preprocessing import StandardScaler,PolynomialFeatures 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [215]:
# Load the training data; later cells read the Feature1-Feature4 and Label columns from it.
dataset = pd.read_csv("train.csv")

In [None]:
# Show the loaded frame as this cell's output.
dataset

In [None]:
# Pull each feature column out as its own Series for ad-hoc inspection,
# then display the first one.
f1, f2, f3, f4 = (
    dataset[column]
    for column in ("Feature1", "Feature2", "Feature3", "Feature4")
)
f1

In [None]:
# Feature matrix: every column of the training frame except the target.
dataset_x = dataset.drop(columns=["Label"])
dataset_x

In [None]:
# Target vector: the "Label" column of the training frame.
Y = dataset.loc[:, "Label"]
Y

## Exploratory Data Analysis

(Try to show that none of the features can be convincingly modeled with linear regression.)

In [None]:
# Summary statistics of the training frame.
dataset.describe()

In [None]:
# Shape of the Feature1 column extracted above.
f1.shape

# Data Preprocessing

1. **Removing Null values**: 
   - All rows with missing values were dropped. 

2. **Converting Bool to 0 or 1**:
   - `Feature2` was converted to numeric (0 or 1) for model compatibility.

3. **Removing Outliers**: 
   - Any row in which some feature has an absolute Z-score at or above the threshold (= 2) was considered an outlier and was dropped.

4. **Feature Scaling**:
   - Features were scaled using `StandardScaler` to ensure proper model performance.

5. **Splitting train data into train and validation partitions**:
   - The cleaned training data was split 80/20 into a training partition (`X_train`, `Y_train`) and a validation partition (`X_test`, `Y_test`).

# Pre-processing

In [222]:
# Scaler instance that a later cell fits on the training split of the features.
scaler=StandardScaler()

In [None]:
# Discard every feature row that contains at least one missing value,
# then display what is left.
X = dataset_x.dropna(axis=0, how="any")
X

In [224]:
# Keep only the labels whose feature rows survived the dropna above.
Y = Y.loc[X.index]

In [225]:
def remove_outliers(df, z_thresh=2):
    """Drop rows that contain an outlier in any column, judged by z-score.

    A value is an outlier when its absolute z-score (distance from the column
    mean, in units of the column's sample standard deviation) is at or above
    ``z_thresh``. A row is kept only if none of its tested columns holds an
    outlier.

    Columns whose standard deviation is zero or undefined (a constant column,
    or fewer than two observations) cannot produce a meaningful z-score, so
    they are left out of the test. Without this, the 0/0 division yields NaN
    for every row and the whole frame is silently discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame.
    z_thresh : float, default 2
        Absolute z-score at or above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass the test, with their original index
        labels and all original columns.
    """
    center = df.mean()
    spread = df.std()

    # Only columns with a usable, non-zero spread take part in the test.
    usable = spread.index[spread.notna() & spread.ne(0)]
    z_scores = ((df[usable] - center[usable]) / spread[usable]).abs()

    # With no usable columns, .all(axis=1) is True for every row, so all rows are kept.
    return df[(z_scores < z_thresh).all(axis=1)]

# Filter the feature rows with the default threshold (z_thresh=2).
# NOTE: X is rebound to the filtered frame, so re-running this cell recomputes the
# statistics on the already-filtered rows and may drop more of them.
X = remove_outliers(X)

In [None]:
# Re-align the labels with the rows kept by the outlier filter, then display the filtered features.
Y = Y.loc[X.index]
X

In [227]:
# Hold out 20% of the cleaned rows for validation; the fixed seed keeps the split reproducible.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.2, random_state=543)

# Learn the scaling statistics from the training split only...
X_train_standardized = scaler.fit_transform(X=X_train)
# ...and reuse them for the validation split. Refitting here would scale the two
# splits with different means and variances and leak validation statistics into
# preprocessing.
X_test_standardized = scaler.transform(X=X_test)

# Model Training

In [228]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import mean_squared_error

In [229]:
# Degree-2 polynomial expansion of the raw (unscaled) split; the expansion is
# learned on the training rows and then applied to both partitions.
poly = PolynomialFeatures(degree=2)
poly.fit(X_train)
X_train_polynomial = poly.transform(X_train)
X_test_polynomial = poly.transform(X_test)

# NOTE: `scaler` is rebound here to a fresh scaler fitted on the expanded
# training features, replacing the one fitted on the plain features.
scaler = StandardScaler()
scaler.fit(X_train_polynomial)
X_train_polynomial_transformed = scaler.transform(X_train_polynomial)
X_test_polynomial_transformed = scaler.transform(X_test_polynomial)

In [230]:
def evaluate_models(degree):
    """Compare plain, ridge and lasso regression on polynomial features of one degree.

    Reads the notebook-level ``X_train_standardized``, ``X_test_standardized``,
    ``Y_train`` and ``Y_test``. The polynomial expansion is fitted on the
    training matrix and applied to the validation matrix. Each candidate is
    then fitted on the training data, scored by mean squared error on the
    validation data, and its score printed.

    Parameters
    ----------
    degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    tuple
        ``(best_model, poly, best_mse, best_name)``: the fitted estimator with
        the lowest validation MSE (the first one wins a tie), the fitted
        ``PolynomialFeatures`` transformer, that MSE, and the estimator's label.
    """
    poly = PolynomialFeatures(degree=degree)
    train_features = poly.fit_transform(X_train_standardized)
    val_features = poly.transform(X_test_standardized)

    # Three candidates per degree: unregularised, L2-penalised and L1-penalised.
    candidates = [
        ('Polynomial Regression', LinearRegression()),
        ('Ridge Regression', Ridge(alpha=0.001)),
        ('Lasso Regression', Lasso(alpha=0.01)),
    ]

    # (estimator, mse, label) of the best candidate seen so far.
    winner = (None, float('inf'), '')
    for label, estimator in candidates:
        estimator.fit(train_features, Y_train)
        score = mean_squared_error(Y_test, estimator.predict(val_features))
        print(f'{label} (degree {degree}) Validation MSE: {score}')

        # Strict comparison, so an equal score does not displace an earlier candidate.
        if score < winner[1]:
            winner = (estimator, score, label)

    best_model, best_mse, best_name = winner
    return best_model, poly, best_mse, best_name

In [None]:
# Polynomial degrees to search: 1 through 8.
degrees = list(range(1, 9))

# Running record of the best configuration found so far.
best_overall_model, best_overall_poly = None, None
best_overall_mse, best_overall_name, best_degree = float('inf'), '', 0

# For every degree, take the best of its three models and keep whichever
# configuration has the lowest validation MSE overall.
for degree in degrees:
    best_model, best_poly, best_mse, best_name = evaluate_models(degree)
    # Strict comparison: on a tie the earlier (lower) degree is kept.
    if not best_mse < best_overall_mse:
        continue
    best_overall_model, best_overall_poly = best_model, best_poly
    best_overall_mse, best_overall_name = best_mse, best_name
    best_degree = degree

## Model Evaluation

In [None]:
# Report the winner of the degree / regulariser search.
print(f'Best Model: {best_overall_name} (degree {best_degree})')
print(f'Best Model Validation MSE: {best_overall_mse}')

# Load the test data and drop the id column and any incomplete rows, leaving the feature columns.
test_dataset = pd.read_csv("test.csv")
test_dataset_x = test_dataset.drop("id",axis=1)
test_X = test_dataset_x.dropna()

# The selected model was trained on features standardised with the training
# split's statistics, so the test features must be scaled with those same
# statistics. A dedicated scaler is fitted on X_train because `scaler` may have
# been rebound to one fitted on polynomial features. Refitting on the test rows
# would feed the model inputs on a different scale than it was trained on.
feature_scaler = StandardScaler().fit(X_train)
# Stored under its own name so the validation matrix X_test_standardized is not overwritten.
test_X_standardized = feature_scaler.transform(test_X)

# Transform the test data using the best polynomial model
test_poly = best_overall_poly.transform(test_X_standardized)

# Predict on the test set
test_predictions = best_overall_model.predict(test_poly)

In [None]:
# Number of rows read from test.csv.
len(test_dataset)

In [None]:
# Row count of the test frame after the id column was removed.
len(test_dataset_x)

In [235]:
#putting predictions in file as per expected format 
ids = [i for  i in range(len(test_dataset)) ]
submission = pd.DataFrame({
    'id': ids,
    'Label': test_predictions
})
submission.to_csv('IMT2022543_submission_11.csv',index=False) 