# DATA CLEANING

In [92]:


import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [93]:
df = pd.read_csv('used_cars_data.csv')

In [97]:
df['Owner_Type'].unique().tolist()

['First', 'Second', 'Fourth & Above', 'Third']

In [98]:
for i, val in enumerate(df['Owner_Type']):
    if val == 'First':
        df.loc[i, 'Owner_Type'] = 1
    elif val == 'Second':
        df.loc[i, 'Owner_Type'] = 2
    elif val == 'Third':
        df.loc[i, 'Owner_Type'] = 3
    elif val == 'Fourth & Above':
        df.loc[i, 'Owner_Type'] = 4


In [99]:
main_df = df.drop(['Name','Location', 'Fuel_Type', 'Transmission', 'New_Price' ], axis = 1)

In [101]:
main_df['Mileage'] = main_df['Mileage'].str.extract('(\d+\.?\d*)').astype(float)
main_df['Engine'] = main_df['Engine'].str.extract('(\d+\.?\d*)').astype(float)
main_df['Power'] = main_df['Power'].str.extract('(\d+\.?\d*)').astype(float)

In [104]:
df_cleaned = main_df.dropna()

In [105]:
X = df_cleaned[['Year', 'Kilometers_Driven', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats']]
y = df_cleaned['Price']

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LINEAR REGRESSION

In [107]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 1. Create the model
model = LinearRegression()

# 2. Train the model
model.fit(X_train, y_train)

# 3. Make predictions
y_pred = model.predict(X_test)

# 4. Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R² Score:", r2)


Mean Squared Error: 35.81756023180754
R² Score: 0.6585909401140368


## Average Prediciton Model ( DUMMY )

In [110]:
from sklearn.dummy import DummyRegressor
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y_train)
y_dummy = dummy.predict(X_test)

baseline_mse = mean_squared_error(y_test, y_dummy)
baseline_R2 = r2_score(y_test, y_dummy)
print("Baseline MSE:", baseline_mse)
print("Baseline R2:", baseline_R2)


Baseline MSE: 105.13375843208532
Baseline R2: -0.0021234667095249016


# LASSO REGRESSION

In [111]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error


In [112]:
lasso = Lasso(alpha=1.0)  # Start with default alpha
lasso.fit(X_train, y_train)


In [113]:
y_pred = lasso.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'R² Score: {r2}')
print(f'Mean Squared Error: {mse}')


R² Score: 0.6595569901871917
Mean Squared Error: 35.716210968569705


### Alpha Selection

In [114]:
params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(Lasso(), params, scoring='r2', cv=5)
grid.fit(X_train, y_train)

print(f"Best alpha: {grid.best_params_['alpha']}")
print(f"Best R² on train: {grid.best_score_}")

# Evaluate on test set
y_pred = grid.predict(X_test)
print(f"Test R²: {r2_score(y_test, y_pred)}")


Best alpha: 0.001
Best R² on train: 0.6650802296335133
Test R²: 0.6586088291701111


In [115]:
lasso_best = grid.best_estimator_
selected_features = X.columns[lasso_best.coef_ != 0]

print("Selected Features by Lasso:")
print(selected_features.tolist())

Selected Features by Lasso:
['Year', 'Kilometers_Driven', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats']


# Random Forrest

In [116]:
from sklearn.ensemble import RandomForestRegressor

In [120]:
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

In [121]:
y_pred = rf.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error
print("R²:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))


R²: 0.8611574903968064
MSE: 14.566104227312472


In [122]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

grid = GridSearchCV(RandomForestRegressor(random_state=42), params, cv=3, scoring='r2')
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)

y_pred = grid.predict(X_test)
print("Test R²:", r2_score(y_test, y_pred))


Best Params: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Test R²: 0.8618631974710415


# XGBoost

In [122]:
!pip install xgboost

In [123]:
import xgboost as xgb
from xgboost import XGBRegressor

xg = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xg.fit(X_train, y_train)


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Owner_Type: object

In [None]:
y_pred = xg.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error
print("R²:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))


In [None]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}

grid = GridSearchCV(XGBRegressor(random_state=42), params, cv=3, scoring='r2')
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)

y_pred = grid.predict(X_test)
print("Tuned R²:", r2_score(y_test, y_pred))
