Vaishnavi Udanshiv R00224406 AML Project_2

Loading Libraries

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV

In [28]:
laptop_df = pd.read_csv('D://MachineLearning//laptop_data.csv')   #loading dataset

In [29]:
laptop_df.dropna(inplace=True)    #dropping missing rows

Feature Engineering

In [30]:
cat_cols = ['Company', 'TypeName', 'ScreenResolution', 'Cpu', 'Gpu', 'OpSys']    # Encoding categorical variables
le = LabelEncoder()
for col in cat_cols:
    laptop_df[col] = le.fit_transform(laptop_df[col])

In [31]:
laptop_df['Ram'] = laptop_df['Ram'].str.replace('GB', '').astype(int)              
laptop_df['Memory'] = laptop_df['Memory'].str.extract('(\d+)').astype(int)
laptop_df['Weight'] = laptop_df['Weight'].str.replace('kg', '').astype(float)

In [32]:
num_cols = ['Inches', 'Weight', 'Price']    # Standarding numerical variables
laptop_df[num_cols] = (laptop_df[num_cols] - laptop_df[num_cols].mean()) / laptop_df[num_cols].std()

In [33]:
X = laptop_df.drop('Price', axis=1)   # Splitting the data into features and target variable
y = laptop_df['Price']


In [34]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Building three different models

In [35]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor()
}

for model_name, model in models.items():
    print("Model:", model_name)
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print("Mean Absolute Error:", mae)
    print("Mean Squared Error:", mse)
    print("R-squared Score:", r2)
    print()

Model: Linear Regression
Mean Absolute Error: 0.39469546328725397
Mean Squared Error: 0.37057979245340766
R-squared Score: 0.6435107578675339

Model: Decision Tree
Mean Absolute Error: 0.34753906660634426
Mean Squared Error: 0.3066751940885274
R-squared Score: 0.704985512573

Model: Random Forest
Mean Absolute Error: 0.2584672185836348
Mean Squared Error: 0.19074648748416181
R-squared Score: 0.8165062635701913



Hyperparameter Optimization

In [36]:
dt_param_grid = {
    'max_depth': [None, 5, 10],      # parameters grid for grid search
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [37]:
dt = DecisionTreeRegressor(random_state=42)   # Creating a Decision Tree regressor object

dt_grid_search = GridSearchCV(estimator=dt, param_grid=dt_param_grid, cv=5, scoring='neg_mean_squared_error')   # Performing grid search for Decision Tree
dt_grid_search.fit(X_train, y_train)

dt_best_params = dt_grid_search.best_params_     # getting best hyperparameters and model for Decision Tree
dt_best_model = dt_grid_search.best_estimator_

In [38]:
rf = RandomForestRegressor(random_state=42)     # Creating a Random Forest regressor object

rf_grid_search = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=5, scoring='neg_mean_squared_error')   # Performing grid search for Random Forest
rf_grid_search.fit(X_train, y_train)

rf_best_params = rf_grid_search.best_params_      # Getting the best hyperparameters and model for Random Forest
rf_best_model = rf_grid_search.best_estimator_

In [None]:
dt_best_model.fit(X_train, y_train)     # Training the best models with the best hyperparameters
rf_best_model.fit(X_train, y_train)

In [None]:
y_pred_dt = dt_best_model.predict(X_test)    # Predicting on the test set for decision tree
y_pred_rf = rf_best_model.predict(X_test)     # Predicting on the test set for random forest

Evaluation after hyperparameter optimization

In [78]:
mae_dt = mean_absolute_error(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Best Hyperparameters for Decision Tree:", dt_best_params)
print("Mean Absolute Error for Decision Tree:", mae_dt)
print("Mean Squared Error for Decision Tree:", mse_dt)
print("R-squared Score for Decision Tree:", r2_dt)
print()

print("Best Hyperparameters for Random Forest:", rf_best_params)
print("Mean Absolute Error for Random Forest:", mae_rf)
print("Mean Squared Error for Random Forest:", mse_rf)
print("R-squared Score for Random Forest:", r2_rf)
print()


Best Hyperparameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Mean Absolute Error for Decision Tree: 12684.663570654495
Mean Squared Error for Decision Tree: 466979740.1353102
R-squared Score for Decision Tree: 0.6761314970276693

Best Hyperparameters for Random Forest: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Mean Absolute Error for Random Forest: 8843.667684137934
Mean Squared Error for Random Forest: 251733212.16983593
R-squared Score for Random Forest: 0.8254132854880653

