<a href="https://colab.research.google.com/github/Ovizero01/Machine-Leaning/blob/main/026_Applied%20Machine%20Learning%20Foundations%2C%20Dataset%2C%20Eda%20%26%20First%20Model/026_Laptop%20Price%20Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the laptop listings CSV (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv("laptop_price_prediction.csv")
df.head(n=5)

# YData Profiling


In [None]:
!pip install ydata-profiling

In [None]:
# Imported here rather than at the top because the package is only
# installed by the preceding cell.
from ydata_profiling import ProfileReport

# Build an exploratory profile of the raw frame and write it out as HTML.
profile = ProfileReport(
    df,
    explorative=True,
    title="Laptop Price Prediction",
)
profile.to_file("ydata.html")

# Drop the First Two Columns

In [None]:
# Remove the two leading columns by position.
# NOTE: this rebinds `df`, so re-running the cell strips two more columns
# each time; reload the CSV before running it again.
df = df.drop(columns=df.columns[:2])

In [None]:
# Preview the frame after the two leading columns were removed.
df.head()

# Simplifying Columns

In [None]:
# Positional rename to short, uniform column labels.
# Assumes the frame holds exactly these 16 columns in this order at this
# point — TODO confirm against the CSV header (pandas raises on a length
# mismatch, but cannot detect a wrong order).
df.columns = [
    'brand',
    'name',
    'price',
    'spec_rating',
    'processor',
    'CPU',
    'Ram',
    'Ram_type',
    'ROM',
    'ROM_type',
    'GPU',
    'display_size',
    'resolution_width',
    'resolution_height',
    'OS',
    'warranty',
]

# Correlation of Numerical Features with Price

In [None]:
# Pairwise correlation of every numeric column with the target, strongest
# positive relationship first ('price' itself appears in the result).
corr_target = (
    df.select_dtypes(include=np.number)
    .corr()
    .loc[:, 'price']
    .sort_values(ascending=False)
)
print(corr_target)

# Separate X and y

In [None]:
# Target is the 'price' column; every remaining column is a feature.
y = df['price']
X = df.drop(columns='price')

# Numerical and Categorical Columns

In [None]:
# Partition the feature columns into numeric vs. everything else.
# Selecting by exact dtype ('int64'/'float64'/'object') leaves any other dtype
# (e.g. int32, bool, category, a dedicated string dtype) in neither list, and
# the ColumnTransformer would then silently drop those columns because its
# default is remainder='drop'. Using the same selector for include and
# exclude guarantees every column lands in exactly one group.
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

In [None]:
# Display the column labels that will go through the numeric transformer.
numerical_features

In [None]:
# Display the column labels that will go through the categorical transformer.
categorical_features

# Pipeline

In [None]:
# Numeric branch: a single standardisation step, wrapped in a Pipeline so
# further steps can be added later without touching the ColumnTransformer.
num_transformer = Pipeline([('scaler', StandardScaler())])

In [None]:
# Categorical branch: one-hot encode; handle_unknown='ignore' means categories
# not seen during fit do not raise at predict time.
one_hot = OneHotEncoder(handle_unknown='ignore')
cat_transformer = Pipeline([('encoder', one_hot)])

In [None]:
# Route each column group through its own transformer branch and
# concatenate the results into one feature matrix.
preprocessor = ColumnTransformer([
    ('num', num_transformer, numerical_features),
    ('cat', cat_transformer, categorical_features),
])

# Train-Test Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Base learners: one linear baseline plus three tree ensembles that share
# the same size (100 estimators) and seed (42).
reg_lr = LinearRegression()
reg_rf = RandomForestRegressor(random_state=42, n_estimators=100)
reg_gb = GradientBoostingRegressor(random_state=42, n_estimators=100)
reg_xgb = XGBRegressor(random_state=42, n_estimators=100)

In [None]:
# Voting ensemble over all four base learners.
voting_members = [('lr', reg_lr), ('rf', reg_rf), ('gb', reg_gb), ('xgb', reg_xgb)]
voting_reg = VotingRegressor(estimators=voting_members)

In [None]:
# Stacking ensemble: random forest and gradient boosting feed a Ridge
# meta-learner.
stacking_reg = StackingRegressor(
    final_estimator=Ridge(),
    estimators=[('rf', reg_rf), ('gb', reg_gb)],
)

# Model Training

In [None]:
# Registry of every candidate, keyed by the display name used in the results
# table. Insertion order is the training order.
model_to_train = {
    "Linear Regression": reg_lr,
    "Random Forest": reg_rf,
    "Gradient Boosting": reg_gb,
    "XGBRegressor": reg_xgb,
    "Voting Ensemble": voting_reg,
    "Stacking Ensemble": stacking_reg,
}

In [None]:
# Fit each candidate behind the shared preprocessor and score it on the
# held-out split; one record per model is collected for the summary table.

result = []

for name, model in model_to_train.items():
  # Preprocessing is fitted together with the estimator, on training rows only.
  pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

  # Train, then predict on the test split.
  y_pred = pipe.fit(X_train, y_train).predict(X_test)

  # Hold-out metrics.
  mae = mean_absolute_error(y_test, y_pred)
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  r2 = r2_score(y_test, y_pred)

  result.append({"Model": name, "R2 Score": r2, "RMSE": rmse, "MAE": mae})

# Rank the models, best R2 first.
results_df = pd.DataFrame(result).sort_values(by="R2 Score", ascending=False)
print(results_df)

In [None]:
# Pick the top-ranked model; results_df is sorted by R2 score, best first.
best_model_name = results_df.iloc[0]['Model']
best_model_obj = model_to_train[best_model_name]

# Refit the winner in its own pipeline so the plotted predictions are
# guaranteed to belong to that model.
final_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_model_obj)
])

final_pipe.fit(X_train, y_train)
y_final_pred = final_pipe.predict(X_test)

# Bounds of the y = x reference line. They must come from the predictions
# that are actually plotted: the loop variable `y_pred` holds the output of
# whichever model was evaluated last, which is not necessarily the best one.
min_val = min(y_test.min(), y_final_pred.min())
max_val = max(y_test.max(), y_final_pred.max())

# Actual vs. predicted scatter with the ideal-fit diagonal.
fig, ax = plt.subplots(figsize=(8, 6))

sns.scatterplot(x=y_test, y=y_final_pred, alpha=0.6, color='teal', ax=ax)
ax.plot([min_val, max_val], [min_val, max_val], color="red", linestyle='--')

ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title(f"Actual vs Predicted Price ({best_model_name})")

ax.grid(True)
plt.show()