<a href="https://colab.research.google.com/github/Sree-KM2001/CAR-PURCHASE/blob/main/ML_Assignment_ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the California housing data. With as_frame=True the returned bunch
# exposes `.frame`, a single DataFrame holding the feature columns together
# with the "MedHouseVal" target column.
data = fetch_california_housing(as_frame=True)
df = data.frame

# Inspect the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
df.info()
print(df.describe())

# Check for missing values (number of nulls in each column)
print(df.isnull().sum())

# Separate the predictors from the regression target (median house value).
X = df.drop(columns=["MedHouseVal"])
y = df["MedHouseVal"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler learns its statistics from the
# training split only and then applies that same transform to both splits,
# so nothing from the test set leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB
None
             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Candidate regressors keyed by the display name used in the results table.
# Insertion order is kept because it determines the row order of that table.
# Estimators that accept a seed are pinned to 42 for reproducibility.
models = dict(
    [
        ("Linear Regression", LinearRegression()),
        ("Decision Tree Regressor", DecisionTreeRegressor(random_state=42)),
        ("Random Forest Regressor", RandomForestRegressor(random_state=42)),
        ("Gradient Boosting Regressor", GradientBoostingRegressor(random_state=42)),
        ("Support Vector Regressor (SVR)", SVR()),
    ]
)

# Fit and evaluate models
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Metric label -> scoring function; every function is called as
# fn(y_true, y_pred). The labels become the columns of the results table.
metric_fns = {
    "MSE": mean_squared_error,
    "MAE": mean_absolute_error,
    "R²": r2_score,
}

results = {}

for name, model in models.items():
    # Train on the scaled training split, then predict the held-out split.
    # fit() returns the estimator itself, so the two calls can be chained.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Score the predictions against the true test targets.
    results[name] = {
        label: metric_fn(y_test, y_pred) for label, metric_fn in metric_fns.items()
    }

# One row per model, one column per metric, for side-by-side comparison.
results_df = pd.DataFrame(results).transpose()
print(results_df)


                                     MSE       MAE        R²
Linear Regression               0.555892  0.533200  0.575788
Decision Tree Regressor         0.493969  0.453904  0.623042
Random Forest Regressor         0.255341  0.327483  0.805144
Gradient Boosting Regressor     0.293999  0.371650  0.775643
Support Vector Regressor (SVR)  0.357004  0.398599  0.727563


In [3]:
# Rank the models by R² (highest first) once and read both ends of the
# ranking: the first row is the best model, the last row is the worst.
ranked = results_df.sort_values(by="R²", ascending=False)
best_model, worst_model = ranked.iloc[0], ranked.iloc[-1]

# Each selected row is a Series whose .name is the model's display name.
print(f"Best Model: {best_model.name} with R² = {best_model['R²']}")
print(f"Worst Model: {worst_model.name} with R² = {worst_model['R²']}")


Best Model: Random Forest Regressor with R² = 0.8051444145919077
Worst Model: Linear Regression with R² = 0.575787706032451
