<a href="https://colab.research.google.com/github/Sahil01S/Car_price_prediction/blob/main/Car_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


# Seed NumPy's global generator so NumPy-driven randomness is repeatable.
np.random.seed(seed=42)

# Read the car listings from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='cars.csv')

# Preview the leading rows (five is also the default for head()).
print(df.head(n=5))


   car_ID  symboling                   CarName fueltype aspiration doornumber  \
0       1          3        alfa-romero giulia      gas        std        two   
1       2          3       alfa-romero stelvio      gas        std        two   
2       3          1  alfa-romero Quadrifoglio      gas        std        two   
3       4          2               audi 100 ls      gas        std       four   
4       5          2                audi 100ls      gas        std       four   

       carbody drivewheel enginelocation  wheelbase  ...  enginesize  \
0  convertible        rwd          front       88.6  ...         130   
1  convertible        rwd          front       88.6  ...         130   
2    hatchback        rwd          front       94.5  ...         152   
3        sedan        fwd          front       99.8  ...         109   
4        sedan        4wd          front       99.4  ...         136   

   fuelsystem  boreratio  stroke compressionratio horsepower  peakrpm citympg  \

In [None]:
# Count the missing values in every column (isna is the same check as isnull).
print(df.isna().sum())

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64


In [None]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [None]:
# Columns treated as categorical and expanded into 0/1 indicator columns.
categorical_columns = [
    'symboling',
    'fueltype',
    'aspiration',
    'doornumber',
    'carbody',
    'drivewheel',
    'enginelocation',
    'enginetype',
    'cylindernumber',
    'fuelsystem',
]

# drop_first=True omits one level per column so the indicators are not
# perfectly collinear with the intercept added later.
# NOTE: this rebinds `df`, so the cell cannot be re-run without reloading the data.
df = pd.get_dummies(
    df,
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)

In [None]:
# Preview the first rows of the frame after the dummy-variable encoding step.
print(df.head())

   car_ID                   CarName  wheelbase  carlength  carwidth  \
0       1        alfa-romero giulia       88.6      168.8      64.1   
1       2       alfa-romero stelvio       88.6      168.8      64.1   
2       3  alfa-romero Quadrifoglio       94.5      171.2      65.5   
3       4               audi 100 ls       99.8      176.6      66.2   
4       5                audi 100ls       99.4      176.6      66.4   

   carheight  curbweight  enginesize  boreratio  stroke  ...  \
0       48.8        2548         130       3.47    2.68  ...   
1       48.8        2548         130       3.47    2.68  ...   
2       52.4        2823         152       2.68    3.47  ...   
3       54.3        2337         109       3.19    3.40  ...   
4       54.3        2824         136       3.19    3.40  ...   

   cylindernumber_three  cylindernumber_twelve  cylindernumber_two  \
0                     0                      0                   0   
1                     0                      0  

In [None]:
# Target: the listed price.
y = df['price']
# Predictors: everything except the row id, the free-text car name and the target.
X = df.drop(columns=['car_ID', 'CarName', 'price'])

In [None]:
# statsmodels' OLS does not add an intercept by itself, so give the design
# matrix an explicit constant column before splitting.
X_const = sm.add_constant(X)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_const,
    y,
    test_size=0.2,
    random_state=42,
)





In [None]:
# Ordinary least squares of price on the full training design
# (the constant column is already part of X_train).
model = sm.OLS(endog=y_train, exog=X_train)
result = model.fit()

# Show the complete regression table.
print(result.summary())



                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.953
Model:                            OLS   Adj. R-squared:                  0.936
Method:                 Least Squares   F-statistic:                     55.11
Date:                Wed, 07 Aug 2024   Prob (F-statistic):           3.23e-61
Time:                        16:31:15   Log-Likelihood:                -1449.7
No. Observations:                 164   AIC:                             2989.
Df Residuals:                     119   BIC:                             3129.
Df Model:                          44                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -2.496e+

In [None]:
# Predict prices for the held-out rows with the fitted OLS model.
y_pred = result.predict(X_test)

# Score the predictions against the actual test prices.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# `result.params` is a label-indexed Series; `params[0]` relies on the
# deprecated integer-key positional fallback and emits a FutureWarning.
# `.iloc[0]` is the explicit positional form and returns the same value.
intercept = result.params.iloc[0]

print(f"Intercept: {intercept}")
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
print(f"Predictions on test set: {y_pred.values[:5]}")
print(f"Actual values on test set: {y_test.values[:5]}")

Intercept: -24960.11565478317
Mean Squared Error: 9589266.309257166
R^2 Score: 0.878530809522927
Predictions on test set: [28875.09334703 20604.38940655 10877.38841499 12914.01935502
 26967.15829428]
Actual values on test set: [30760.    17859.167  9549.    11850.    28248.   ]


  intercept = result.params[0]


The R-squared value is 0.87, which is not bad; still, we try other methods to validate accuracy on the test dataset.

In [None]:
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Define models and hyperparameters for grid search
models = {
    'Lasso': {
        'model': Lasso(),
        'params': {
            'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]
        }
    },
    'Decision Tree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': [3, 5, 7, 10],
            'min_samples_split': [2, 5, 10]
        }
    },
    'Random Forest': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7, 10],
            'min_samples_split': [2, 5, 10]
        }
    }
}



In [None]:
# Tune every candidate with 5-fold cross-validated grid search on the training
# split, then score the refitted best estimator on the held-out test split.
results = {}
for model_name, config in models.items():
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    # The scorer is the negated MSE, so flip the sign to report a plain MSE.
    best_score = -grid_search.best_score_

    # Evaluate on the test set. Loop-specific names are used on purpose:
    # assigning to y_pred / mse / r2 here would overwrite the OLS results
    # computed earlier in the notebook and leave the last model's predictions
    # behind for any later cell that reads `y_pred`.
    best_model = grid_search.best_estimator_
    test_pred = best_model.predict(X_test)
    test_mse = mean_squared_error(y_test, test_pred)
    test_r2 = r2_score(y_test, test_pred)

    results[model_name] = {
        'best_params': best_params,
        'best_score': best_score,
        'test_mse': test_mse,
        'test_r2': test_r2,
    }


In [None]:

# Report the tuned hyperparameters and the CV / test scores of each candidate.
for name in results:
    summary = results[name]
    print(f"Model: {name}")
    print(f"Best Parameters: {summary['best_params']}")
    print(f"Best Cross-Validation Score (MSE): {summary['best_score']}")
    print(f"Test Mean Squared Error: {summary['test_mse']}")
    print(f"Test R^2 Score: {summary['test_r2']}\n")

Model: Lasso
Best Parameters: {'alpha': 10.0}
Best Cross-Validation Score (MSE): 8744520.146734526
Test Mean Squared Error: 11326711.63796353
Test R^2 Score: 0.8565222354808861

Model: Decision Tree
Best Parameters: {'max_depth': 5, 'min_samples_split': 10}
Best Cross-Validation Score (MSE): 9479307.597065864
Test Mean Squared Error: 9596251.322454114
Test R^2 Score: 0.8784423289373283

Model: Random Forest
Best Parameters: {'max_depth': 7, 'min_samples_split': 2, 'n_estimators': 50}
Best Cross-Validation Score (MSE): 5884592.267845519
Test Mean Squared Error: 3438048.2499086596
Test R^2 Score: 0.9564495421996604



We can see that Random Forest gives the best prediction, judged by its R-squared value.

In [None]:

def fit_model(X_train, y_train):
    """Fit an OLS regression of ``y_train`` on ``X_train``.

    The predictors are passed through ``sm.add_constant`` before fitting.

    Args:
        X_train: Predictor columns for the training rows.
        y_train: Target values for the training rows.

    Returns:
        The fitted statsmodels OLS results object.
    """
    design = sm.add_constant(X_train)
    return sm.OLS(y_train, design).fit()


In [None]:

def calculate_adjusted_r2(X_train, y_train, X_test, y_test):
    """Fit OLS on the training data and return an adjusted R^2 for the test data.

    R^2 is measured on the test predictions, while the adjustment uses the
    number of training rows and the number of columns in ``X_train`` as passed
    in (before ``fit_model`` adds its constant).

    Args:
        X_train: Predictor columns for the training rows.
        y_train: Target values for the training rows.
        X_test: The same predictor columns for the test rows.
        y_test: Target values for the test rows.

    Returns:
        ``1 - (1 - R^2) * (n - 1) / (n - p - 1)`` as a float.
    """
    fitted = fit_model(X_train, y_train)
    test_design = sm.add_constant(X_test)
    test_r2 = r2_score(y_test, fitted.predict(test_design))

    n_obs = len(y_train)
    n_predictors = X_train.shape[1]
    return 1 - (1 - test_r2) * (n_obs - 1) / (n_obs - n_predictors - 1)



In [None]:

# Forward selection
def forward_selection(X_train, y_train, X_test, y_test):
    """Greedy forward feature selection driven by ``calculate_adjusted_r2``.

    Starting from no features, each round tries adding every remaining column
    and keeps the one with the highest score, provided it beats the best score
    seen so far. The search stops when no remaining column improves the score
    or when no columns are left.

    Note that candidates are scored with ``X_test`` / ``y_test``, so the
    returned score is computed on the same rows that guided the selection and
    is not an independent estimate.

    Args:
        X_train: Training predictors; its columns are the candidate features.
        y_train: Training target values.
        X_test: Test predictors with the same columns as ``X_train``.
        y_test: Test target values.

    Returns:
        A tuple ``(selected_features, best_adj_r2)``: the chosen column labels
        in the order they were added, and the best score reached
        (``-inf`` if nothing was selected).
    """
    remaining = X_train.columns.tolist()
    selected_features = []
    best_adj_r2 = -np.inf

    while remaining:
        best_feature = None
        for feature in remaining:
            candidate = selected_features + [feature]
            adj_r2 = calculate_adjusted_r2(
                X_train[candidate], y_train, X_test[candidate], y_test
            )

            if adj_r2 > best_adj_r2:
                best_adj_r2 = adj_r2
                best_feature = feature

        # Compare with None explicitly: a valid column label such as 0 or ''
        # is falsy and would otherwise end the search even though it improved
        # the score.
        if best_feature is None:
            break

        selected_features.append(best_feature)
        remaining.remove(best_feature)

    return selected_features, best_adj_r2



In [None]:

# Run the greedy search over every column of the training design.
selected_features, best_adj_r2 = forward_selection(X_train, y_train, X_test, y_test)

# Refit on the chosen columns only, keeping the matching test slice for later use.
X_train_final, X_test_final = X_train[selected_features], X_test[selected_features]
final_model = fit_model(X_train_final, y_train)

# Report the chosen features and the score they reached.
print("Selected Features:", selected_features, sep="\n")
print(f"Best Adjusted R^2: {best_adj_r2}")

# Report the fitted coefficients of the reduced model.
coefficients = final_model.params
print("\nCoefficients:", coefficients, sep="\n")

Selected Features:
['enginesize', 'cylindernumber_four', 'enginetype_ohcv', 'drivewheel_fwd', 'stroke', 'enginetype_ohc', 'fuelsystem_4bbl', 'fuelsystem_idi', 'peakrpm', 'enginetype_l']
Best Adjusted R^2: 0.9154477818396042

Coefficients:
const                 -6529.495806
enginesize              175.507945
cylindernumber_four   -4210.687191
enginetype_ohcv       -5411.293004
drivewheel_fwd        -1485.024724
stroke                -4903.810800
enginetype_ohc         2659.969841
fuelsystem_4bbl        4704.513010
fuelsystem_idi         3694.181528
peakrpm                   3.041055
enginetype_l           2012.774325
dtype: float64


In [None]:
# Predict with the forward-selection model itself. The notebook-level `y_pred`
# is assigned by earlier cells for other models, so it does not hold this
# model's predictions. The constant is added the same way as when the model
# was scored during selection.
final_pred = final_model.predict(sm.add_constant(X_test_final))

# Actual vs. predicted price next to the selected feature values per test row.
results_df = pd.DataFrame({
    'Actual Values': y_test.values,
    'Predicted Values': final_pred,
    **X_test_final
})
print("\nResults:")
print(results_df.head())


Results:
     Actual Values  Predicted Values  enginesize  cylindernumber_four  \
15       30760.000      35609.220000         209                    0   
9        17859.167      18104.479600         131                    0   
100       9549.000       9196.905521         120                    1   
132      11850.000      13521.661092         121                    1   
68       28248.000      26509.196561         183                    0   

     enginetype_ohcv  drivewheel_fwd  stroke  enginetype_ohc  fuelsystem_4bbl  \
15                 0               0    3.39               1                0   
9                  0               0    3.40               1                0   
100                0               1    3.47               1                0   
132                0               1    3.07               1                0   
68                 0               0    3.64               1                0   

     fuelsystem_idi  peakrpm  enginetype_l  
15                0

Here we performed forward selection, which gives a high R-squared value. We have also printed the predicted values and the R-squared value.


