In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
df = pd.read_csv('car_data.csv') 
print(df.describe())
print(df.info())
print(df.head())
df.columns

       Selling Price  Kilometers Driven         Year  Car Condition
count   2.237000e+03        2237.000000  2237.000000    2237.000000
mean    4.184431e+05       61928.605275  2013.763523       4.370854
std     2.280516e+05       42260.955917     2.874686       0.288990
min     7.529900e+04         913.000000  2006.000000       3.000000
25%     2.720990e+05       32137.000000  2012.000000       4.200000
50%     3.557990e+05       55430.000000  2014.000000       4.300000
75%     5.032990e+05       83427.000000  2016.000000       4.600000
max     1.952397e+06      855881.000000  2020.000000       5.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2237 entries, 0 to 2236
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Model              2237 non-null   object 
 1   Selling Price      2237 non-null   int64  
 2   Kilometers Driven  2237 non-null   int64  
 3   Year               2237 non-null   in

Index(['Model', 'Selling Price', 'Kilometers Driven', 'Year', 'Owner',
       'Fuel Type', 'Transmission', 'Insurance', 'Car Condition'],
      dtype='object')

In [7]:
selected_cols = ['Selling Price', 'Kilometers Driven', 'Year', 'Car Condition', 'Fuel Type', 'Transmission']
df = df[selected_cols]
df.dropna(inplace=True)

df_encoded = pd.get_dummies(df, columns=['Fuel Type', 'Transmission'], drop_first=True)

X = df_encoded.drop('Selling Price', axis=1)
y = df_encoded['Selling Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_
}).sort_values(by='Coefficient', key=abs, ascending=False)

print("\n Coefficient Table:")
print(coef_df)

print("\n Model Performance:")
print(f"Intercept: {model.intercept_:.2f}")
print(f"R² Score: {r2:.4f}")
print(f"Mean Squared Error: {mse:.2f}")
top_feature = coef_df.iloc[0]
print(f"\n Most impactful feature: '{top_feature['Feature']}' with coefficient {top_feature['Coefficient']:.2f}")



 Coefficient Table:
              Feature   Coefficient
25  Transmission_HR12  4.275201e+05
16  Transmission_DL8C  4.154083e+05
51  Transmission_RJ14 -3.822905e+05
53  Transmission_TN02 -3.243038e+05
41  Transmission_MH04  2.919484e+05
..                ...           ...
52  Transmission_RJ45 -3.152573e-05
49  Transmission_PB11 -1.341749e-05
21  Transmission_GJ18 -1.076411e-05
10  Transmission_DL14  8.806965e-07
55  Transmission_TN12  3.388712e-11

[63 rows x 2 columns]

 Model Performance:
Intercept: -67130628.06
R² Score: 0.4788
Mean Squared Error: 24104359606.28

 Most impactful feature: 'Transmission_HR12' with coefficient 427520.10
