In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the car listings CSV into a DataFrame.
# NOTE(review): absolute path; presumably specific to the hosted runtime this was run on — confirm.
df = pd.read_csv(filepath_or_buffer='/content/car.csv')

In [4]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [5]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [6]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
df_copy = df.copy(deep=True)

## Convert Categorical Values into Numerical Format

In [7]:
# List the distinct fuel categories present before encoding them.
df_copy['Fuel_Type'].unique()

array(['Petrol', 'Diesel', 'CNG'], dtype=object)

In [8]:
# Encode fuel type as integer codes through an explicit mapping.
# - Series.map avoids the implicit object->int downcast that Series.replace
#   relied on (deprecated since pandas 2.2).
# - Reading from the untouched `df` keeps this cell re-runnable: the source
#   column always holds the original labels.
# - astype('int64') fails loudly if a label outside the mapping appears.
fuel_type_codes = {'Petrol': 1, 'Diesel': 2, 'CNG': 3}
df_copy['Fuel_Type'] = df['Fuel_Type'].map(fuel_type_codes).astype('int64')



In [9]:
# Encode transmission as integer codes through an explicit mapping.
# Series.map avoids the implicit object->int downcast of Series.replace
# (deprecated since pandas 2.2); reading from the untouched `df` keeps the
# cell re-runnable, and astype('int64') fails loudly on an unmapped label.
transmission_codes = {'Manual': 1, 'Automatic': 2}
df_copy['Transmission'] = df['Transmission'].map(transmission_codes).astype('int64')

In [10]:
# Encode seller type as integer codes through an explicit mapping.
# Series.map avoids the implicit object->int downcast of Series.replace
# (deprecated since pandas 2.2); reading from the untouched `df` keeps the
# cell re-runnable, and astype('int64') fails loudly on an unmapped label.
seller_type_codes = {'Dealer': 1, 'Individual': 2}
df_copy['Seller_Type'] = df['Seller_Type'].map(seller_type_codes).astype('int64')

In [11]:
# Inspect the first rows after encoding Fuel_Type, Transmission and Seller_Type.
df_copy.head(5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,1,1,1,0
1,sx4,2013,4.75,9.54,43000,2,1,1,0
2,ciaz,2017,7.25,9.85,6900,1,1,1,0
3,wagon r,2011,2.85,4.15,5200,1,1,1,0
4,swift,2014,4.6,6.87,42450,2,1,1,0


In [12]:
# Check the column dtypes after encoding.
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    int64  
 6   Seller_Type    301 non-null    int64  
 7   Transmission   301 non-null    int64  
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(6), object(1)
memory usage: 21.3+ KB


In [13]:
# One-hot encoder configured to return a dense array instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)

In [14]:
# Names of the columns that are still object-typed at this point.
object_typed = df_copy.select_dtypes(include=['object'])
categorical_columns = object_typed.columns.tolist()

In [15]:
# Fit the encoder on the object-typed columns and encode them in one step.
categorical_frame = df_copy[categorical_columns]
one_hot_encoded = encoder.fit_transform(categorical_frame)

In [16]:
# Wrap the encoded array in a DataFrame labelled with the encoder's feature names.
# NOTE(review): one_hot_df is not referenced again in the visible cells, so the
# one-hot features never reach the model — confirm whether that is intended.
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoded_feature_names)

In [17]:
from sklearn.linear_model import LinearRegression

In [18]:
# Drop the car name before modelling.
# `columns=` already selects the axis, so the extra `axis=1` was redundant.
# errors='ignore' keeps the cell re-runnable: df_copy is reassigned here, so a
# second execution would otherwise raise KeyError for the missing column.
df_copy = df_copy.drop(columns=['Car_Name'], errors='ignore')

In [19]:
# Verify that Car_Name is no longer among the columns.
df_copy.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,5.59,27000,1,1,1,0
1,2013,4.75,9.54,43000,2,1,1,0
2,2017,7.25,9.85,6900,1,1,1,0
3,2011,2.85,4.15,5200,1,1,1,0
4,2014,4.6,6.87,42450,2,1,1,0


In [20]:
# Separate the regression target (Selling_Price) from the predictor columns.
X = df_copy.drop(columns=['Selling_Price'])
Y = df_copy['Selling_Price']

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=20,
)

In [23]:
# Confirm the split sizes: shapes of the test/train feature matrices, then the test/train targets.
x_test.shape,x_train.shape,y_test.shape,y_train.shape

((91, 7), (210, 7), (91,), (210,))

In [24]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test rows do not influence the fit.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [25]:
## Model training

from sklearn.linear_model import LinearRegression
# Linear regression estimator created with scikit-learn's default settings.
regression = LinearRegression()

In [26]:
# Fit the model on the scaled training features and their selling prices.
regression.fit(X=x_train, y=y_train)

In [27]:
# Predict selling prices for the held-out (scaled) test features.
y_pred_test = regression.predict(X=x_test)

In [28]:
## Performance metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Test-set errors: mean absolute error, mean squared error, and the root of the latter.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
rmse = np.sqrt(mse)

# Printed order: MSE, MAE, RMSE.
print(mse, " ", mae, " ", " ", rmse)

2.145569997089453   1.1343953708632184     1.4647764324597297


In [29]:
from sklearn.metrics import r2_score

# Coefficient of determination on the test split.
# The result is stored under its own name: assigning it to `r2_score` would
# replace the imported function with a float, so any later call to
# r2_score(...) would fail with "'float' object is not callable".
r2 = r2_score(y_test, y_pred_test)
print(r2)

0.8965614248515029
