In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import datetime


In [None]:
# Load the training data from the CSV file into a DataFrame.
Auto_data = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Count the missing (NaN) entries in every column and print the per-column totals.
missing_count = Auto_data.isna().sum(axis=0)
print(missing_count)

Unnamed: 0              0
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  38
New_Price            5032
Price                   0
dtype: int64


In [None]:
# b. Strip the unit suffix from each measurement column, keeping only the
# leading token (the text before the first space) converted to float.
# NOTE(review): if any of these columns mixes several units, dropping the
# suffix makes the numbers incomparable -- confirm against the raw data.
for unit_column in ('Mileage', 'Engine', 'Power', 'New_Price'):
    leading_token = Auto_data[unit_column].str.split(' ', expand=True)[0]
    Auto_data[unit_column] = leading_token.astype(float)
print(Auto_data.head())

   Unnamed: 0                              Name    Location  Year  \
0           1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1           2                      Honda Jazz V     Chennai  2011   
2           3                 Maruti Ertiga VDI     Chennai  2012   
3           4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4           6            Nissan Micra Diesel XV      Jaipur  2013   

   Kilometers_Driven Fuel_Type Transmission Owner_Type  Mileage  Engine  \
0              41000    Diesel       Manual      First    19.67  1582.0   
1              46000    Petrol       Manual      First    13.00  1199.0   
2              87000    Diesel       Manual      First    20.77  1248.0   
3              40670    Diesel    Automatic     Second    15.20  1968.0   
4              86999    Diesel       Manual      First    23.08  1461.0   

    Power  Seats  New_Price  Price  
0  126.20    5.0        NaN  12.50  
1   88.70    5.0       8.61   4.50  
2   88.76    7.0       

In [None]:
# Print the mean and median of Engine, Power and Seats (in that order) so the
# two statistics can be compared for each column.
for stat_column in ('Engine', 'Power', 'Seats'):
    mean_value = Auto_data[stat_column].mean()
    median_value = Auto_data[stat_column].median()
    print(f"Mean: {mean_value}")
    print(f"Median: {median_value}")

Mean: 1631.552572706935
Median: 1497.0
Mean: 113.80314403717087
Median: 98.6
Mean: 5.286452057152694
Median: 5.0


In [None]:
# Show the DataFrame dimensions as a (rows, columns) tuple.
Auto_data.shape

(5847, 14)

In [None]:
# a. Impute missing values.
# The mean and median of Engine, Power and Seats are relatively close to one
# another, even though the distributions contain outliers and are somewhat
# skewed, so either statistic is a reasonable fill value; the mean is used here.
#
# Each column is re-assigned instead of calling fillna(..., inplace=True) on
# the selected column: that chained in-place form operates on an intermediate
# object, raises a FutureWarning on recent pandas versions and leaves the
# DataFrame unchanged once Copy-on-Write is enabled.
engine_mean = Auto_data['Engine'].mean()
Auto_data['Engine'] = Auto_data['Engine'].fillna(engine_mean)
power_mean = Auto_data['Power'].mean()
Auto_data['Power'] = Auto_data['Power'].fillna(power_mean)
seats_mean = Auto_data['Seats'].mean()
Auto_data['Seats'] = Auto_data['Seats'].fillna(seats_mean)

# Drop the New_Price column.
# New_Price is missing in about 5032 of the 5847 rows. Filling that many gaps
# with the mean, median or mode could hurt the model's performance, and other
# informative variables such as Engine and Mileage remain available to
# predict the target variable.
Auto_data = Auto_data.drop(columns=['New_Price'])

# Re-check the per-column missing counts after the imputation and the drop.
# NOTE(review): Mileage still has missing values after this cell -- confirm
# they are handled later in the notebook.
missing_count = Auto_data.isna().sum()
print(missing_count)

Unnamed: 0           0
Name                 0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              2
Engine               0
Power                0
Seats                0
Price                0
dtype: int64


In [None]:
# Show the DataFrame dimensions as a (rows, columns) tuple.
Auto_data.shape

(5847, 13)

In [None]:
# c. One-hot encode the Fuel_Type and Transmission categorical columns.
# dtype=int pins the indicator columns to 0/1 integers; without it the dtype
# depends on the installed pandas version (bool from pandas 2.0 onward).
Auto_data = pd.get_dummies(
    Auto_data,
    columns=['Fuel_Type', 'Transmission'],
    dtype=int,
)
Auto_data.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Owner_Type,Mileage,Engine,Power,Seats,Price,Fuel_Type_Diesel,Fuel_Type_Electric,Fuel_Type_Petrol,Transmission_Automatic,Transmission_Manual
0,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,First,19.67,1582.0,126.2,5.0,12.5,1,0,0,0,1
1,2,Honda Jazz V,Chennai,2011,46000,First,13.0,1199.0,88.7,5.0,4.5,0,0,1,0,1
2,3,Maruti Ertiga VDI,Chennai,2012,87000,First,20.77,1248.0,88.76,7.0,6.0,1,0,0,0,1
3,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Second,15.2,1968.0,140.8,5.0,17.74,1,0,0,1,0
4,6,Nissan Micra Diesel XV,Jaipur,2013,86999,First,23.08,1461.0,63.1,5.0,3.5,1,0,0,0,1


In [None]:
# d. Add a Car_Life column: the current calendar year minus the car's Year.
# The reference year comes from the system clock, so the values depend on
# when the notebook is executed.
now = datetime.datetime.now().year
Auto_data['Car_Life'] = Auto_data['Year'].rsub(now)
Auto_data.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Owner_Type,Mileage,Engine,Power,Seats,Price,Fuel_Type_Diesel,Fuel_Type_Electric,Fuel_Type_Petrol,Transmission_Automatic,Transmission_Manual,Car_Life
0,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,First,19.67,1582.0,126.2,5.0,12.5,1,0,0,0,1,8
1,2,Honda Jazz V,Chennai,2011,46000,First,13.0,1199.0,88.7,5.0,4.5,0,0,1,0,1,12
2,3,Maruti Ertiga VDI,Chennai,2012,87000,First,20.77,1248.0,88.76,7.0,6.0,1,0,0,0,1,11
3,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Second,15.2,1968.0,140.8,5.0,17.74,1,0,0,1,0,10
4,6,Nissan Micra Diesel XV,Jaipur,2013,86999,First,23.08,1461.0,63.1,5.0,3.5,1,0,0,0,1,10
