# **Week 5 :** 
#### Question: **Perform data preprocessing on a dataset** (e.g., Titanic dataset) **including cleaning, handling missing values, transformation, normalization, encoding, and feature engineering for predictive modeling.**
---

### *Importing Libraries and Dataset*

In [40]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [3]:
# Read the car listings from the CSV file (path is relative to the working
# directory), then preview the first five rows to check how the columns parsed.
data = pd.read_csv(filepath_or_buffer='carsdataset.csv')
data.head(n=5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [4]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4339 non-null   object
 5   seller_type    4333 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [5]:
# Summary statistics for every column; include='all' reports the object
# (categorical) columns alongside the numeric ones.
data.describe(include='all')

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
count,4340,4340.0,4340.0,4340.0,4339,4333,4340,4340
unique,1491,,,,5,3,2,5
top,Maruti Swift Dzire VDI,,,,Diesel,Individual,Manual,First Owner
freq,69,,,,2152,3237,3892,2832
mean,,2013.090783,504127.3,66215.777419,,,,
std,,4.215344,578548.7,46644.102194,,,,
min,,1992.0,20000.0,1.0,,,,
25%,,2011.0,208749.8,35000.0,,,,
50%,,2014.0,350000.0,60000.0,,,,
75%,,2016.0,600000.0,90000.0,,,,


### *Data Cleaning and Handling Missing Values*

In [6]:
# Count missing values per column to see which columns need imputation.
data.isna().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             1
seller_type      7
transmission     0
owner            0
dtype: int64

In [None]:
# Impute the missing categorical values with each column's most frequent
# value (its mode).
# The filled Series is assigned back to the frame instead of calling
# fillna(..., inplace=True) on the column selection: that form is chained
# assignment, which recent pandas versions warn about and which does not
# update `data` once Copy-on-Write is enabled.
data['seller_type'] = data['seller_type'].fillna(data['seller_type'].mode()[0])
data['fuel'] = data['fuel'].fillna(data['fuel'].mode()[0])

In [8]:
# Re-count missing values per column to confirm that no gaps remain after imputation.
data.isna().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

### *Data Transformation* by *Label Encoding*

In [9]:
# Preview the rows while the categorical columns still hold their text labels.
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [10]:
# Replace each categorical column's text labels with integer codes.
# A separate LabelEncoder is fitted per column and stored in `label_encoders`,
# so every column's label <-> code mapping stays available afterwards
# (e.g. for inverse_transform, or for encoding new rows the same way).
# Re-fitting one shared encoder would retain only the last column's mapping.
categorical_columns = ['fuel', 'seller_type', 'transmission', 'owner']
label_encoders = {}

for column in categorical_columns:
    # After the loop, `label_encoder` refers to the encoder fitted on 'owner'.
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

In [11]:
# Preview the rows to inspect the integer codes now stored in the encoded columns.
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,4,1,1,0
1,Maruti Wagon R LXI Minor,2007,135000,50000,4,1,1,0
2,Hyundai Verna 1.6 SX,2012,600000,100000,1,1,1,0
3,Datsun RediGO T Option,2017,250000,46000,4,1,1,0
4,Honda Amaze VX i-DTEC,2014,450000,141000,1,1,1,2


### *Normalization* by *Min-Max Scaler*

In [15]:
# Rescale the listed numeric columns with min-max scaling and write the
# scaled values back into the same columns of `data`.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, and 'selling_price' (used as the target later) is rescaled too --
# confirm this is intended, since statistics from the test rows leak into training.
numerical_features = ['selling_price', 'km_driven']

scalar = MinMaxScaler()
scaled_values = scalar.fit_transform(data[numerical_features])
data[numerical_features] = scaled_values

In [16]:
# Preview the rows to check the rescaled selling_price and km_driven values.
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,0.004505,0.086783,4,1,1,0
1,Maruti Wagon R LXI Minor,2007,0.01295,0.061988,4,1,1,0
2,Hyundai Verna 1.6 SX,2012,0.065315,0.123976,1,1,1,0
3,Datsun RediGO T Option,2017,0.025901,0.057028,4,1,1,0
4,Honda Amaze VX i-DTEC,2014,0.048423,0.174807,1,1,1,2


### *Feature Engineering* for *Predictive Modeling*

In [30]:
# Drop the free-text 'name' column so it is not passed on as a feature.
# errors='ignore' keeps this cell re-runnable: once 'name' has been removed,
# running the cell again is a no-op instead of raising KeyError.
data = data.drop(columns=['name'], errors='ignore')

In [31]:
# Separate the target column (selling_price) from the predictor columns.
y = data['selling_price']
X = data.drop(columns=['selling_price'])

In [32]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Dimensions of the training features as (rows, columns).
X_train.shape

(3472, 6)

In [34]:
# Dimensions of the test features as (rows, columns).
X_test.shape

(868, 6)

In [35]:
# Number of training target values; should match the row count of X_train.
y_train.shape

(3472,)

In [36]:
# Number of test target values; should match the row count of X_test.
y_test.shape

(868,)

---
---