# Classifier model to deal with categorical missing values

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from scipy import stats

In [2]:
#Loading the dataset
df = pd.read_csv('cars_price.csv')
df['volume(cm3)'] = df['volume(cm3)'].fillna(0)
df.head()

Unnamed: 0.1,Unnamed: 0,make,model,priceUSD,year,condition,mileage(kilometers),fuel_type,volume(cm3),color,transmission,drive_unit,segment
0,0,honda,accord,565,1993,with mileage,960015.0,petrol,2000.0,black,mechanics,front-wheel drive,D
1,1,ford,fusion,5550,2008,with mileage,172000.0,diesel,1400.0,silver,mechanics,front-wheel drive,M
2,2,nissan,teana,8300,2008,with mileage,223000.0,petrol,2500.0,purple,auto,front-wheel drive,D
3,3,volkswagen,fox,3300,2005,with mileage,140000.0,petrol,1200.0,blue,mechanics,front-wheel drive,A
4,4,nissan,primera,2450,2002,with damage,413000.0,diesel,2200.0,burgundy,mechanics,front-wheel drive,D


In [3]:
test = df[df.isnull().any(axis=1)]
test = test.dropna(axis=0, how='any', thresh=None, subset=['drive_unit'], inplace=False)
test.shape

(3636, 13)

In [4]:
#dropping null values to create training data
train = df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

In [5]:
#dropping unnecessary columns
train.drop('color', inplace = True, axis = 1)
test.drop('color', inplace = True, axis = 1)
#as model attribute shows a large variation and no trend we would drop it
train.drop('model', inplace = True, axis = 1) 
test.drop('model', inplace = True, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [6]:
#removing outliers
z_scores = stats.zscore(train['priceUSD'])

abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3)
train = train[filtered_entries]
train.shape

(34461, 11)

In [7]:
#encoding the data
def label_encoder(attrib, data_frame):
    label_encoder = preprocessing.LabelEncoder() 
    data_frame[attrib]= label_encoder.fit_transform(data_frame[attrib]) 
    

In [8]:
#encoding training dataset label encoder
label_encoder('segment', train)
label_encoder('make', train)
label_encoder('year', train)

In [9]:
# encoding training dataset one hot encoder
temp_df = pd.DataFrame({
        'fuel_type': ['petrol', 'diesel', 'electrocar']
    })
train = pd.concat([train,pd.get_dummies(train['fuel_type'], prefix='fuel_type',drop_first=True)],axis=1)
train.drop(['fuel_type'],axis=1, inplace=True)

temp_df = pd.DataFrame({
        'transmission': ['mechanics', 'auto']
    })
train = pd.concat([train,pd.get_dummies(train['transmission'], prefix='transmission',drop_first=True)],axis=1)
train.drop(['transmission'],axis=1, inplace=True)

temp_df = pd.DataFrame({
        'drive_unit': ['front-wheel drive', 'part-time four-wheel drive', 'rear drive',
       'all-wheel drive']
    })
train = pd.concat([train,pd.get_dummies(train['drive_unit'], prefix='drive_unit',drop_first=True)],axis=1)
train.drop(['drive_unit'],axis=1, inplace=True)

temp_df = pd.DataFrame({
        'condition': ['with mileage', 'with damage', 'for parts']
    })
train = pd.concat([train,pd.get_dummies(train['condition'], prefix='condition',drop_first=True)],axis=1)
train.drop(['condition'],axis=1, inplace=True)


In [10]:
test.drop('segment', inplace = True, axis = 1)
#encoding test dataset
label_encoder('make',test)
label_encoder('year', test)


In [11]:
#one hot encoding
temp_df = pd.DataFrame({
        'fuel_type': ['petrol', 'diesel', 'electrocar']
    })
test = pd.concat([test,pd.get_dummies(test['fuel_type'], prefix='fuel_type',drop_first=True)],axis=1)
test.drop(['fuel_type'],axis=1, inplace=True)

temp_df = pd.DataFrame({
        'transmission': ['mechanics', 'auto']
    })
test = pd.concat([test,pd.get_dummies(test['transmission'], prefix='transmission',drop_first=True)],axis=1)
test.drop(['transmission'],axis=1, inplace=True)

temp_df = pd.DataFrame({
        'drive_unit': ['front-wheel drive', 'part-time four-wheel drive', 'rear drive',
       'all-wheel drive']
    })
test = pd.concat([test,pd.get_dummies(test['drive_unit'], prefix='drive_unit',drop_first=True)],axis=1)
test.drop(['drive_unit'],axis=1, inplace=True)

temp_df = pd.DataFrame({
        'condition': ['with mileage', 'with damage', 'for parts']
    })
test = pd.concat([test,pd.get_dummies(test['condition'], prefix='condition',drop_first=True)],axis=1)
test.drop(['condition'],axis=1, inplace=True)

In [12]:
#Random Forest Model 
from sklearn.model_selection import train_test_split
X = train[['make','priceUSD','year','volume(cm3)','fuel_type_electrocar','fuel_type_petrol','transmission_mechanics','drive_unit_front-wheel drive','drive_unit_part-time four-wheel drive','drive_unit_rear drive']]
y = train['segment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=400)
clf.fit(X_train,y_train)


RandomForestClassifier(n_estimators=400)

In [13]:
y_pred=clf.predict(X_test)
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7610259981429898


In [14]:
X_missing = test[['make','priceUSD','year','volume(cm3)','fuel_type_electrocar','fuel_type_petrol','transmission_mechanics','drive_unit_front-wheel drive','drive_unit_part-time four-wheel drive','drive_unit_rear drive']]
abc = sc.fit_transform(X_missing)
res =clf.predict(abc)
test.insert(6, "segment", res, True) 

In [15]:
frames = [train,test]

result = pd.concat(frames)

result.to_csv('new.csv') 

In [16]:
result.shape

(38097, 15)