In [1]:
# IMPORT MAIN DATA ANALYSIS AND VISUALISATION LIBRARIES

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

In [4]:
# Read the brand/model/category table from the CSV in the working directory.
data = pd.read_csv(filepath_or_buffer='brands.csv')

In [5]:
# Print a summary of the freshly loaded frame: columns, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2072 entries, 0 to 2071
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Brand     2072 non-null   object
 1   Model     2071 non-null   object
 2   Category  2072 non-null   object
dtypes: object(3)
memory usage: 48.7+ KB


In [6]:
# Drop every row that has at least one missing value. Rebinding the name
# (instead of inplace=True) keeps the step explicit and chainable.
data = data.dropna()

In [7]:
# Re-check the frame summary after dropping rows with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2071 entries, 0 to 2071
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Brand     2071 non-null   object
 1   Model     2071 non-null   object
 2   Category  2071 non-null   object
dtypes: object(3)
memory usage: 64.7+ KB


In [8]:
# Show the column labels of the frame.
data.columns

Index(['Brand', 'Model', 'Category'], dtype='object')

In [9]:
# List the distinct body-style labels present in the Category column.
data['Category'].unique()

array(['coupes', 'sedans', 'SUVs', 'Avant', 'hatchbacks', 'convertibles',
       'limousines', 'shooting', 'minivans', 'roadsters', 'fastbacks',
       'touring', 'station', 'microvans', 'hardtop', 'Popemobiles',
       'Roadster', 'K-Cars', 'vans', 'Targa', 'pickup'], dtype=object)

In [10]:
# Remove every row whose Category is one of the excluded body styles.
# A single membership mask replaces four near-identical drop calls; the
# original index labels are kept, and .copy() makes the result an independent
# frame so that adding columns later does not operate on a slice.
excluded_categories = ['Popemobiles', 'K-Cars', 'microvans', 'touring']
data = data[~data['Category'].isin(excluded_categories)].copy()

In [11]:
# Confirm which body-style labels remain after the exclusion step.
data['Category'].unique()

array(['coupes', 'sedans', 'SUVs', 'Avant', 'hatchbacks', 'convertibles',
       'limousines', 'shooting', 'minivans', 'roadsters', 'fastbacks',
       'station', 'hardtop', 'Roadster', 'vans', 'Targa', 'pickup'],
      dtype=object)

In [12]:
# Re-check the frame summary after removing the excluded categories.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1969 entries, 0 to 2071
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Brand     1969 non-null   object
 1   Model     1969 non-null   object
 2   Category  1969 non-null   object
dtypes: object(3)
memory usage: 61.5+ KB


In [13]:
# Collapse the body styles into a binary label: SUVs and pickups are tagged
# OFFROADER, every other listed style is tagged NON-OFFROADER.
# A Category value missing from the mapping would become NaN in New_Category.
offroad_styles = ['SUVs', 'pickup']
other_styles = [
    'sedans', 'limousines', 'fastbacks', 'Targa', 'station',
    'Avant', 'shooting', 'coupes', 'hatchbacks', 'convertibles',
    'minivans', 'roadsters', 'hardtop', 'Roadster', 'vans',
]
category = {
    **dict.fromkeys(other_styles, 'NON-OFFROADER'),
    **dict.fromkeys(offroad_styles, 'OFFROADER'),
}
data['New_Category'] = data['Category'].map(category)

In [14]:
# Check which values the derived binary label takes.
data['New_Category'].unique()

array(['NON-OFFROADER', 'OFFROADER'], dtype=object)

In [15]:
# Preview the first ten rows, including the derived New_Category column.
data.head(n=10)

Unnamed: 0,Brand,Model,Category,New_Category
0,Acura,CL,coupes,NON-OFFROADER
1,Acura,Legend coupe,coupes,NON-OFFROADER
2,Acura,NSX,coupes,NON-OFFROADER
3,Acura,RSX,coupes,NON-OFFROADER
4,Acura,CSX,sedans,NON-OFFROADER
5,Acura,EL,sedans,NON-OFFROADER
6,Acura,ILX,sedans,NON-OFFROADER
7,Acura,RL,sedans,NON-OFFROADER
8,Acura,RLX,sedans,NON-OFFROADER
9,Acura,TL,sedans,NON-OFFROADER


In [16]:
# IMPORT LIBRARIES REQUIRED FOR SVC CLASSIFICATION

In [17]:
# Target is the binary New_Category label; the features are the columns
# left once both label columns are removed.
y = data['New_Category']
X = data.drop(columns=['Category', 'New_Category'])

In [18]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [19]:
# Pipeline: one-hot encode the inputs, then a default-parameter SVC.
# handle_unknown='ignore' stops the encoder from raising on categories it
# did not see while fitting.
svc_model = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    SVC(),
)

In [20]:
# Fit the encoder and the SVC on the training split.
svc_model.fit(X_train, y_train)

Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                ('svc', SVC())])

In [21]:
# Predict labels for the held-out test split.
svc_model_pred = svc_model.predict(X_test)

In [22]:
# Report SVC performance on the test split: confusion matrix first,
# then per-class precision/recall/F1.
print(confusion_matrix(y_test, svc_model_pred))
print('\n')  # blank spacer between the two reports
print(classification_report(y_test, svc_model_pred))

[[506   2]
 [ 82   1]]
/n
               precision    recall  f1-score   support

NON-OFFROADER       0.86      1.00      0.92       508
    OFFROADER       0.33      0.01      0.02        83

     accuracy                           0.86       591
    macro avg       0.60      0.50      0.47       591
 weighted avg       0.79      0.86      0.80       591



In [23]:
# Pipeline: one-hot encode the inputs, then a default-parameter logistic regression.
# LogisticRegression is imported in the notebook's top-level import cell.
lr_model = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    LogisticRegression(),
)

In [24]:
# Fit the encoder and the logistic regression on the training split.
lr_model.fit(X_train, y_train)

Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                ('logisticregression', LogisticRegression())])

In [25]:
# Predict labels for the held-out test split.
lr_model_pred = lr_model.predict(X_test)

In [26]:
# Report logistic-regression performance on the test split: confusion matrix
# first, then per-class precision/recall/F1.
print(confusion_matrix(y_test, lr_model_pred))
print('\n')  # blank spacer between the two reports
print(classification_report(y_test, lr_model_pred))

[[506   2]
 [ 82   1]]
/n
               precision    recall  f1-score   support

NON-OFFROADER       0.86      1.00      0.92       508
    OFFROADER       0.33      0.01      0.02        83

     accuracy                           0.86       591
    macro avg       0.60      0.50      0.47       591
 weighted avg       0.79      0.86      0.80       591



In [27]:
# Pipeline: one-hot encode the inputs, then a default-parameter k-nearest-neighbours classifier.
# KNeighborsClassifier is imported in the notebook's top-level import cell.
knc_model = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    KNeighborsClassifier(),
)

In [28]:
# Fit the encoder and the k-nearest-neighbours classifier on the training split.
knc_model.fit(X_train, y_train)

Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                ('kneighborsclassifier', KNeighborsClassifier())])

In [29]:
# Predict labels for the held-out test split.
knc_model_pred = knc_model.predict(X_test)

In [30]:
# Report k-nearest-neighbours performance on the test split: confusion matrix
# first, then per-class precision/recall/F1.
print(confusion_matrix(y_test, knc_model_pred))
print('\n')  # blank spacer between the two reports
print(classification_report(y_test, knc_model_pred))

[[493  15]
 [ 77   6]]
/n
               precision    recall  f1-score   support

NON-OFFROADER       0.86      0.97      0.91       508
    OFFROADER       0.29      0.07      0.12        83

     accuracy                           0.84       591
    macro avg       0.58      0.52      0.52       591
 weighted avg       0.78      0.84      0.80       591



In [31]:
# Pipeline: one-hot encode the inputs, then a random forest.
# RandomForestClassifier is imported in the notebook's top-level import cell.
# The forest is randomised, so a fixed seed (the same value used for the
# train/test split) makes the fitted model and its scores repeatable.
rfc_model = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    RandomForestClassifier(random_state=101),
)

In [32]:
# Fit the encoder and the random forest on the training split.
rfc_model.fit(X_train, y_train)

Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                ('randomforestclassifier', RandomForestClassifier())])

In [33]:
# Predict labels for the held-out test split.
rfc_model_pred = rfc_model.predict(X_test)

In [34]:
# Report random-forest performance on the test split: confusion matrix first,
# then per-class precision/recall/F1.
print(confusion_matrix(y_test, rfc_model_pred))
print('\n')  # blank spacer between the two reports
print(classification_report(y_test, rfc_model_pred))

[[506   2]
 [ 82   1]]
/n
               precision    recall  f1-score   support

NON-OFFROADER       0.86      1.00      0.92       508
    OFFROADER       0.33      0.01      0.02        83

     accuracy                           0.86       591
    macro avg       0.60      0.50      0.47       591
 weighted avg       0.79      0.86      0.80       591



In [35]:
# Spot-check a single car. The brand is spelled 'BMW' (a misspelt brand would
# be silently ignored by the encoder), and the input is a one-row frame using
# the same column names the pipeline was fitted on.
# NOTE(review): assumes 'BMW' is a brand present in the training data — confirm.
rfc_model.predict(pd.DataFrame([['BMW', '520i']], columns=['Brand', 'Model']))

array(['NON-OFFROADER'], dtype=object)