# Cars Type Prediction using Logistic Regression

# Read dataset

In [1]:
from warnings import filterwarnings
# NOTE(review): installs a blanket "ignore" filter, so warnings raised by
# pandas / scikit-learn in the cells below will not be shown.
filterwarnings("ignore")

import pandas as pd
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact file exists.
A = pd.read_csv("C:/Users/Shruti/Downloads/Cars93.csv")

In [2]:
# Replace the header read from the CSV with these 28 names. The assignment is
# positional, so the frame must have exactly 28 columns in this order.
A.columns=['id', 'Manufacturer', 'Model', 'Type', 'MinPrice', 'Price',
       'MaxPrice', 'MPGcity', 'MPGhighway', 'AirBags', 'DriveTrain',
       'Cylinders', 'EngineSize', 'Horsepower', 'RPM', 'Revpermile',
       'Mantransavail', 'Fueltankcapacity', 'Passengers', 'Length',
       'Wheelbase', 'Width', 'Turncircle', 'Rearseatroom', 'Luggageroom',
       'Weight', 'Origin', 'Make']

# Missing data treatment

In [3]:
# `replacer` comes from the local module `myFuctions` (not shown here).
# NOTE(review): its return value is not assigned, so it presumably fills the
# missing values of `A` in place — confirm against the helper's source.
from myFuctions import replacer
replacer(A)

# Drop Unwanted Columns


In [4]:
# Remove the 'id', 'Make' and 'Model' columns; they are not used as predictors.
A = A.drop(columns=["id", "Make", "Model"])

# Defining X and Y

In [5]:
# The target stays a one-column DataFrame; every remaining column is a
# candidate predictor.
Y = A[["Type"]]
X = A.drop(columns=["Type"])

# Selecting the best features for Prediction

In [6]:
def ANOVA(df,cat,con):
    """Return the ``PR(>F)`` p-value for `cat` from the ANOVA table of ``con ~ cat``.

    An OLS model with the formula ``con ~ cat`` is fitted on `df` with
    statsmodels, its ANOVA table is computed, and the p-value stored in the
    row labelled `cat` is returned rounded to three decimals.

    Parameters
    ----------
    df : DataFrame containing both columns.
    cat : str
        Column on the right-hand side of the formula.
    con : str
        Column on the left-hand side of the formula.
    """
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm
    from pandas import DataFrame

    formula = " ~ ".join([con, cat])
    fitted = ols(formula, df).fit()
    table = DataFrame(anova_lm(fitted))
    p_value = table.loc[cat, 'PR(>F)']
    return round(p_value, 3)

In [7]:
# Numeric predictors: keep those whose ANOVA p-value against Type is below 0.05.
imp_cols = []
for i in X.columns:
    if X[i].dtypes == "object":
        continue  # object-typed columns are screened with the chi-square test instead
    x = ANOVA(A, "Type", i)
    if x < 0.05:
        imp_cols.append(i)

In [8]:
from scipy.stats import chi2_contingency
def chisquare(df,cat1,cat2):
    """Return the p-value of a chi-square test on the cross-tab of two columns.

    ``df[cat1]`` is cross-tabulated against ``df[cat2]`` and the resulting
    contingency table is passed to ``scipy.stats.chi2_contingency``; the
    second element of its result (the p-value) is returned.
    """
    from pandas import crosstab

    contingency = crosstab(df[cat1], df[cat2])
    return chi2_contingency(contingency)[1]

In [9]:
# Object-typed predictors: keep those whose chi-square p-value against Type is below 0.05.
for i in X.columns:
    if X[i].dtypes != "object":
        continue  # numeric columns are screened with ANOVA instead
    x = chisquare(A, "Type", i)
    if x < 0.05:
        imp_cols.append(i)

In [10]:
#imp_cols

# Preprocessing

In [11]:
# `preprocessing` comes from the local module `myFuctions` (not shown here).
# Only the columns collected in `imp_cols` are passed to it.
# NOTE(review): presumably it scales numeric columns and encodes categorical
# ones — confirm against the helper's source.
from myFuctions import preprocessing
Xnew = preprocessing(X[imp_cols])

In [12]:
#Xnew

# Splitting the data into training and testing sets

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    Xnew,
    Y,
    test_size=0.2,
    random_state=21,
)

# Creating LR model

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic-regression classifier with scikit-learn's default settings.
# `ytrain` is a one-column DataFrame; flatten it to 1-D so `fit` receives the
# (n_samples,) target it expects rather than a column vector, which would
# raise a DataConversionWarning (not shown because warnings are globally ignored).
lr = LogisticRegression()
model = lr.fit(xtrain, ytrain.values.ravel())

# Predicted class labels for both partitions.
pred_ts = model.predict(xtest)
pred_tr = model.predict(xtrain)

# Fraction of correctly classified rows on the training and test partitions.
tr_acc = accuracy_score(ytrain, pred_tr)
ts_acc = accuracy_score(ytest, pred_ts)

In [15]:
# Accuracy on the training rows.
tr_acc

1.0

In [16]:
# Accuracy on the held-out test rows.
ts_acc

0.9473684210526315

# Confusion_matrix

In [17]:
from sklearn.metrics import confusion_matrix
# Rows correspond to the true Type, columns to the predicted Type.
confusion_matrix(y_true=ytest, y_pred=pred_ts)

array([[1, 0, 0, 0, 0, 0],
       [0, 3, 0, 0, 0, 0],
       [0, 0, 4, 0, 0, 0],
       [1, 0, 0, 3, 0, 0],
       [0, 0, 0, 0, 5, 0],
       [0, 0, 0, 0, 0, 2]], dtype=int64)

In [18]:
# Put the predictions next to the true labels and list them side by side.
# `assign` builds a new frame instead of writing a column into the object
# returned by train_test_split; that in-place write can raise pandas'
# SettingWithCopyWarning (not shown because warnings are globally ignored).
# NOTE(review): `ytest` now carries a second column, so the scoring and
# confusion-matrix cells need a fresh split before they can be re-run.
ytest = ytest.assign(Pred_Type=pred_ts)
ytest.sort_values(by=["Type", "Pred_Type"])

Unnamed: 0,Type,Pred_Type
91,Compact,Compact
21,Large,Large
17,Large,Large
19,Large,Large
10,Midsize,Midsize
3,Midsize,Midsize
1,Midsize,Midsize
36,Midsize,Midsize
23,Small,Compact
82,Small,Small
