# CarsForSale Challenge Solutions

In [1]:
#Basic imports
import pandas as pd
import numpy as np

In [2]:
# Load the raw listings; the path is relative to the notebook's working directory.
RAW_CSV_PATH = '../input/carsforsale/cars_raw.csv'
df = pd.read_csv(RAW_CSV_PATH)
df.head()

In [3]:
# Summarise column dtypes and non-null counts of the raw frame.
df.info()

### 1. Modifying 'Price' Column

In [4]:
# Remove rows with 'Not Priced'. The explicit .copy() makes the result an
# independent frame, so the column assignment below does not write into a
# slice of the original (which raises SettingWithCopyWarning).
df = df[df['Price'] != 'Not Priced'].copy()

# Strip the currency symbol and thousands separators, then convert to integers.
# regex=False treats '$' as a literal character rather than a regex anchor.
df['Price'] = (
    df['Price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

### 2. Aggregating 'Certified' Values

In [5]:
def _condition_label(raw):
    """Return 'Certified Pre-Owned' for any value mentioning 'certified', else 'Used'."""
    if 'certified' in raw.casefold():
        return 'Certified Pre-Owned'
    return 'Used'


# Collapse the various "... Certified" spellings into a single label.
df['Used/New'] = df['Used/New'].map(_condition_label)

In [6]:
# List the distinct condition labels present in the column.
df['Used/New'].unique()

### 3. Modifying 'Drivetrain' Column

In [7]:
# List the distinct drivetrain spellings present in the column.
df['Drivetrain'].unique()

In [8]:
# Keep only the rows whose drivetrain is something other than the dash placeholder
has_drivetrain = df['Drivetrain'].ne('–')
df = df.loc[has_drivetrain]

In [9]:
def _normalize_drivetrain(raw):
    """Map a free-text drivetrain description to one of '4WD', 'AWD', 'RWD', 'FWD'.

    Both the spelled-out forms ('Four-wheel Drive', 'All-wheel Drive', ...) and
    the abbreviations ('4WD', 'AWD', 'RWD') are recognised. Matching only the
    spelled-out words would send an already-abbreviated value to the 'FWD'
    fallback, which also made re-running this cell collapse every row to 'FWD'.
    Anything unrecognised still falls back to 'FWD'.
    """
    text = raw.casefold()
    if 'four' in text or '4wd' in text or '4x4' in text:
        return '4WD'
    if 'all' in text or 'awd' in text:
        return 'AWD'
    if 'rear' in text or 'rwd' in text:
        return 'RWD'
    return 'FWD'


df['Drivetrain'] = df['Drivetrain'].map(_normalize_drivetrain)

In [10]:
# Show which drivetrain labels the column contains at this point.
df['Drivetrain'].unique()

### 4. Create Dummy Variables for Predictive Modeling

In [11]:
# Show the column names currently in the frame.
df.columns

In [12]:
# Keep only the rows whose 'FuelType' is something other than the dash placeholder
has_fuel_type = df['FuelType'].ne('–')
df = df.loc[has_fuel_type]

In [13]:
def _fuel_group(raw):
    """Bucket a free-text fuel description into Hybrid/Flex/Electric/Diesel/Gas."""
    text = raw.casefold()
    # Plug-in variants are counted as hybrids; the check order matters because
    # hybrid descriptions may also mention another fuel keyword.
    if 'hybrid' in text or 'plug' in text:
        return 'Hybrid'
    if 'flex' in text:
        return 'Flex'
    if 'electric' in text:
        return 'Electric'
    if 'diesel' in text:
        return 'Diesel'
    return 'Gas'


def _transmission_group(raw):
    """Bucket a free-text transmission description into Automatic/Manual/Other."""
    text = raw.casefold()
    if 'auto' in text:
        return 'Automatic'
    if 'manual' in text:
        return 'Manual'
    return 'Other'


#Binary encoding without pd.get_dummies (1 = 'Used' / 'Dealer', 0 = anything else)
#NOTE: not idempotent - once encoded, re-running this cell maps every row to 0.
#assign() returns a new frame, so nothing is written into a filtered slice.
df = df.assign(**{
    'Used/New': (df['Used/New'] == 'Used').astype(int),
    'SellerType': (df['SellerType'] == 'Dealer').astype(int),
    #Modify 'FuelType' column
    'FuelType': df['FuelType'].map(_fuel_group),
})

#Modify 'DealType' column: drop rows with no deal rating.
#An in-place fillna on a column selection is deprecated in pandas 2.x and has
#no effect on df under Copy-on-Write, so the rows are filtered with an explicit
#mask instead. The literal string 'None' is still excluded, as before.
has_deal_type = df['DealType'].notna() & (df['DealType'] != 'None')
df = df[has_deal_type]

#New Columns
df = df.assign(TransmissionType=df['Transmission'].map(_transmission_group))

In [14]:
# One-hot encode the remaining categorical columns.
# dtype=int keeps the indicators numeric (0/1). Since pandas 2.0 the default is
# bool, which turns the later np.array(df...) feature matrix into object dtype
# and makes formula-based models treat the columns as categorical.
make = pd.get_dummies(df['Make'], dtype=int)
drivetrain = pd.get_dummies(df['Drivetrain'], dtype=int)
# Prefix the transmission indicators so their names read TM_Automatic, TM_Manual, ...
transmission = pd.get_dummies(df['TransmissionType'], prefix='TM', dtype=int)
fueltype = pd.get_dummies(df['FuelType'], dtype=int)

In [15]:
# Columns that are either replaced by indicator columns or not used as features.
columns_to_drop = [
    'Make', 'Drivetrain', 'Transmission', 'TransmissionType', 'SellerName',
    'StreetName', 'State', 'Zipcode', 'FuelType', 'ExteriorColor', 'InteriorColor',
    'Engine', 'VIN', 'Stock#', 'Year', 'Model',
]

# Drop them and append the indicator frames side by side (aligned on the index).
trimmed = df.drop(columns=columns_to_drop)
df = pd.concat([trimmed, make, drivetrain, transmission, fueltype], axis=1)

In [16]:
#Check that the dataframe is properly set up: inspect dtypes and non-null counts after the drop/concat
df.info()

### 5. Create a Classification Model

We will create a classification model that predicts `DealType` from all of our other features.

In [17]:
#Modeling imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame's columns.
df.describe()



In [19]:
df.boxplot(column ='Price') #Univariate analysis: box plot of Price to spot outliers

In [20]:
df.boxplot(column = 'Price', by = 'DealType') #Bivariate analysis: Price distribution within each DealType

In [21]:
# Contingency table of ConsumerRating against DealType; margins=True adds row and column totals.
pd.crosstab(df['ConsumerRating'],df['DealType'],margins=True)



In [22]:
# Contingency table of the encoded Used/New flag against DealType, with totals.
pd.crosstab(df['Used/New'],df['DealType'],margins=True)

In [23]:
# Number of missing values in each column.
df.isnull().sum()

In [24]:
# Pull the target column out of the frame and display it.
Deal = df['DealType']
Deal

In [25]:
#Label-encode the target (DealType): each distinct category becomes an integer code.
#NOTE: despite its name, labelencoder_x is applied to the target here, not to the features.
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
labelencoder_x=LabelEncoder()

# Deal is rebound to the encoder's output, replacing the Series taken from df.
Deal =labelencoder_x.fit_transform(Deal)

In [26]:
# Write the integer codes back so DealType is numeric inside the frame as well.
df['DealType'] = Deal
df.head()

In [27]:
# Pull the ConsumerRating column out of the frame and display it.
ConsumerRating = df['ConsumerRating']
ConsumerRating

In [28]:
import statsmodels.api as sm

# sm.OLS does not add an intercept on its own; without a constant column the
# fitted line is forced through the origin. add_constant prepends a column of ones.
deal_with_const = sm.add_constant(Deal)

# Regress ConsumerRating (endog) on the encoded DealType (exog).
model = sm.OLS(ConsumerRating, deal_with_const).fit()
model.summary()

In [29]:
# Mean of the encoded DealType column.
mean_deal= df["DealType"].mean()
mean_deal

In [30]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(ConsumerRating,Deal,'o')
plt.ylabel=("ConsumerRating")
plt.xlabel=("DealType")
plt.axhline(mean_deal,color='r',linestyle='-')
sns.regplot(x='ConsumerRating',y='DealType',data=df,color='g')

In [31]:
# Since the p-value obtained is < 0.05, the null hypothesis is rejected (a p-value below the significance level is evidence against the null hypothesis, not for it).

In [32]:
# Pairwise covariance between the columns of df

df.cov()

In [33]:
# Pairwise correlation between the columns of df.
df.corr()

In [34]:
#Multiple Linear Regression

In [35]:
# List the available column names (used to build the regression formula).
df.columns

In [36]:
# Imported as smf so the name `sm` keeps pointing at statsmodels.api.
import statsmodels.formula.api as smf

# Predictors of the multiple regression, in the order they appear in the formula.
predictors = [
    'ConsumerRating', 'ComfortRating', 'ConsumerReviews', 'InteriorDesignRating',
    'PerformanceRating', 'ValueForMoneyRating', 'ExteriorStylingRating',
    'ReliabilityRating', 'SellerRating',
    'TM_Automatic', 'TM_Manual', 'TM_Other',
    'Diesel', 'Electric', 'Flex', 'Gas', 'Hybrid',
]
# NOTE(review): every TM_* indicator and every fuel indicator is included next to
# the implicit intercept; if all levels occur in the data the design matrix is
# rank-deficient (dummy-variable trap) - consider dropping one level per group.
formula = 'DealType ~ ' + ' + '.join(predictors)

model2 = smf.ols(formula=formula, data=df).fit()
model2.summary()

In [37]:
#According to the model, the categorical variables are the only predictors on which our target depends.

In [38]:
#Prepping the data: every column except the target is a feature
feature_frame = df.drop(columns='DealType')
X = feature_frame.to_numpy(copy=True)
Y = df['DealType'].to_numpy(copy=True)

# Shuffled split with a fixed seed so the partition is reproducible.
XTRAIN, XTEST, YTRAIN, YTEST = train_test_split(
    X,
    Y,
    random_state=69,
    shuffle=True,
)

In [39]:
#Fit a multiple linear regression on the training split
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained.
reg = LinearRegression().fit(XTRAIN, YTRAIN)
reg

In [40]:
#Predict on the test set with the fitted linear regression
y_pred=reg.predict(XTEST)
y_pred

In [41]:
# Dimensions of the full feature matrix.
X.shape

In [42]:
#Building the optimal model using Backward Elimination

#In multiple linear regression,  y = b0 + b1X1 + b2X2 + ... + bnXn
#which can also be written as    y = b0X0 + b1X1 + b2X2 + ... + bnXn  where X0 = 1,
#so a column of ones is prepended to X to represent the intercept.

# NOTE: this import rebinds `sm` to the formula API for the rest of the notebook.
import statsmodels.formula.api as sm

# Take the row count from X itself instead of hard-coding it, so the cell keeps
# working when the number of rows left after cleaning changes.
# NOTE: not idempotent - every re-run prepends another column of ones.
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)
X

====>Logistic Regression
Logistic regression is a statistical method for analysing a data set in which one or more independent variables determine an outcome. In its basic form the outcome is measured with a dichotomous variable (one with only two possible outcomes); scikit-learn's `LogisticRegression` also supports targets with more than two classes. The goal of logistic regression is to find the best-fitting model to describe the relationship between the characteristic of interest (the dependent variable, also called the response or outcome variable) and a set of independent (predictor or explanatory) variables.
Why Logistic Regression instead of Linear Regression?
1. Linear regression output as probabilities:
         Linear regression can produce values that are less than 0 or greater than 1, which cannot be interpreted as probabilities. Logistic regression was introduced to keep the output between 0 and 1.
2. Outcome:
         In linear regression, the outcome (dependent variable) is continuous. It can have any one of an infinite number of possible values.
         In logistic regression, the outcome (dependent variable) has only a limited number of possible values.

In [44]:
from sklearn.linear_model import LogisticRegression

# Fixed seed for reproducibility; raised iteration cap for the solver.
lr_params = {'random_state': 0, 'max_iter': 1000}
cls = LogisticRegression(**lr_params)

# fit() returns the estimator itself, so lr_cls and cls are the same fitted object.
cls.fit(XTRAIN, YTRAIN)
lr_cls = cls

In [45]:
# Logistic-regression predictions for the test split.
lr_y_test = lr_cls.predict(XTEST)
lr_y_test

In [46]:
# Logistic-regression predictions for the training split.
lr_y_train = lr_cls.predict(XTRAIN)
lr_y_train

In [47]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the logistic model on the test split
# (rows are the true classes, columns the predicted classes).
lr_cm_test = confusion_matrix(y_true=YTEST, y_pred=lr_y_test)
lr_cm_test

In [48]:
# Confusion matrix of the logistic model on the training split.
lr_cm_train = confusion_matrix(YTRAIN,lr_y_train)
lr_cm_train

In [49]:
# Fraction of test rows the logistic model classifies correctly.
from sklearn.metrics import accuracy_score
lr_accuracy_test = accuracy_score(YTEST,lr_y_test)
lr_accuracy_test

In [50]:
# Training accuracy of the logistic model, for comparison with the test accuracy.
lr_accuracy_train = accuracy_score(YTRAIN,lr_y_train)
lr_accuracy_train

In [51]:
from sklearn.metrics import precision_score

# Test precision of the logistic model; 'weighted' averages the per-class
# precision using each class's support as the weight.
lr_precision_test = precision_score(
    y_true=YTEST,
    y_pred=lr_y_test,
    average='weighted',
)
lr_precision_test

In [52]:
# Support-weighted precision of the logistic model on the training split.
lr_precision_train = precision_score(YTRAIN,lr_y_train,average='weighted')
lr_precision_train

====> kNN Classification

In [55]:
from sklearn import neighbors

#Fit a k-nearest-neighbours classifier (default settings) on the training set.
# fit() returns the estimator itself, so construction and fitting can be chained.
clf = neighbors.KNeighborsClassifier().fit(XTRAIN, YTRAIN)
clf

In [56]:
# kNN predictions for the test split.
kc_y_test=clf.predict(XTEST)

In [57]:
#kNN predictions for the training split
kc_y_train=clf.predict(XTRAIN)

In [58]:
#Obtaining confusion matrix on train data (true labels first, then the kNN predictions)
from sklearn.metrics import confusion_matrix
clf_cm_train = confusion_matrix(YTRAIN,kc_y_train)
clf_cm_train

In [59]:
# Confusion matrix of the kNN model on the test split.
from sklearn.metrics import confusion_matrix
clf_cm_test = confusion_matrix(YTEST,kc_y_test)
clf_cm_test

In [60]:
# Test accuracy of the kNN model; for a classifier, score() returns mean accuracy.
kc_accuracy_test = clf.score(XTEST,YTEST)
kc_accuracy_test

In [61]:
# Support-weighted precision of the kNN model on the test split.
from sklearn.metrics import precision_score
kc_precision_test = precision_score(YTEST,kc_y_test,average='weighted')
kc_precision_test

===>Decision Tree Classifier

In [69]:
# decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# Fix the seed: the tree permutes features at every split, so an unseeded tree
# can differ from run to run. 0 matches the seed used for the logistic model.
dtc_clf = DecisionTreeClassifier(random_state=0)
dtc_clf


In [None]:
# Fit the decision tree on the training split.
dtc_clf.fit(XTRAIN,YTRAIN)

In [63]:
#Decision-tree predictions for the test split
dtc_y_test=dtc_clf.predict(XTEST)
dtc_y_test


In [64]:
# Decision-tree predictions for the training split.
dtc_y_train = dtc_clf.predict(XTRAIN)
dtc_y_train

In [65]:
# Confusion matrix of the decision tree on the test split.
from sklearn.metrics import confusion_matrix
dtc_cm_test = confusion_matrix(YTEST,dtc_y_test)
dtc_cm_test

In [66]:
# Test accuracy of the decision tree.
from sklearn.metrics import accuracy_score
dtc_accuracy_test=accuracy_score(YTEST,dtc_y_test)
dtc_accuracy_test

====>SVM

In [67]:
#Support vector classification with an RBF kernel
from sklearn.svm import SVC
sc=SVC(kernel='rbf')
sc_classifier = sc.fit(XTRAIN,YTRAIN) #model building; fit() returns the fitted estimator

In [71]:
# Display the fitted SVC and its parameters.
sc_classifier

In [72]:
#SVC predictions for the test split
svc_y_test=sc_classifier.predict(XTEST)
svc_y_test

In [73]:
# SVC predictions for the training split.
svc_y_train=sc_classifier.predict(XTRAIN)
svc_y_train

In [75]:
#Obtain SVC accuracy on both splits
from sklearn.metrics import accuracy_score
svc_accuracy_test= accuracy_score(YTEST,svc_y_test)
# accuracy4 holds the SVC accuracy on the training split.
accuracy4= accuracy_score(YTRAIN,svc_y_train)



In [76]:
# SVC accuracy on the test split.
svc_accuracy_test

In [77]:
# SVC accuracy on the training split.
accuracy4

In [79]:
#Build the SVC confusion matrix for the test split
from sklearn.metrics import confusion_matrix
svc_cm_test=confusion_matrix(YTEST,svc_y_test)
svc_cm_test




In [80]:
# SVC confusion matrix for the training split.
svc_cm_train=confusion_matrix(YTRAIN,svc_y_train)
svc_cm_train

In [83]:
#Plot comparing test-set accuracy scores of all models
model_names = ['logistic', 'kNN', 'SVM', 'DTC']
test_accuracies = [lr_accuracy_test, kc_accuracy_test, svc_accuracy_test, dtc_accuracy_test]

# Explicit Axes interface: labels and title are set on this figure's own axes,
# so the plot is self-describing and independent of pyplot's global state.
fig, ax = plt.subplots()
ax.plot(model_names, test_accuracies, 'o', color='r')
ax.set_xlabel('Model')
ax.set_ylabel('Test accuracy')
ax.set_title('Test accuracy by model')
ax.grid()
plt.show()