In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [2]:
# Read the customer records from the CSV file in the working directory
# and preview the first five rows.
orders = pd.read_csv(filepath_or_buffer='car_data.csv')
orders.head(n=5)

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased
0,385,Male,35,20000,0
1,681,Male,40,43500,0
2,353,Male,49,74000,0
3,895,Male,40,107500,1
4,661,Male,25,79000,0


This dataset contains details of 1000 prospective car buyers: their gender, age and annual salary, together with their purchase decision.

Purchase Decision (No = 0; Yes = 1)

In [6]:
# Tally how many rows fall into each Gender category.
orders.loc[:, 'Gender'].value_counts()

Female    516
Male      484
Name: Gender, dtype: int64

In [8]:
# Encode Gender as a dummy variable: Female -> '0', Male -> '1'.
# Series.replace with a dict swaps whole cell values in a single pass
# (no substring/regex matching as with .str.replace) and leaves values
# that are already encoded untouched, so the cell can be re-run safely.
# The codes stay strings here; a later cell casts the column to int64.
orders['Gender'] = orders['Gender'].replace({'Female': '0', 'Male': '1'})

In [13]:
# Convert the Gender column from string codes to int64.
# DataFrame.astype with a mapping only touches the named column.
orders = orders.astype({'Gender': 'int64'})

# LOGISTIC REGRESSION

In [15]:
# Preview the first five rows after the Gender encoding.
orders.head(n=5)

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased
0,385,1,35,20000,0
1,681,1,40,43500,0
2,353,1,49,74000,0
3,895,1,40,107500,1
4,661,1,25,79000,0


In [17]:
# Independent variables (x): Gender, Age, AnnualSalary.
# Dependent variable (y): Purchased, kept as a one-column DataFrame.
xs = ['Gender', 'Age', 'AnnualSalary']
x = orders[xs]
y = orders[['Purchased']]
print(x)
print(y)

     Gender  Age  AnnualSalary
0         1   35         20000
1         1   40         43500
2         1   49         74000
3         1   40        107500
4         1   25         79000
..      ...  ...           ...
995       1   38         59000
996       0   47         23500
997       0   28        138500
998       0   48        134000
999       0   44         73500

[1000 rows x 3 columns]
     Purchased
0            0
1            0
2            0
3            1
4            0
..         ...
995          0
996          0
997          1
998          1
999          0

[1000 rows x 1 columns]


In [18]:
# Hold out 20% of the rows as the test set; the fixed random_state
# makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=23,
)

In [19]:
# Create the classifier and train it on the three-feature training split.
model = LogisticRegression(random_state=0)
# scikit-learn expects a 1-D target of shape (n_samples,). y_train is a
# one-column DataFrame, which triggers a DataConversionWarning, so it is
# flattened with ravel() before fitting.
model.fit(x_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)


LogisticRegression(random_state=0)

In [20]:
# Predict the purchase decision for every row of the held-out test split.
y_pred = model.predict(X=x_test)


In [22]:
# Score the predictions: fraction of test rows classified correctly.
accuracy_level = accuracy_score(y_true=y_test, y_pred=y_pred)
print(
    'The accuracy of the model using all independent variables is :',
    accuracy_level*100,
    '%',
)

The accuracy of the model using all independent variables is : 61.5 %


# Using Gender to classify Purchased

In [23]:
# Single-feature setup: Gender is the only predictor, Purchased the target.
# Both are kept as one-column DataFrames.
x1 = orders[['Gender']]
y1 = orders[['Purchased']]

In [24]:
# Split the Gender-only data: 20% of the rows are held out for testing,
# with a fixed random_state for reproducibility.
x1_train, x1_test, y1_train, y1_test = train_test_split(
    x1,
    y1,
    test_size=0.20,
    random_state=23,
)

In [25]:
#creating and training the model
model1 = LogisticRegression(random_state = 0)
model1.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(random_state=0)

In [27]:
#Testing the model
y1_pred = model1.predict(x_test)

In [51]:
#Evaluating the model accuracy
accuracy_level1 = accuracy_score(y1_test,y1_pred)
print('The accuracy of the model using all independent variables is :',accuracy_level1*100,'%')

The accuracy of the model using all independent variables is : 61.5 %


# Using Age to classify Purchased

In [39]:
# Single-feature setup: Age is the only predictor, Purchased the target.
# Both are kept as one-column DataFrames.
x2 = orders[['Age']]
y2 = orders[['Purchased']]

In [40]:
# Split the Age-only data, holding out 30% of the rows for testing.
# NOTE(review): this split uses test_size=0.30 / random_state=13, whereas the
# all-features and Gender splits use 0.20 / 23, so the accuracies are measured
# on different test sets.
x2_train, x2_test, y2_train, y2_test = train_test_split(
    x2,
    y2,
    test_size=0.30,
    random_state=13,
)

In [41]:
# Create the classifier and train it on the Age-only training split.
model2 = LogisticRegression(random_state = 0)
# scikit-learn expects a 1-D target of shape (n_samples,). y2_train is a
# one-column DataFrame, which triggers a DataConversionWarning, so it is
# flattened with ravel() before fitting.
model2.fit(x2_train, y2_train.values.ravel())

  y = column_or_1d(y, warn=True)


LogisticRegression(random_state=0)

In [42]:
# Predict the purchase decision for the Age-only test split.
y2_pred = model2.predict(X=x2_test)

In [50]:
# Score the Age-only predictions: fraction of test rows classified correctly.
accuracy_level2 = accuracy_score(y_true=y2_test, y_pred=y2_pred)
print(
    'The accuracy of the model based on Age is :',
    accuracy_level2*100,
    '%',
)

The accuracy of the model based on Age is : 81.0 %


# Using Annual Salary to classify Purchased

In [44]:
# Single-feature setup: AnnualSalary is the only predictor, Purchased the target.
# Both are kept as one-column DataFrames.
x3 = orders[['AnnualSalary']]
y3 = orders[['Purchased']]

In [45]:
# Split the AnnualSalary-only data, holding out 30% of the rows for testing.
# NOTE(review): this split uses test_size=0.30 / random_state=13, whereas the
# all-features and Gender splits use 0.20 / 23, so the accuracies are measured
# on different test sets.
x3_train, x3_test, y3_train, y3_test = train_test_split(
    x3,
    y3,
    test_size=0.30,
    random_state=13,
)

In [46]:
# Create the classifier and train it on the AnnualSalary-only training split.
model3 = LogisticRegression(random_state = 0)
# scikit-learn expects a 1-D target of shape (n_samples,). y3_train is a
# one-column DataFrame, which triggers a DataConversionWarning, so it is
# flattened with ravel() before fitting.
model3.fit(x3_train, y3_train.values.ravel())

  y = column_or_1d(y, warn=True)


LogisticRegression(random_state=0)

In [48]:
# Predict the purchase decision for the AnnualSalary-only test split.
y3_pred = model3.predict(X=x3_test)

In [49]:
# Score the AnnualSalary-only predictions: fraction of test rows classified
# correctly. The label names Annual Salary, the feature this model was trained on.
accuracy_level3 = accuracy_score(y3_test,y3_pred)
print('The accuracy of the model based on Annual Salary is :',accuracy_level3*100,'%')

The accuracy of the model based on Age is : 35.0 %


Therefore, Age is the best single classifier of car purchases: its model accuracy (81%) is higher than that of both Annual Salary (35%) and Gender (61.5%).