## A famous shopping mall company has hired you as a ML engineer . Your task is to create a  model that can predict whether the customer will purchase the product from the website or not based on customer's age and estimated salary.

In [1]:
import numpy as np
import pandas as pd

In [2]:
shoppingData = pd.read_csv('Social_Network_Ads.csv')

In [3]:
#Check for missing data
shoppingData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null int64
EstimatedSalary    400 non-null int64
Purchased          400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.7+ KB


In [4]:
#Check whether the dataset is balanced or not
shoppingData.Purchased.value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [5]:
#1. This use-case is a Binary Classification Use-case
#2. This dataset is an UNBALANCED dataset.

In [12]:
#Seperate my data is features and label
#Rule of Classification specific to SKLEARN
#Features ---> 2d
#Label ------> 1d

features = shoppingData.iloc[:,[2,3]].values
label = shoppingData.iloc[:,4].values

In [19]:
#Get the best random_state for generalized model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
for i in range(1,401):
    
    X_train,X_test,y_train,y_test = train_test_split(features,
                                                label,
                                                test_size=0.2,
                                                random_state=i)
    model = LogisticRegression()
    model.fit(X_train,y_train)
    
    train_score = model.score(X_train,y_train)
    test_score = model.score(X_test,y_test)
    
    if test_score > train_score:
        print("Testing {}  Training {}  Random State {}".format(test_score,train_score,i))

Testing 0.6875  Training 0.63125  Random State 3
Testing 0.8875  Training 0.834375  Random State 4
Testing 0.6625  Training 0.6375  Random State 5
Testing 0.675  Training 0.634375  Random State 7
Testing 0.675  Training 0.634375  Random State 8
Testing 0.65  Training 0.640625  Random State 10
Testing 0.6625  Training 0.6375  Random State 11
Testing 0.675  Training 0.634375  Random State 16
Testing 0.7  Training 0.628125  Random State 17
Testing 0.7  Training 0.628125  Random State 21
Testing 0.65  Training 0.640625  Random State 24
Testing 0.6625  Training 0.6375  Random State 25
Testing 0.875  Training 0.834375  Random State 26
Testing 0.675  Training 0.634375  Random State 27
Testing 0.7  Training 0.628125  Random State 28
Testing 0.6875  Training 0.63125  Random State 29
Testing 0.6875  Training 0.63125  Random State 31
Testing 0.6625  Training 0.6375  Random State 37
Testing 0.675  Training 0.640625  Random State 39
Testing 0.7  Training 0.628125  Random State 40
Testing 0.6625  Tr

Testing 0.6625  Training 0.6375  Random State 393
Testing 0.7625  Training 0.759375  Random State 396
Testing 0.7  Training 0.628125  Random State 397
Testing 0.7125  Training 0.63125  Random State 400


In [20]:
#Split the data as training set and testing set --- 80-20
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(features,
                                                label,
                                                test_size=0.2,
                                                random_state=4)

In [21]:
#Train the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
#Check Generalization of the model
print(model.score(X_train,y_train))
print(model.score(X_test,y_test))

0.834375
0.8875


In [24]:
#Print Confusion Matrix
#Create confusion  matrix of entire dataset to judge the quality of the model
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(label, model.predict(features))
cm

array([[238,  19],
       [ 43, 100]], dtype=int64)

In [25]:
#Since dataset is unbalanced
from sklearn.metrics import classification_report
cr = classification_report(label, model.predict(features))
print(cr)

             precision    recall  f1-score   support

          0       0.85      0.93      0.88       257
          1       0.84      0.70      0.76       143

avg / total       0.84      0.84      0.84       400

