# Use-case - You are being hired as an ML engineer. Your role is to create a model that can predict whether a customer will purchase a product or not, based on his/her age and estimated salary.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the dataset from a CSV file; the relative path is resolved against
# the current working directory.
data = pd.read_csv('Social_Network_Ads.csv')

In [3]:
# Print the frame's index range, column names, non-null counts and dtypes
# (used here to confirm the data is complete before modelling).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null int64
EstimatedSalary    400 non-null int64
Purchased          400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.7+ KB


In [11]:
# Class-balance check: count how many rows fall into each Purchased value.
data['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [4]:
# Rules for Classification when it comes to Sklearn
# 1. Data must be complete
# 2. Data must be strictly numeric
# 3. Data must be represented in the form of numpy array
# 4. Features must be a 2d array, label must be 1d array

In [5]:
# Separate the data into features and label.
# Columns are selected by name rather than by position so the selection keeps
# working (or fails loudly with a KeyError) if the CSV column order changes.
features = data[['Age', 'EstimatedSalary']].values  # 2-D array, one row per user
label = data['Purchased'].values                    # 1-D array of 0/1 outcomes

# Sklearn needs 2-D features and a 1-D label with the same number of rows.
assert features.ndim == 2 and label.ndim == 1
assert features.shape[0] == label.shape[0]

In [16]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=4
)

In [17]:
# Create a logistic-regression classifier with the library's default settings
# and fit it on the training split. The fit call is the last expression so the
# fitted estimator's repr is displayed as the cell output.
# NOTE(review): Age and EstimatedSalary are passed in unscaled. The seed search
# later in this notebook shows accuracy swinging between roughly 0.63 and 0.89
# depending on the split, which may point to the solver struggling with the raw
# feature scales - consider adding a scaling step; confirm by experiment.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
# Model quality check:
#   1. Generalisation - compare accuracy on the training split with the test split.
#   2. Absolute score - decide whether the accuracy is satisfactory.
# Training accuracy is printed first, test accuracy second.
print(model.score(X_train, y_train), model.score(X_test, y_test), sep='\n')

0.790625
0.8875


In [19]:
#Question
# 1. Threshold ---- 70%
# 2. To get the list of good customers for voucher distribution
#    1 ===> 0 (not tolerable)
#   

In [20]:
from sklearn.metrics import confusion_matrix

# Confusion matrix over every row of the dataset
# (rows = actual class, columns = predicted class).
# NOTE(review): `features` contains the rows the model was trained on, so these
# counts are not a pure held-out estimate - confirm this is intended.
predicted_all = model.predict(features)
cm = confusion_matrix(y_true=label, y_pred=predicted_all)
cm

array([[247,  10],
       [ 66,  77]])

In [21]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the model's predictions over the
# full dataset (training rows included).
full_report = classification_report(label, model.predict(features))
print(full_report)

              precision    recall  f1-score   support

           0       0.79      0.96      0.87       257
           1       0.89      0.54      0.67       143

   micro avg       0.81      0.81      0.81       400
   macro avg       0.84      0.75      0.77       400
weighted avg       0.82      0.81      0.80       400



In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import warnings

# Seed search for logistic regression: build an 80/20 split for every seed in
# 1..199, fit a default model, and print each seed whose test accuracy is
# higher than its training accuracy.
# NOTE(review): picking a split because it scores well on the test set makes
# the reported test accuracy optimistic - consider cross-validation instead.
#
# Warnings are silenced only while the search runs (instead of for the rest of
# the session), and the loop uses its own variable names so it does not
# overwrite the notebook-level `model`, `X_train`, `X_test`, `y_train`, `y_test`.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for seed in range(1, 200):
        lr_X_train, lr_X_test, lr_y_train, lr_y_test = train_test_split(
            features, label, test_size=0.2, random_state=seed
        )

        lr_candidate = LogisticRegression()
        lr_candidate.fit(lr_X_train, lr_y_train)

        lr_train_score = lr_candidate.score(lr_X_train, lr_y_train)
        lr_test_score = lr_candidate.score(lr_X_test, lr_y_test)

        if lr_test_score > lr_train_score:
            print('Test {} Train {} Seed {}'.format(lr_test_score, lr_train_score, seed))

Test 0.6875 Train 0.63125 Seed 3
Test 0.8875 Train 0.790625 Seed 4
Test 0.6625 Train 0.6375 Seed 5
Test 0.675 Train 0.634375 Seed 7
Test 0.675 Train 0.634375 Seed 8
Test 0.65 Train 0.640625 Seed 10
Test 0.6625 Train 0.6375 Seed 11
Test 0.675 Train 0.634375 Seed 16
Test 0.7 Train 0.628125 Seed 17
Test 0.7 Train 0.628125 Seed 21
Test 0.65 Train 0.640625 Seed 24
Test 0.6625 Train 0.6375 Seed 25
Test 0.875 Train 0.834375 Seed 26
Test 0.675 Train 0.634375 Seed 27
Test 0.7 Train 0.628125 Seed 28
Test 0.6875 Train 0.63125 Seed 29
Test 0.6875 Train 0.63125 Seed 31
Test 0.6625 Train 0.6375 Seed 37
Test 0.675 Train 0.640625 Seed 39
Test 0.7 Train 0.628125 Seed 40
Test 0.6625 Train 0.64375 Seed 42
Test 0.85 Train 0.790625 Seed 46
Test 0.65 Train 0.640625 Seed 48
Test 0.675 Train 0.634375 Seed 50
Test 0.65 Train 0.640625 Seed 51
Test 0.65 Train 0.640625 Seed 54
Test 0.65 Train 0.64375 Seed 56
Test 0.6625 Train 0.6375 Seed 58
Test 0.6875 Train 0.6375 Seed 59
Test 0.7 Train 0.628125 Seed 60
Test 0.6

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import warnings

# Seed search for k-nearest-neighbours: build an 80/20 split for every seed in
# 150..199, fit a default classifier, and print each seed whose test accuracy
# is higher than its training accuracy.
# NOTE(review): picking a split because it scores well on the test set makes
# the reported test accuracy optimistic - consider cross-validation instead.
#
# Warnings are silenced only while the search runs (instead of for the rest of
# the session), and the loop uses its own variable names so it does not
# overwrite the notebook-level `model`, `X_train`, `X_test`, `y_train`, `y_test`.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for seed in range(150, 200):
        knn_X_train, knn_X_test, knn_y_train, knn_y_test = train_test_split(
            features, label, test_size=0.2, random_state=seed
        )

        knn_candidate = KNeighborsClassifier()
        knn_candidate.fit(knn_X_train, knn_y_train)

        knn_train_score = knn_candidate.score(knn_X_train, knn_y_train)
        knn_test_score = knn_candidate.score(knn_X_test, knn_y_test)

        if knn_test_score > knn_train_score:
            print('Test {} Train {} Seed {}'.format(knn_test_score, knn_train_score, seed))

Test 0.8625 Train 0.846875 Seed 161
Test 0.8625 Train 0.859375 Seed 170
Test 0.8625 Train 0.853125 Seed 179
Test 0.9125 Train 0.86875 Seed 192


In [26]:
# Rebuild the 80/20 split with seed 192 - the seed with the highest test
# accuracy among those printed by the KNN seed search - and fit the KNN model
# that the following cells evaluate.
# NOTE(review): the seed was chosen by looking at test accuracy, so the test
# score printed here is likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=192
)

model1 = KNeighborsClassifier()
model1.fit(X_train, y_train)

train_score = model1.score(X_train, y_train)
test_score = model1.score(X_test, y_test)

# Training accuracy first, test accuracy second.
print(train_score, test_score, sep='\n')


0.86875
0.9125


In [27]:
# Confusion matrix for the KNN model over every row of the dataset
# (rows = actual class, columns = predicted class).
# NOTE(review): `features` contains the rows model1 was trained on - confirm
# that evaluating on the full dataset is intended.
knn_predicted_all = model1.predict(features)
confusion_matrix(y_true=label, y_pred=knn_predicted_all)

array([[242,  15],
       [ 34, 109]])

In [28]:
# Per-class precision / recall / F1 for the KNN model over the full dataset
# (training rows included).
knn_report = classification_report(label, model1.predict(features))
print(knn_report)

              precision    recall  f1-score   support

           0       0.88      0.94      0.91       257
           1       0.88      0.76      0.82       143

   micro avg       0.88      0.88      0.88       400
   macro avg       0.88      0.85      0.86       400
weighted avg       0.88      0.88      0.88       400

