## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

## Loading the dataset

In [2]:
df = pd.read_csv('Social_Network_Ads.csv')
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


## Define X by selecting only the age and EstimatedSalary, and y with purchased column

In [4]:
X = df[['Age', 'EstimatedSalary']]
y = df['Purchased']

## Print count of each label in Purchased column

In [5]:
df['Purchased'].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

## Print Correlation of each feature in the dataset

In [6]:
df.select_dtypes(include=['float64', 'int64']).corr()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
User ID,1.0,-0.000721,0.071097,0.00712
Age,-0.000721,1.0,0.155238,0.622454
EstimatedSalary,0.071097,0.155238,1.0,0.362083
Purchased,0.00712,0.622454,0.362083,1.0


# First: Logistic Regression model

## Split the dataset into Training set and Test set with test_size = 0.25 and random_state = 0

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## Train the model with random_state = 0

In [8]:
logmod = LogisticRegression(random_state = 0)

In [9]:
logmod.fit(X_train, y_train)

## Print the prediction results

In [10]:
predictions = logmod.predict(X_test)
print(predictions)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## Create dataframe with the Actual Purchased and Predict Purchased

In [11]:
result_df = pd.DataFrame(y_test)

In [12]:
result_df['Predictions'] = predictions
result_df

Unnamed: 0,Purchased,Predictions
132,0,0
309,0,0
341,0,0
196,0,0
246,0,0
...,...,...
146,1,0
135,0,0
390,1,0
264,1,0


## Print Confusion Matrix and classification_report

In [13]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.68      1.00      0.81        68
           1       0.00      0.00      0.00        32

    accuracy                           0.68       100
   macro avg       0.34      0.50      0.40       100
weighted avg       0.46      0.68      0.55       100



In [14]:
print(confusion_matrix(y_test,predictions))

[[68  0]
 [32  0]]


In [15]:
print(accuracy_score(y_test,predictions))

0.68


## Use StandardScaler() to improved performance and re-train your model

In [16]:
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [17]:
pca = PCA(n_components = 2)
X_train_pca = pca.fit_transform(X_train_sc)
X_test_pca = pca.transform(X_test_sc)

In [18]:
logreg_noPCA = LogisticRegression(max_iter=2085) # max_iter >> to allow the model reaches convergence
logreg = LogisticRegression()

In [19]:
%timeit logreg_noPCA.fit(X_train, y_train)

570 µs ± 10.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [20]:
%timeit logreg.fit(X_train_pca, y_train)

435 µs ± 2.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [21]:
print('Training accuracy:', logreg.score(X_train_pca, y_train))
print('Testing accuracy:', logreg.score(X_test_pca, y_test))

Training accuracy: 0.8233333333333334
Testing accuracy: 0.89


## Try to Predicting a new result - e.g: person with Age = 30 and Salary = 90,000

In [22]:
d1 = pd.DataFrame({ 'Age' : [30], 'EstimatedSalary' : [90000]})
d1_pred = logreg.predict(d1)
d1_pred[0]

1

## Try to Predicting a new result - e.g: person with Age = 40 and Salary = 90,000

In [23]:
d2 = pd.DataFrame({ 'Age' : [40], 'EstimatedSalary' : [90000]})
d2_pred = logreg.predict(d1)
d2_pred[0]

1