In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
csv = "./Social_Network_Ads.csv"
dataset = pd.read_csv(csv)
df = pd.DataFrame(dataset)

In [11]:
df.size

2000

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


### Data preprocessing

In [13]:
df["Gender"].unique()

array(['Male', 'Female'], dtype=object)

In [14]:
# 0 -> Female
# 1 -> Male
df["Gender"].replace(to_replace=["Male", "Female"], value=[1, 0], inplace=True)


In [15]:
df.corr()["Purchased"].sort_values(ascending=False)

Purchased          1.000000
Age                0.622454
EstimatedSalary    0.362083
User ID            0.007120
Gender            -0.042469
Name: Purchased, dtype: float64

### Logistic Regression

In [16]:
from sklearn.model_selection import train_test_split

X = df.iloc[:,1:4].values
y = df.iloc[:,4].values

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [18]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_test_copy = X_test.copy()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [19]:
# Fitting Logistic Regression for the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

LogisticRegression(random_state=0)

### Predictions

In [20]:
#Predicting the Test set results
y_pred = classifier.predict(X_test)

In [21]:
predictions = pd.DataFrame(X_test_copy, columns=["Gender","Age", "EstimatedSalary"])
predictions["Purchased"] = y_test
predictions["predicted"] = y_pred
predictions.head(20)

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased,predicted
0,1,30,87000,0,0
1,0,38,50000,0,0
2,1,35,75000,0,0
3,0,30,79000,0,0
4,0,35,50000,0,0
5,1,27,20000,0,0
6,0,31,15000,0,0
7,1,36,144000,1,1
8,0,18,68000,0,0
9,1,47,43000,0,1


### Confusion Matrix

In [22]:
#Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Display confusion matrix results
output = pd.DataFrame(cm, columns=["Predicted Not Purchased", "Predicted Purchased"], index=["Actual Not Purchased", "Actual Purchased"])
output

Unnamed: 0,Predicted Not Purchased,Predicted Purchased
Actual Not Purchased,65,3
Actual Purchased,7,25


### Model Evaluation

In [23]:
TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
Accuracy = (TP + TN) / (TP + TN + FP + FN)
Precision = TP / (TP + FP)
Error_rate = 1 - Precision
Recall = TP / (TP + FN)
F1 = 2 * Precision * Recall / (Precision + Recall)

In [24]:
data = [["True Positive", TP],
        ["True Negative", TN],
        ["False Positive", FP],
        ["False Negative", FN],
        ["F1 Score", F1],
        ["Accuracy", Accuracy],
        ["Precision", Precision],
        ["Error Rate", Error_rate],
        ["Recall", Recall]]
table = pd.DataFrame(data, columns=["Metric", "Value"])
table


Unnamed: 0,Metric,Value
0,True Positive,25.0
1,True Negative,65.0
2,False Positive,3.0
3,False Negative,7.0
4,F1 Score,0.833333
5,Accuracy,0.9
6,Precision,0.892857
7,Error Rate,0.107143
8,Recall,0.78125
