## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

## Loading the dataset

In [2]:
df = pd.read_csv("Social_Network_Ads.csv")
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


## Define X by selecting only the age and EstimatedSalary, and y with purchased column

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X = df[["Age", "EstimatedSalary"]]
y = df["Purchased"]

## Print count of each label in Purchased column

In [6]:
df["Purchased"].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

## Print Correlation of each feature in the dataset

In [7]:
df.corr()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
User ID,1.0,-0.000721,0.071097,0.00712
Age,-0.000721,1.0,0.155238,0.622454
EstimatedSalary,0.071097,0.155238,1.0,0.362083
Purchased,0.00712,0.622454,0.362083,1.0


# First: Logistic Regression model

## Split the dataset into Training set and Test set with test_size = 0.25 and random_state = 0

In [8]:
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.25, random_state=0)

## Train the model with random_state = 0

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

logmodel = LogisticRegression()
logmodel

In [10]:
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)

## Print the prediction results

In [11]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

## Create dataframe with the Actual Purchased and Predict Purchased

In [20]:
df_result = pd.DataFrame(y_test)
df_result["Age"] = df["Age"]
df_result["EstimatedSalary"] = df["EstimatedSalary"]
df_result["Actual_Purchased"] = predictions
df_result = df_result[["Age", "EstimatedSalary", "Actual_Purchased", "Purchased"]]
df_result.rename(columns={"Purchased":"Predict_Purchased"})

Unnamed: 0,Age,EstimatedSalary,Actual_Purchased,Predict_Purchased
132,30,87000,0,0
309,38,50000,0,0
341,35,75000,0,0
196,30,79000,0,0
246,35,50000,0,0
...,...,...,...,...
146,27,96000,0,1
135,23,63000,0,0
390,48,33000,0,1
264,48,90000,0,1


## Print Confusion Matrix and classification_report

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print('\n')
print(confusion_matrix(y_test,predictions))


In [None]:
print(classification_report(y_test,predictions))

## Use StandardScaler() to improved performance and re-train your model

In [25]:
from sklearn.preprocessing import StandardScaler
logmodel = LogisticRegression()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [26]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[65  3]
 [ 8 24]]
              precision    recall  f1-score   support

           0       0.89      0.96      0.92        68
           1       0.89      0.75      0.81        32

    accuracy                           0.89       100
   macro avg       0.89      0.85      0.87       100
weighted avg       0.89      0.89      0.89       100



## Try to Predicting a new result - e.g: person with Age = 30 and Salary = 90,000

In [29]:
new_data = np.array([[30, 90000]])  
new_data_scaled = scaler.transform(new_data)


prediction = model.predict(new_data_scaled)

if prediction == 1:
    print("The person is likely to be in the target group.")
else:
    print("The person is unlikely to be in the target group.")

The person is unlikely to be in the target group.


## Try to Predicting a new result - e.g: person with Age = 40 and Salary = 90,000

In [28]:
new_data = np.array([[40, 90000]])  
new_data_scaled = scaler.transform(new_data)


prediction = model.predict(new_data_scaled)

if prediction == 1:
    print("The person is likely to be in the target group.")
else:
    print("The person is unlikely to be in the target group.")

The person is likely to be in the target group.
