# Build a machine learning model to predict user will click the ad or not based on his experience and estimated salary for a given dataset.
https://drive.google.com/open?id=1I8KsCufEa47XvzrkxhntEWSy1Su0E0N


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
# StandardScaler is used to convert numerical values
# LabelEncoder is used to convert labels to numerical values
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Preprocessing

In [2]:
dataset = pd.read_csv("Social_Network_Ads.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null int64
EstimatedSalary    400 non-null int64
Purchased          400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.7+ KB


In [3]:
dataset.duplicated(keep='first').any()

False

In [4]:
dataset.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [5]:
dataset.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [112]:
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0


In [6]:
# loading LabelEncoder to convert labels to numerics
label_quality = LabelEncoder()
dataset['Gender']=label_quality.fit_transform(dataset['Gender'])

In [7]:
# spliting data into sets independent (X) and dependent (y) variables
X = dataset[['Gender','Age','EstimatedSalary']]
y = dataset['Purchased']
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
# Transforming datasets into useful content
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X = scaler.fit_transform(X)

In [14]:
# trainning creating model
classifier = LogisticRegression()
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [15]:
# predicting test file values
test_prediction = classifier.predict(X_test)
print("Accuracy Score:") # TP_TN/TP+NP+FP+FN
print(metrics.accuracy_score(y_test, test_prediction))

# Classification report is used to measure quality of predictions
# 'precision' is the total number of correctly classified positive examples by
# the total number of predicted positive examples
# 'recall' is the ratio of the total number of correctly classified positive
# examples divided to the total number of positive examples
# 'F1-score' 2*recall*precision/recall+precision
print("Classification Report:")
print(metrics.classification_report(y_test, test_prediction))

# Confusion matrix is a summary of prediction results
print("Confusion Score:")
print(metrics.confusion_matrix(y_test, test_prediction))

Accuracy Score:
0.275
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        58
           1       0.28      1.00      0.43        22

   micro avg       0.28      0.28      0.28        80
   macro avg       0.14      0.50      0.22        80
weighted avg       0.08      0.28      0.12        80

Confusion Score:
[[ 0 58]
 [ 0 22]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [16]:
# predicting training file
train_prediction = classifier.predict(X)
print("Accuracy Score:")
print(metrics.accuracy_score(y, train_prediction))
print("Classification Report:")
print(metrics.classification_report(y, train_prediction))
print("Confusion matrix: ")
print(metrics.confusion_matrix(y, train_prediction))

Accuracy Score:
0.8525
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.91      0.89       257
           1       0.83      0.74      0.78       143

   micro avg       0.85      0.85      0.85       400
   macro avg       0.85      0.83      0.84       400
weighted avg       0.85      0.85      0.85       400

Confusion matrix: 
[[235  22]
 [ 37 106]]
