In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
# Load the advertising dataset from the CSV file next to this notebook.
data = pd.read_csv(filepath_or_buffer="advertising.csv")

In [10]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


In [11]:
# Drop the free-text and timestamp columns, which are not used as model inputs.
# Reassign instead of using inplace=True so the step reads as an explicit
# "old frame -> new frame" transformation.
data = data.drop(columns=['Ad Topic Line', 'City', 'Country', 'Timestamp'])

In [12]:
# Re-inspect the frame after the text and timestamp columns were dropped.
data.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
0,68.95,35,61833.9,256.09,0,0
1,80.23,31,68441.85,193.77,1,0
2,69.47,26,59785.94,236.5,0,0
3,74.15,29,54806.18,245.89,1,0
4,68.37,35,73889.99,225.58,0,0


In [13]:
# Count the missing values in each remaining column.
data.isnull().sum()

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Male                        0
Clicked on Ad               0
dtype: int64

## Converting Categorical Features

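We need to convert categorical features to dummy variables using pandas; otherwise, our machine learning algorithm won't be able to take those features directly as inputs. Here the only categorical feature left is `Male`, which is already stored as 0/1, so the steps below simply expand it into explicit `Female` and `Male` indicator columns.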

In [14]:
# Inspect the Male column before it is expanded into indicator columns.
data['Male']

0      0
1      1
2      0
3      1
4      0
      ..
995    1
996    1
997    1
998    0
999    0
Name: Male, Length: 1000, dtype: int64

In [15]:
# One indicator column per distinct value of Male (0 and 1).
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 would otherwise
# return True/False columns, which no longer matches the output shown below.
dummy_variables = pd.get_dummies(data['Male'], dtype=int)

In [16]:
# Remove the original Male column; its indicator columns are attached next.
# Reassign rather than mutate in place (inplace=True is discouraged in pandas).
data = data.drop(columns='Male')

In [17]:
# Attach the indicator columns side by side with the remaining features.
data = pd.concat(
    objs=[data, dummy_variables],
    axis="columns",
)

In [18]:
# Give the indicator columns readable names: 0 -> Female, 1 -> Male.
# Reassign rather than mutate in place (inplace=True is discouraged in pandas).
data = data.rename(columns={0: 'Female', 1: 'Male'})

In [19]:
# Display the frame with the new Female/Male indicator columns.
data

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Clicked on Ad,Female,Male
0,68.95,35,61833.90,256.09,0,1,0
1,80.23,31,68441.85,193.77,0,0,1
2,69.47,26,59785.94,236.50,0,1,0
3,74.15,29,54806.18,245.89,0,0,1
4,68.37,35,73889.99,225.58,0,1,0
...,...,...,...,...,...,...,...
995,72.97,30,71384.57,208.58,1,0,1
996,51.30,45,67782.17,134.42,1,0,1
997,51.63,51,42415.72,120.37,1,0,1
998,55.55,19,41920.79,187.95,0,1,0


## Train Test Split

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Features are every column except the label; 30% of the rows are held out
# for testing, with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns='Clicked on Ad'),
    data['Clicked on Ad'],
    test_size=0.30,
    random_state=101,
)

## Training And Predicting

In [26]:
from sklearn.linear_model import LogisticRegression

# Build a logistic-regression classifier with default settings and fit it on
# the training split. fit() stays the last expression so the cell still shows
# the fitted estimator.
logmodel = LogisticRegression()
logmodel.fit(X=x_train, y=y_train)

LogisticRegression()

In [28]:
# Predict the label for every row of the held-out test split.
predictions = logmodel.predict(X=x_test)

## Evaluation

In [29]:
from sklearn.metrics import classification_report

In [30]:
# Print the per-class precision / recall / F1 table for the test predictions.
print(
    classification_report(
        y_true=y_test,
        y_pred=predictions,
    )
)

              precision    recall  f1-score   support

           0       0.91      0.95      0.93       157
           1       0.94      0.90      0.92       143

    accuracy                           0.93       300
   macro avg       0.93      0.93      0.93       300
weighted avg       0.93      0.93      0.93       300



## Saving the model

In [31]:
# Path of the file the fitted model is pickled to and reloaded from.
filename = "advertisement.sav"

In [35]:
# Serialize the fitted model to disk. The context manager guarantees the file
# is flushed and closed, rather than leaving the handle for the garbage
# collector to clean up.
with open(filename, 'wb') as model_file:
    pickle.dump(logmodel, model_file)


In [36]:
# Read the model back from disk; the context manager closes the handle.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook (or another trusted source) produced.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [38]:
# Score the reloaded model on the test split to confirm the round trip worked.
result = loaded_model.score(X=x_test, y=y_test)

In [39]:
# Show the score of the reloaded model on the test split.
print(result)

0.9266666666666666
