### 1. Initial Data Preparation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Loading the dataset

In [15]:
# Read the advertising dataset from a CSV file (relative path) into a DataFrame.
ad_data=pd.read_csv('advertising.csv')

##### Features considered are : Daily Time Spent on Site, Age, Area Income, City, Country, Daily Internet Usage, Male

##  Data Cleaning

### Removing unnecessary text columns : Ad Topic Line, Timestamp

In [16]:
# Drop the free-text 'Ad Topic Line' column, which is not used as a feature.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run is a no-op rather than a KeyError.
ad_data = ad_data.drop(columns='Ad Topic Line', errors='ignore')

In [17]:
# Drop the 'Timestamp' column, which is not used as a feature.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run is a no-op rather than a KeyError.
ad_data = ad_data.drop(columns='Timestamp', errors='ignore')

In [18]:
# Print the frame after the column drops to check which columns remain.
print(ad_data)

     Daily Time Spent on Site  Age  Area Income  Daily Internet Usage  \
0                       68.95   35     61833.90                256.09   
1                       80.23   31     68441.85                193.77   
2                       69.47   26     59785.94                236.50   
3                       74.15   29     54806.18                245.89   
4                       68.37   35     73889.99                225.58   
..                        ...  ...          ...                   ...   
995                     72.97   30     71384.57                208.58   
996                     51.30   45     67782.17                134.42   
997                     51.63   51     42415.72                120.37   
998                     55.55   19     41920.79                187.95   
999                     45.01   26     29875.80                178.35   

               City  Male                 Country  Clicked on Ad  
0       Wrightburgh     0                 Tunisia       

### 2. The features are all columns except the target variable, `Clicked on Ad`. The target indicates whether or not the ad was clicked, and it is predicted from the selected features.

In [19]:
# Separate features and target by column name rather than by position, so the
# split does not depend on 'Clicked on Ad' happening to be the last column.
# X: every column except the target; Y: the target column.
X = ad_data.drop(columns='Clicked on Ad').values
Y = ad_data['Clicked on Ad'].values

In [20]:
# Print the feature array X (still contains the raw City/Country strings at this point).
print(X)

[[68.95 35 61833.9 ... 'Wrightburgh' 0 'Tunisia']
 [80.23 31 68441.85 ... 'West Jodi' 1 'Nauru']
 [69.47 26 59785.94 ... 'Davidton' 0 'San Marino']
 ...
 [51.63 51 42415.72 ... 'South Jessica' 1 'Mongolia']
 [55.55 19 41920.79 ... 'West Steven' 0 'Guatemala']
 [45.01 26 29875.8 ... 'Ronniemouth' 0 'Brazil']]


In [21]:
# Print the target array Y.
print(Y)

[0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 1 0 1 1 0 0 1 0 1 0 1 1 1 0 0 0 1 1 1 0 1
 0 1 1 0 0 0 0 0 1 0 0 1 1 0 0 1 1 1 0 1 1 0 1 0 0 0 0 1 0 1 1 0 1 1 0 1 1
 1 0 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 0 0 0 0 0 0 1 1 0 1
 1 0 1 0 0 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 1 0 1 1 1 1 0 0 0 1 1 0 0 1 1 1
 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 1 1 1 0 0
 1 1 0 1 1 1 1 1 1 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 0 0
 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 1 1 1 1 0 1 0 1 1 0 0 1 0
 1 0 1 1 1 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1 0 0 0
 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 1 1 0 0 1 0 0 1
 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 1 1 1 0 1 1 0 1 0 1 0 0 0 0
 1 1 0 1 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 0 1
 1 1 1 1 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1
 1 0 1 0 1 0 1 1 0 0 1 0 1 0 1 0 1 1 0 1 0 1 1 1 1 0 1 0 0 0 1 0 0 1 1 1 0
 0 0 1 1 1 0 0 1 0 1 1 0 

### Encoding categorical variables: City and Country

In [9]:
# One-hot encode the two text columns; every other column is passed through unchanged.
# NOTE(review): the encoded frame is very wide (the later columns listing reports
# 1211 columns for 1000 rows) - consider whether City is a useful feature.
categorical_columns = ['City', 'Country']
one_hot_encoded_data = pd.get_dummies(ad_data, columns=categorical_columns)
print(one_hot_encoded_data)

     Daily Time Spent on Site  Age  Area Income  Daily Internet Usage  Male  \
0                       68.95   35     61833.90                256.09     0   
1                       80.23   31     68441.85                193.77     1   
2                       69.47   26     59785.94                236.50     0   
3                       74.15   29     54806.18                245.89     1   
4                       68.37   35     73889.99                225.58     0   
..                        ...  ...          ...                   ...   ...   
995                     72.97   30     71384.57                208.58     1   
996                     51.30   45     67782.17                134.42     1   
997                     51.63   51     42415.72                120.37     1   
998                     55.55   19     41920.79                187.95     0   
999                     45.01   26     29875.80                178.35     0   

     Clicked on Ad  City_Adamsbury  City_Adamside  

### Finding the target/dependent variable

In [10]:
# Take the target by column name instead of the positional index 5.
# A positional lookup silently returns a different column if the column order
# changes (or if this cell is re-run after the target has been dropped); a
# name-based lookup fails loudly in those cases.
Y = one_hot_encoded_data['Clicked on Ad'].values

In [11]:
# Print the target array taken from the encoded frame.
print(Y)

[0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 1 0 1 1 0 0 1 0 1 0 1 1 1 0 0 0 1 1 1 0 1
 0 1 1 0 0 0 0 0 1 0 0 1 1 0 0 1 1 1 0 1 1 0 1 0 0 0 0 1 0 1 1 0 1 1 0 1 1
 1 0 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 0 0 0 0 0 0 1 1 0 1
 1 0 1 0 0 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 1 0 1 1 1 1 0 0 0 1 1 0 0 1 1 1
 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 1 1 1 0 0
 1 1 0 1 1 1 1 1 1 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 0 0
 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 1 1 1 1 0 1 0 1 1 0 0 1 0
 1 0 1 1 1 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1 0 0 0
 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 1 1 0 0 1 0 0 1
 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 1 1 1 0 1 1 0 1 0 1 0 0 0 0
 1 1 0 1 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 0 1
 1 1 1 1 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1
 1 0 1 0 1 0 1 1 0 0 1 0 1 0 1 0 1 1 0 1 0 1 1 1 1 0 1 0 0 0 1 0 0 1 1 1 0
 0 0 1 1 1 0 0 1 0 1 1 0 

##### Removing the dependent variable from the dataset: it is already stored in Y, and it sits in the middle of the columns, so leaving it in would put the target among the model's input features

In [12]:
# Remove the target column so that only features remain in the encoded frame.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run is a no-op rather than a KeyError.
one_hot_encoded_data = one_hot_encoded_data.drop(columns='Clicked on Ad', errors='ignore')

##### Features data after performing one hot encoding

In [13]:
# Print the encoded frame, which now holds only feature columns.
print(one_hot_encoded_data)

     Daily Time Spent on Site  Age  Area Income  Daily Internet Usage  Male  \
0                       68.95   35     61833.90                256.09     0   
1                       80.23   31     68441.85                193.77     1   
2                       69.47   26     59785.94                236.50     0   
3                       74.15   29     54806.18                245.89     1   
4                       68.37   35     73889.99                225.58     0   
..                        ...  ...          ...                   ...   ...   
995                     72.97   30     71384.57                208.58     1   
996                     51.30   45     67782.17                134.42     1   
997                     51.63   51     42415.72                120.37     1   
998                     55.55   19     41920.79                187.95     0   
999                     45.01   26     29875.80                178.35     0   

     City_Adamsbury  City_Adamside  City_Adamsstad 

In [14]:
# Convert the encoded feature frame to a single float64 array.
# Forcing dtype=float avoids ending up with an object-dtype array when the dummy
# columns are boolean (the get_dummies default in pandas >= 2.0) alongside the
# numeric columns.
X = one_hot_encoded_data.to_numpy(dtype=float)

#### Splitting the data into training and testing sets

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

#### Data Preparation

##### Feature scaling the data to ensure that all the features are on the same scale

In [18]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column.
# The scaler is fitted on the training split only and then applied to the test
# split, so no statistics from the test data influence the transformation.
# The results are bound to the names directly instead of being written back
# through X_train[:, :] = ...: slice assignment keeps the original array's dtype,
# which would leave an object array as object (or truncate values in an integer array).
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#### Data after preprocessing

In [19]:
# Print the training features after scaling.
print(X_train)

[[ 0.81717752 -0.82446236  0.90391811 ... -0.05006262 -0.06135239
  -0.07088812]
 [-1.30623849 -0.93903069  0.61312416 ... -0.05006262 -0.06135239
  -0.07088812]
 [-0.63907569  0.20665263  1.28012459 ... -0.05006262 -0.06135239
  -0.07088812]
 ...
 [-0.48025717  1.58147262 -2.42822096 ... -0.05006262 -0.06135239
  -0.07088812]
 [-1.51132696  1.00863096 -1.07422552 ... -0.05006262 -0.06135239
  -0.07088812]
 [ 0.07685816  0.32122096  1.59188097 ... -0.05006262 -0.06135239
  -0.07088812]]


In [20]:
# Print the test features after applying the scaler fitted on the training split.
print(X_test)

[[-0.91294383  0.55035763  0.46171784 ... -0.05006262 -0.06135239
  -0.07088812]
 [ 1.08729404 -0.2516207   0.92977273 ... -0.05006262 -0.06135239
  -0.07088812]
 [ 1.00538371 -0.2516207   0.30294236 ... -0.05006262 -0.06135239
  -0.07088812]
 ...
 [ 0.70087735 -0.02248404  1.20850022 ... -0.05006262 -0.06135239
  -0.07088812]
 [-0.15824326  0.43578929  0.64008855 ... -0.05006262 -0.06135239
  -0.07088812]
 [ 1.21860068 -0.70989403  0.55498198 ... -0.05006262 -0.06135239
  -0.07088812]]


Features of X - Daily Time Spent on Site, Age, Area Income, City, Country, Daily Internet Usage, Male
Target variable - Clicked on Ad

By considering the above features, a target variable can be predicted i.e, clicked on ad.
It predicts whether the customer clicked on ad or not based on the features considered

In [21]:
# Show the column labels of the encoded frame that X was built from.
one_hot_encoded_data.columns

Index(['Daily Time Spent on Site', 'Age', 'Area Income',
       'Daily Internet Usage', 'Male', 'City_Adamsbury', 'City_Adamside',
       'City_Adamsstad', 'City_Alanview', 'City_Alexanderfurt',
       ...
       'Country_Uruguay', 'Country_Uzbekistan', 'Country_Vanuatu',
       'Country_Venezuela', 'Country_Vietnam', 'Country_Wallis and Futuna',
       'Country_Western Sahara', 'Country_Yemen', 'Country_Zambia',
       'Country_Zimbabwe'],
      dtype='object', length=1211)

In [22]:
# Print the true labels of the held-out test split.
print(y_test)

[1 0 0 0 0 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 0 1 0 1 1 1 0 1
 1 0 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0 1 0 1 1 0 0 0 1 0 1 1 0 0 1 1 0 1 1 1 0
 0 0 1 1 0 1 0 0 1 1 1 1 1 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 1 1 1
 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 1 0 0 1 1 1 0 1 0 1 1 1 0 1 0 0 1
 0 1 1 1 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1
 0 0 1 1 0 0 0 1 1 1 0 0 0 1 0]


###### Because the output variable is categorical, linear regression is not appropriate: linear regression predicts a continuous value.
###### Since the output here is categorical, logistic regression is used to predict whether the customer clicked on the ad or not.


#### Training and fitting a logistic regression model on the training set


### 3.Logistic Regression as the target is categorical

In [23]:
from sklearn.linear_model import LogisticRegression

In [32]:
# Create a logistic regression classifier with default constructor arguments
# and fit it on the training split.
logmodel=LogisticRegression()
logmodel.fit(X_train,y_train)

LogisticRegression()

### 6. Predicting values for the testing data

In [25]:
# Predict a class label for every row of the test split with the fitted model.
predictions=logmodel.predict(X_test)

In [26]:
# Print the predicted labels for the test split.
print(predictions)

[1 0 0 0 0 1 1 1 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 1 0 1
 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 0 1 0 0 0 1 1 0 1 1 1 0
 0 0 1 1 0 0 0 0 1 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 1 1 1
 1 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1
 0 1 1 1 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1
 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0]


#### Creating a classification report for the model

In [27]:
from sklearn.metrics import classification_report

In [28]:
# Build the text report comparing the true test labels with the predictions, then print it.
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.89      0.98      0.93       103
           1       0.98      0.87      0.92        97

    accuracy                           0.93       200
   macro avg       0.93      0.92      0.92       200
weighted avg       0.93      0.93      0.92       200



### 4.RESULTS - The accuracy obtained on the testing data is 93%, which is good but could still be improved.

### 5.The features and the data used can be varied to get better results, i.e., overfitting and underfitting must be balanced.

### 7. From this project milestone, I have learnt that if there are n features and a categorical target variable, 
###### then a model can be built with the logistic regression algorithm by giving it the labelled training data. 
###### The model learns the patterns present in the training data, and these patterns can be used to predict the values for the testing data.
