In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [3]:
DATA_FOLDER = '../data/advertising.csv'

data = pd.read_csv(DATA_FOLDER)

Converting Date Time to numerical

In [4]:
data['Timestamp'] = pd.to_datetime(data['Timestamp'])

In [5]:
data['month'] = data['Timestamp'].dt.month
data['day'] = data['Timestamp'].dt.day
data['hour'] = data['Timestamp'].dt.hour

In [6]:
data.sample(3)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,month,day,hour
887,52.17,44,57594.7,115.37,Optional modular throughput,New Julianberg,1,Equatorial Guinea,2016-02-24 06:17:18,1,2,24,6
352,78.19,30,62475.99,228.81,Triple-buffered needs-based Local Area Network,Mollyport,0,Libyan Arab Jamahiriya,2016-01-31 06:14:10,0,1,31,6
9,69.88,20,55642.32,183.82,Mandatory homogeneous architecture,Ramirezton,1,Ghana,2016-07-11 01:42:51,0,7,11,1


Converting categorical columns to numerical(City, Country, Ad Topic Line)

In [7]:
data['City'].nunique()

969

In [8]:
data['Ad Topic Line'].nunique()

1000

In [9]:
data['Country'].nunique()

237

In [10]:
data['Country']

0                     Tunisia
1                       Nauru
2                  San Marino
3                       Italy
4                     Iceland
                ...          
995                   Lebanon
996    Bosnia and Herzegovina
997                  Mongolia
998                 Guatemala
999                    Brazil
Name: Country, Length: 1000, dtype: object

Converting the country column to numerical(a value between 0-1)

In [11]:
le = LabelEncoder()
data['Country'] = le.fit_transform(data['Country'])

scaler = MinMaxScaler()
data['Country'] = scaler.fit_transform(data[['Country']])


In [12]:
data['Country']

0      0.911017
1      0.622881
2      0.779661
3      0.436441
4      0.406780
         ...   
995    0.491525
996    0.110169
997    0.593220
998    0.360169
999    0.118644
Name: Country, Length: 1000, dtype: float64

Similarly for City and even Ad Topic Line

In [13]:
data['City'] = le.fit_transform(data['City'])
data['City'] = scaler.fit_transform(data[['City']])

In [14]:
data['Ad Topic Line'] = le.fit_transform(data['Ad Topic Line'])
data['Ad Topic Line'] = scaler.fit_transform(data[['Ad Topic Line']])

In [15]:
data.sample(3)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,month,day,hour
27,51.95,52,58295.82,129.23,0.467467,0.804752,0,0.338983,2016-07-19 08:32:10,1,7,19,8
460,48.03,40,25598.75,134.6,0.512513,0.165289,1,0.550847,2016-06-02 22:16:08,1,6,2,22
934,78.41,33,55368.67,248.23,0.477477,0.248967,1,0.610169,2016-06-03 04:51:46,0,6,3,4


In [16]:
X = data.drop(columns=['Clicked on Ad', 'Timestamp'])
y = data['Clicked on Ad']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 11), (200, 11), (800,), (200,))

In [18]:
X_train.sample(2)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,month,day,hour
454,82.12,52,28679.93,201.15,0.323323,0.146694,1,0.326271,2,3,10
530,65.1,49,59457.52,118.1,0.927928,0.938017,1,0.135593,6,1,3


In [19]:
y_train

29     0
535    0
695    0
557    0
836    1
      ..
106    0
270    1
860    1
435    1
102    0
Name: Clicked on Ad, Length: 800, dtype: int64

In [20]:
rf = RandomForestClassifier(n_estimators=50)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.95

In [21]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
accuracy_score(y_test, y_pred)

0.935

In [22]:
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)

0.68

In [23]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy_score(y_test, y_pred)

0.91

Finding optimal hyperparameter using GridSearchCV

In [24]:
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [20, 50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf' : [1,2,4,8],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)


Fitting 5 folds for each of 600 candidates, totalling 3000 fits
Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 20}
Best cross-validation accuracy: 0.9675


  _data = np.array(data, dtype=dtype, copy=copy,


In [25]:
best_model_rf = grid_search.best_estimator_

In [26]:
y_pred = best_model_rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.94

In [27]:
gb = GradientBoostingClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200],             
    'learning_rate': [0.01, 0.1, 0.2],         
    'max_depth': [3, 5, 7],           
    'min_samples_split': [2, 5, 10],      
    'min_samples_leaf': [1, 2, 4],                         
}

grid_search_gb = GridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    scoring='accuracy',  
    cv=5,                
    n_jobs=-1,           
    verbose=1            
)

grid_search_gb.fit(X_train, y_train)

print("Best parameters for Gradient Boosting:", grid_search_gb.best_params_)
print("Best cross-validation accuracy:", grid_search_gb.best_score_)


Fitting 5 folds for each of 243 candidates, totalling 1215 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validation accuracy: 0.9625


In [28]:
best_model_gb = grid_search_gb.best_estimator_

In [29]:
y_pred = best_model_gb.predict(X_test)
accuracy_score(y_pred, y_test)

0.925