In [1]:
# Import pandas & numpy
import pandas as pd
import numpy as np

# import classifier models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# data split module
from sklearn.model_selection import train_test_split

# import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# import plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the advertising data set from the CSV file (path is relative to the
# working directory) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='advertising.csv')
df.head(n=5)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(1000, 10)

In [4]:
# Input features: every column of df except the target ('Clicked on Ad') and
# the 'Ad Topic Line', 'City', 'Country' and 'Timestamp' columns.
X = df.drop(
    columns=['Clicked on Ad', 'Ad Topic Line', 'City', 'Country', 'Timestamp'],
)
X.head(n=5)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male
0,68.95,35,61833.9,256.09,0
1,80.23,31,68441.85,193.77,1
2,69.47,26,59785.94,236.5,0
3,74.15,29,54806.18,245.89,1
4,68.37,35,73889.99,225.58,0


In [5]:
# Target labels: the 'Clicked on Ad' column of df
y = df.loc[:, 'Clicked on Ad']
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: Clicked on Ad, dtype: int64

In [6]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [7]:
def classif_metrics(y_act, y_pred):
    """Score a set of predictions against the actual labels.

    Parameters
    ----------
    y_act
        Actual class labels.
    y_pred
        Predicted class labels, aligned with ``y_act``.

    Returns
    -------
    list
        Accuracy, recall, precision and F1-score, in that order, each
        rounded to three decimal places.
    """
    # The order of the scorers fixes the order of the returned values.
    scorers = (accuracy_score, recall_score, precision_score, f1_score)
    return [round(scorer(y_act, y_pred), 3) for scorer in scorers]

In [8]:
def lrModel(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and tabulate its train/test metrics.

    Parameters
    ----------
    X_train, y_train
        Features and labels the model is fitted on.
    X_test, y_test
        Features and labels used only for scoring.

    Returns
    -------
    pandas.DataFrame
        Rows 'Accuracy', 'Recall', 'Precision' and 'F1-Score'; columns
        'LR_train' and 'LR_test' holding the values from ``classif_metrics``.
    """
    # Default-configured estimator, fitted on the training split only
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    # Score the fitted model on the data it saw and on the held-out data
    train_scores = classif_metrics(y_train, classifier.predict(X_train))
    test_scores = classif_metrics(y_test, classifier.predict(X_test))

    # One row per metric, one column per split
    summary = pd.DataFrame(index=['Accuracy', 'Recall', 'Precision', 'F1-Score'])
    summary['LR_train'] = train_scores
    summary['LR_test'] = test_scores
    return summary

In [9]:
def rfModel(X_train, X_test, y_train, y_test, random_state=3):
    """Fit a random forest classifier and tabulate its train/test metrics.

    Parameters
    ----------
    X_train, y_train
        Features and labels the model is fitted on.
    X_test, y_test
        Features and labels used only for scoring.
    random_state : int or None, default 3
        Seed forwarded to ``RandomForestClassifier`` so that repeated runs
        give the same forest and therefore the same metrics. Pass ``None``
        to get an unseeded (run-to-run varying) forest.

    Returns
    -------
    pandas.DataFrame
        Rows 'Accuracy', 'Recall', 'Precision' and 'F1-Score'; columns
        'RF_train' and 'RF_test' holding the values from ``classif_metrics``.
    """
    # instantiate the model; seeding it keeps the reported metrics reproducible
    rf = RandomForestClassifier(random_state=random_state)
    # fit the model on train data set
    rf.fit(X_train, y_train)
    # predict the target variable on train data set
    y_train_pred = rf.predict(X_train)
    # predict the target variable on test data set
    y_test_pred = rf.predict(X_test)
    # Performance metrics of the Random Forest model
    train_dataset_metrics = classif_metrics(y_train, y_train_pred)
    test_dataset_metrics = classif_metrics(y_test, y_test_pred)
    # collect the metrics: one row per metric, one column per split
    metrics_df = pd.DataFrame(index = ['Accuracy', 'Recall', 'Precision', 'F1-Score'])
    metrics_df['RF_train'] = train_dataset_metrics
    metrics_df['RF_test'] = test_dataset_metrics
    return metrics_df

In [10]:
def dtModel(X_train, X_test, y_train, y_test, random_state=3):
    """Fit a decision tree classifier and tabulate its train/test metrics.

    Parameters
    ----------
    X_train, y_train
        Features and labels the model is fitted on.
    X_test, y_test
        Features and labels used only for scoring.
    random_state : int or None, default 3
        Seed forwarded to ``DecisionTreeClassifier`` so that repeated runs
        build the same tree and therefore report the same metrics. Pass
        ``None`` to get an unseeded tree.

    Returns
    -------
    pandas.DataFrame
        Rows 'Accuracy', 'Recall', 'Precision' and 'F1-Score'; columns
        'DT_train' and 'DT_test' holding the values from ``classif_metrics``.
    """
    # instantiate the model; seeding it keeps the reported metrics reproducible
    dt = DecisionTreeClassifier(random_state=random_state)
    # fit the model on train data set
    dt.fit(X_train, y_train)
    # predict the target variable on train data set
    y_train_pred = dt.predict(X_train)
    # predict the target variable on test data set
    y_test_pred = dt.predict(X_test)
    # Performance metrics of the Decision Tree model
    train_dataset_metrics = classif_metrics(y_train, y_train_pred)
    test_dataset_metrics = classif_metrics(y_test, y_test_pred)
    # collect the metrics: one row per metric, one column per split
    metrics_df = pd.DataFrame(index = ['Accuracy', 'Recall', 'Precision', 'F1-Score'])
    metrics_df['DT_train'] = train_dataset_metrics
    metrics_df['DT_test'] = test_dataset_metrics
    return metrics_df

In [11]:
# Run the three models in turn (logistic regression, random forest, decision
# tree) and place their metric columns side by side, joined on the metric name.
res_df = (
    lrModel(X_train, X_test, y_train, y_test)
    .join(rfModel(X_train, X_test, y_train, y_test))
    .join(dtModel(X_train, X_test, y_train, y_test))
)
res_df



Unnamed: 0,LR_train,LR_test,RF_train,RF_test,DT_train,DT_test
Accuracy,0.908,0.88,0.995,0.97,1.0,0.945
Recall,0.881,0.816,0.993,0.959,1.0,0.939
Precision,0.932,0.93,0.998,0.979,1.0,0.948
F1-Score,0.905,0.87,0.995,0.969,1.0,0.944


In [12]:
# Write the combined metrics table to an Excel workbook (path is relative to
# the working directory).
# NOTE(review): 'clidk' in the file name looks like a typo for 'click' —
# confirm before renaming, since anything that reads this file depends on
# the exact name.
res_df.to_excel('clidk_on_ad_classification_results.xlsx')