In [688]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
%matplotlib inline
import numpy as np

from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import StratifiedShuffleSplit

# Data Loading

In [689]:
# Load the raw advertising data and discard the 'City' column in one step.
df = pd.read_csv('advertising.csv').drop(columns='City')
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,0,Iceland,2016-06-03 03:36:18,0


# Data Analysis

In [690]:
# Schema overview: dtype and non-null count for every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
Daily Time Spent on Site    1000 non-null float64
Age                         1000 non-null int64
Area Income                 1000 non-null float64
Daily Internet Usage        1000 non-null float64
Ad Topic Line               1000 non-null object
Male                        1000 non-null int64
Country                     1000 non-null object
Timestamp                   1000 non-null object
Clicked on Ad               1000 non-null int64
dtypes: float64(3), int64(3), object(3)
memory usage: 70.4+ KB


In [691]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,65.0002,36.009,55000.00008,180.0001,0.481,0.5
std,15.853615,8.785562,13414.634022,43.902339,0.499889,0.50025
min,32.6,19.0,13996.5,104.78,0.0,0.0
25%,51.36,29.0,47031.8025,138.83,0.0,0.0
50%,68.215,35.0,57012.3,183.13,0.0,0.5
75%,78.5475,42.0,65470.635,218.7925,1.0,1.0
max,91.43,61.0,79484.8,269.96,1.0,1.0


In [692]:
# Number of distinct values in each column.
df.nunique()

Daily Time Spent on Site     900
Age                           43
Area Income                 1000
Daily Internet Usage         966
Ad Topic Line               1000
Male                           2
Country                      237
Timestamp                   1000
Clicked on Ad                  2
dtype: int64

# Pipelines

In [693]:
# Output column names per feature group. Each list starts with the raw input
# column; the transformer that receives the list (DateConverter,
# MyLabelBinarizer, AdTopicLineLengthAddition) appends, in place, the names of
# the columns it generates. The lists are concatenated later to label the
# transformed data.
date_cols = ["Timestamp"]
cat_cols = ['Country']
ad_line_cols = ['Ad Topic Line']
# Name of the target column.
label_col = ['Clicked on Ad']

In [694]:
class DataFrameSelector(TransformerMixin, BaseEstimator):
    """Pipeline step that extracts columns from a DataFrame as a NumPy array.

    Parameters
    ----------
    arr : list of column labels, or any other valid ``DataFrame[...]`` key
        The columns to select.

    Notes
    -----
    When ``arr`` is a list, ``fit`` stores a snapshot of it in ``columns_``
    and ``transform`` selects with that snapshot. The caller may pass the
    same list object to another step that appends generated column names to
    it; without the snapshot, a later ``transform`` would ask the input frame
    for columns it does not have and raise ``KeyError``.
    """

    def __init__(self, arr):
        self.arr = arr

    def fit(self, X, y=None):
        """Freeze the column selection; nothing is learned from ``X``."""
        # Only lists are copied: other key types are passed through untouched
        # so that e.g. a single string label is not split into characters.
        self.columns_ = list(self.arr) if isinstance(self.arr, list) else self.arr
        return self

    def transform(self, X, y=None):
        """Return ``X[columns].to_numpy()`` for the selected columns."""
        # Fall back to the live parameter when transform() is called on an
        # unfitted instance, which the original implementation allowed.
        columns = getattr(self, "columns_", self.arr)
        return X[columns].to_numpy()

In [695]:
class MyLabelBinarizer(TransformerMixin, BaseEstimator):
    """One-hot encode every column of a 2-D array of categorical values.

    The categories of each input column are learned in ``fit`` and reused by
    ``transform``, so any later data set is encoded with the same indicator
    columns, in the same order, as the data the transformer was fitted on.
    A value that was not seen during ``fit`` encodes as an all-zero row.

    Every category gets its own indicator column (also when a column has
    only one or two categories), so the number of generated columns always
    equals the number of generated names.

    The output keeps the input columns and appends the indicator columns to
    their right, input column by input column.

    Parameters
    ----------
    cat_cols : list
        Caller-owned list of output column names. ``fit`` appends the learned
        category names to it in place. Names appended by an earlier ``fit``
        of the same instance are replaced rather than duplicated.

    Attributes
    ----------
    categories_ : list of ndarray
        Sorted unique values of each input column.
    feature_names_ : list
        Names of the generated indicator columns, in output order.
    """

    def __init__(self, cat_cols):
        self.cat_cols = cat_cols

    @staticmethod
    def _as_2d(X):
        """Return ``X`` as a 2-D array, rejecting any other dimensionality."""
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                "expected a 2-D array of shape (n_rows, n_columns), got %d-D" % X.ndim
            )
        return X

    def _publish_feature_names(self):
        """Append the generated names to the shared list exactly once."""
        previous = getattr(self, "_published_names", None)
        # Remove what this instance appended on a previous fit, but only if
        # it is still sitting at the end of the list (i.e. nobody else has
        # modified the list in between).
        if previous and self.cat_cols[-len(previous):] == previous:
            del self.cat_cols[-len(previous):]
        self.cat_cols.extend(self.feature_names_)
        self._published_names = list(self.feature_names_)

    def fit(self, X, y=None):
        """Learn the categories of every column of ``X``."""
        X = self._as_2d(X)
        self.categories_ = [np.unique(X[:, j]) for j in range(X.shape[1])]
        self.feature_names_ = [
            name for categories in self.categories_ for name in categories
        ]
        self._publish_feature_names()
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the indicator columns appended on the right."""
        # Calling transform() on an unfitted instance used to work (it fitted
        # on the fly), so keep that path instead of raising.
        if not hasattr(self, "categories_"):
            self.fit(X)

        X = self._as_2d(X)
        if X.shape[1] != len(self.categories_):
            raise ValueError(
                "X has %d columns, but the transformer was fitted on %d"
                % (X.shape[1], len(self.categories_))
            )

        # (n_rows, 1) == (n_categories,) broadcasts to one indicator column
        # per learned category.
        indicator_blocks = [
            (X[:, [j]] == categories).astype(int)
            for j, categories in enumerate(self.categories_)
        ]
        return np.concatenate([X] + indicator_blocks, axis=1)

In [696]:
class DateConverter(TransformerMixin, BaseEstimator):
    """Parse a timestamp column and add time-of-day indicator columns.

    The first column of the input is parsed with ``date_str_format`` and
    replaced by the resulting ``datetime`` objects. Four indicator columns
    are appended, always in the order given by ``TIME_OF_DAY_LABELS``:

    * ``night``     - hours 21-23 and 0-4
    * ``morning``   - hours 5-11
    * ``afternoon`` - hours 12-16
    * ``evening``   - hours 17-20

    All four columns are emitted for every batch, including batches in which
    some time of day never occurs, so the output layout does not depend on
    the data being transformed.

    Parameters
    ----------
    date_cols : list
        Caller-owned list of output column names. The four indicator names
        are appended to it in place, once; names already present are not
        appended again.
    date_str_format : str, default "%Y-%m-%d %H:%M:%S"
        ``datetime.strptime`` format of the timestamp strings.
    """

    # Alphabetical order, which is the order the indicator columns had when
    # they were produced by fitting a LabelBinarizer on all four labels.
    TIME_OF_DAY_LABELS = ("afternoon", "evening", "morning", "night")

    def __init__(self, date_cols, date_str_format="%Y-%m-%d %H:%M:%S"):
        self.date_str_format = date_str_format
        self.date_cols = date_cols

    @staticmethod
    def _time_of_day(hour):
        """Map an hour (0-23) to its time-of-day label."""
        if hour > 20 or hour < 5:
            return "night"
        if hour < 12:
            return "morning"
        if hour < 17:
            return "afternoon"
        return "evening"

    def _publish_feature_names(self):
        """Append the indicator names to the shared list if missing."""
        for label in self.TIME_OF_DAY_LABELS:
            if label not in self.date_cols:
                self.date_cols.append(label)

    def fit(self, X, y=None):
        """Nothing is learned; only registers the generated column names."""
        self._publish_feature_names()
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with parsed timestamps and indicators."""
        # Also registered here so that transform() without fit() still
        # labels the output, as it did before.
        self._publish_feature_names()

        # Object-dtype copy: the caller's array is left untouched, and the
        # cells can hold datetime objects even if X was a string array.
        converted = np.array(X, dtype=object)
        if converted.ndim != 2:
            raise ValueError(
                "expected a 2-D array of shape (n_rows, n_columns), got %d-D"
                % converted.ndim
            )

        indicator_rows = []
        for row_index in range(converted.shape[0]):
            parsed = datetime.strptime(converted[row_index, 0], self.date_str_format)
            converted[row_index, 0] = parsed
            label = self._time_of_day(parsed.hour)
            indicator_rows.append(
                [int(label == name) for name in self.TIME_OF_DAY_LABELS]
            )

        # reshape keeps the (0, 4) layout when X has no rows.
        indicators = np.array(indicator_rows, dtype=int).reshape(
            converted.shape[0], len(self.TIME_OF_DAY_LABELS)
        )
        return np.concatenate([converted, indicators], axis=1)

In [697]:
class AdTopicLineLengthAddition(TransformerMixin, BaseEstimator):
    """Add standardised character-count and word-count features for a text column.

    For the text in the first column of the input, two features are computed:
    the number of characters, and the number of words (counted as the number
    of spaces plus one). Both are standardised as ``(value - mean) / std``
    with the mean and population standard deviation learned in ``fit``, so
    that later data sets are scaled with the statistics of the fitted data
    rather than their own.

    A feature whose standard deviation is zero (for example a single row, or
    all texts equally long) is scaled by 1 instead, which yields zeros rather
    than NaN or infinity.

    Parameters
    ----------
    ad_line_cols : list
        Caller-owned list of output column names. ``'Ad Line Length'`` and
        ``'Ad Line Word Length'`` are appended to it in place, once; names
        already present are not appended again.

    Attributes
    ----------
    length_mean_, length_std_ : float
        Statistics of the character counts seen in ``fit``.
    word_mean_, word_std_ : float
        Statistics of the word counts seen in ``fit``.
    """

    FEATURE_NAMES = ('Ad Line Length', 'Ad Line Word Length')

    def __init__(self, ad_line_cols):
        self.ad_line_cols = ad_line_cols

    @staticmethod
    def _raw_counts(X):
        """Return (character counts, word counts) for the first column of X."""
        texts = [row[0] for row in X]
        char_counts = np.array([len(text) for text in texts], dtype=float)
        # Words are delimited by single spaces: count the spaces, plus one.
        word_counts = np.array([text.count(' ') + 1 for text in texts], dtype=float)
        return char_counts, word_counts

    @staticmethod
    def _mean_and_scale(values):
        """Return the mean and a non-zero scale for standardising ``values``."""
        if values.size == 0:
            return 0.0, 1.0
        std = values.std()
        return values.mean(), (std if std > 0 else 1.0)

    def _publish_feature_names(self):
        """Append the generated names to the shared list if missing."""
        for name in self.FEATURE_NAMES:
            if name not in self.ad_line_cols:
                self.ad_line_cols.append(name)

    def fit(self, X, y=None):
        """Learn the mean and standard deviation of both count features."""
        char_counts, word_counts = self._raw_counts(X)
        self.length_mean_, self.length_std_ = self._mean_and_scale(char_counts)
        self.word_mean_, self.word_std_ = self._mean_and_scale(word_counts)
        self._publish_feature_names()
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the two standardised features appended."""
        # Calling transform() on an unfitted instance used to work (it used
        # the batch's own statistics), so keep that path instead of raising.
        if not hasattr(self, "length_mean_"):
            self.fit(X)

        char_counts, word_counts = self._raw_counts(X)
        scaled_length = (char_counts - self.length_mean_) / self.length_std_
        scaled_words = (word_counts - self.word_mean_) / self.word_std_

        return np.c_[X, scaled_length, scaled_words]

In [698]:
# Partition the column names by dtype: object columns are treated as
# categorical/text, everything else as numeric.
cats = df.select_dtypes(include=['object']).columns.tolist()
nums = df.select_dtypes(exclude=['object']).columns.tolist()

In [699]:
# Filtering (instead of list.remove) keeps this cell re-runnable: remove()
# raises ValueError once the name is already gone.

# 'Ad Topic Line' and 'Timestamp' are handled by their own pipelines.
cats = [col for col in cats if col not in ('Ad Topic Line', 'Timestamp')]

# 'Clicked on Ad' is the label, not a feature.
# NOTE(review): 'Male' is excluded here and no pipeline defined in this
# notebook section selects it, so it is absent from the transformed data.
# Confirm that this is intended.
nums = [col for col in nums if col not in ('Clicked on Ad', 'Male')]

print(cats)
print(nums)

['Country']
['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']


In [700]:
# Numeric features: select the columns listed in `nums`, then standardise them.
num_pipeline = Pipeline([
    ('data_selector', DataFrameSelector(nums)), 
    ('standard_scaler', StandardScaler())
])

In [701]:
# Categorical features: select the columns listed in `cats`, then one-hot
# encode them. MyLabelBinarizer appends the generated category names to
# `cat_cols`, which is a different list object from `cats`.
cat_pipeline = Pipeline([
    ('data_selector', DataFrameSelector(cats)), 
    ('label_binarizer', MyLabelBinarizer(cat_cols))
])

In [702]:
# Timestamp features: parse the timestamp and add time-of-day indicators.
# The selector gets its own copy of the column list. DateConverter appends
# the generated column names to `date_cols` in place; a selector sharing that
# list would, on any later call, try to select those names from the input
# frame and raise KeyError.
date_pipeline = Pipeline([
    ('data_selector', DataFrameSelector(list(date_cols))),
    ('date_converter', DateConverter(date_cols))
])

In [703]:
# Ad-text features: add length and word-count columns for the ad topic line.
# The selector gets its own copy of the column list.
# AdTopicLineLengthAddition appends the generated column names to
# `ad_line_cols` in place; a selector sharing that list would, on any later
# call, try to select those names from the input frame and raise KeyError.
ad_line_length = Pipeline([
    ('data_selector', DataFrameSelector(list(ad_line_cols))),
    ('ad_line_length_feature', AdTopicLineLengthAddition(ad_line_cols))
])

In [704]:
# Pass the label column through unchanged so it ends up next to the features.
# Uses `label_col` so the label name is defined in a single place.
add_label = Pipeline([
    ('data_selector', DataFrameSelector(label_col))
])

In [705]:
# Apply every sub-pipeline to the same input frame and concatenate their
# outputs column-wise, in the order listed here; the label comes last.
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
    ('date_pipeline',date_pipeline),
    ('ad_line_length', ad_line_length),
    ('add_label', add_label)
])

# Data Separation

In [706]:
# Fraction of the rows intended for the test split.
test_size = 0.2

# Feature columns: a label-based slice from 'Daily Time Spent on Site' through
# 'Timestamp' (both inclusive), so it depends on the column order of df.
X = df.loc[:, 'Daily Time Spent on Site': 'Timestamp']
# Target column; passed to the splitter as the stratification variable.
y = df['Clicked on Ad']

In [707]:
# Stratified on the 'Clicked on Ad' column so that both splits keep the
# class proportions of the full data set.
shuffle_split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(shuffle_split.split(X, y))

# split() returns positional row numbers, hence iloc: loc would look the
# numbers up as index labels, which only coincides for a default RangeIndex.
train_set = df.iloc[train_index]
test_set = df.iloc[test_index]

# Creating Train and Test CSV files

In [708]:
# Peek at the training split; the rows keep their index labels from df.
train_set.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,Male,Country,Timestamp,Clicked on Ad
747,49.13,32,41097.17,120.49,Optimized intermediate help-desk,0,Fiji,2016-01-29 00:45:19,1
586,78.29,38,57844.96,252.07,Networked regional Local Area Network,0,Liberia,2016-01-05 20:58:42,0
519,35.0,40,46033.73,151.25,Fully-configurable context-sensitive Graphic I...,1,Mongolia,2016-06-18 16:02:34,1
770,71.14,30,69758.31,224.82,Assimilated stable encryption,0,France,2016-06-21 00:52:47,0
600,84.0,48,46868.53,136.21,Inverse discrete extranet,1,Kyrgyz Republic,2016-04-17 05:08:52,1


In [709]:
# Fit every sub-pipeline on the training split and collect the combined
# feature array. As a side effect, the transformers extend the column-name
# lists (cat_cols, date_cols, ad_line_cols) in place.
train_cleaned = full_pipeline.fit_transform(train_set)

In [710]:
# Column labels in the same order as the FeatureUnion output. This relies on
# cat_cols, date_cols and ad_line_cols having been extended in place by the
# pipeline's transformers, so run it only after the fit_transform cell.
columns = nums + cat_cols + date_cols + ad_line_cols + label_col
# Wrap the transformed array in a DataFrame under the same variable name.
train_cleaned = pd.DataFrame(data=train_cleaned, columns=columns)

In [711]:
# First rows of the transformed, labelled training frame.
train_cleaned.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Country,Afghanistan,Albania,Algeria,American Samoa,Andorra,...,Zimbabwe,Timestamp,afternoon,evening,morning,night,Ad Topic Line,Ad Line Length,Ad Line Word Length,Clicked on Ad
0,-1.00327,-0.475928,-1.07451,-1.36901,Fiji,0,0,0,0,0,...,0,2016-01-29 00:45:19,0,0,0,1,Optimized intermediate help-desk,-0.24776,-0.451417,1
1,0.838468,0.211953,0.176047,1.63719,Liberia,0,0,0,0,0,...,0,2016-01-05 20:58:42,0,1,0,0,Networked regional Local Area Network,0.629266,3.72354,0
2,-1.89571,0.441247,-0.705899,-0.666237,Mongolia,0,0,0,0,0,...,0,2016-06-18 16:02:34,1,0,0,0,Fully-configurable context-sensitive Graphic I...,3.61116,1.63606,1
3,0.386877,-0.705222,1.06562,1.01461,France,0,0,0,0,0,...,0,2016-06-21 00:52:47,0,0,0,1,Assimilated stable encryption,-0.773976,-0.451417,0
4,1.19911,1.35842,-0.643565,-1.00985,Kyrgyz Republic,0,0,0,0,0,...,0,2016-04-17 05:08:52,0,0,1,0,Inverse discrete extranet,-1.4756,-0.451417,1
