In [18]:
import pandas as pd
from datetime import datetime

from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator

# Data Inputting

In [19]:
df = pd.read_csv('advertising.csv')
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


In [20]:
print(df['Timestamp'][0])

2016-03-27 00:53:11


In [21]:
print(type(df['Timestamp'][0]))

<class 'str'>


# Data Analysis

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.2+ KB


In [23]:
df.describe()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,65.0002,36.009,55000.00008,180.0001,0.481,0.5
std,15.853615,8.785562,13414.634022,43.902339,0.499889,0.50025
min,32.6,19.0,13996.5,104.78,0.0,0.0
25%,51.36,29.0,47031.8025,138.83,0.0,0.0
50%,68.215,35.0,57012.3,183.13,0.0,0.5
75%,78.5475,42.0,65470.635,218.7925,1.0,1.0
max,91.43,61.0,79484.8,269.96,1.0,1.0


In [24]:
df.nunique()

Daily Time Spent on Site     900
Age                           43
Area Income                 1000
Daily Internet Usage         966
Ad Topic Line               1000
City                         969
Male                           2
Country                      237
Timestamp                   1000
Clicked on Ad                  2
dtype: int64

# Pipelines

In [25]:
class DataFrameSelector(TransformerMixin, BaseEstimator):
    def __init__(self, arr):
        self.arr = arr
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.arr].to_numpy()

In [70]:
class MyLabelBinarizer(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.binarizer = LabelBinarizer()
    
    def fit(self, X, y=None):
        print(X)
        self.binarizer.fit(X)
        return self
    
    def transform(self, X, y=None):
        return self.binarizer.transform(X)

In [71]:
class DateConverter(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.date_str_format = "%Y-%m-%d %H-%M-%S"
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
#         Example of date
#         2016-03-27 00:53:11
        for row_index in range(len(X)):
            str_datetime = X[row_index]
            new_datetime = datetime.strptime(str_datetime,self.date_str_format)
            X[row_index] = new_datetime

In [72]:
nums = list(df.select_dtypes(exclude=['object']).columns)
cats = list(df.select_dtypes(include=['object']).columns)

# print(f'nums: {nums}')
# print(f'cats: {cats}')
# print(type(nums))

In [73]:
cats.remove('Ad Topic Line')
cats.remove('Timestamp')

nums.remove('Clicked on Ad')
nums.remove('Male')

date_cols = ['Timestamp']

In [79]:
print(cats)
print(nums)
print(date_col)

['City', 'Country']
['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']
['Timestamp']


In [83]:
df["Country"].nunique()

237

In [74]:
num_pipeline = Pipeline([
    ('data_selector', DataFrameSelector(nums)), 
    ('standard_scaler', StandardScaler())
])

In [75]:
cat_pipeline = Pipeline([
    ('data_selector', DataFrameSelector(cats)), 
    ('label_binarizer', MyLabelBinarizer())
])

In [76]:
date_pipeline = Pipeline([
    ('data_selector', DataFrameSelector(date_col)), 
    ("date_converter", DateConverter())
])

In [77]:
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
    ('date_pipeline',date_pipeline)
])

In [78]:
full_pipeline.fit(df)

[['Wrightburgh' 'Tunisia']
 ['West Jodi' 'Nauru']
 ['Davidton' 'San Marino']
 ...
 ['South Jessica' 'Mongolia']
 ['West Steven' 'Guatemala']
 ['Ronniemouth' 'Brazil']]


ValueError: Multioutput target data is not supported with label binarization