In [1]:


#imports
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

import os
import json
from pandas import json_normalize

def load_df(csv_path='../input/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df


In [2]:
train = load_df(nrows=100000, csv_path="data/train.csv")
test = load_df("data/test.csv", 100000)
pd.set_option('display.max_columns', None)

Loaded train.csv. Shape: (100000, 55)
Loaded test.csv. Shape: (100000, 53)


In [3]:
#df = df.loc[:, (df.isnull().sum(axis=0) <= max_number_of_nas)]
#train = train.loc[:, (train.eq("not available in demo dataset"))]

# Find the columns where values are not available
useless_cols = [col for col in train.columns if train[col].eq("not available in demo dataset").all()]
# Drop these columns from the dataframe
train.drop(useless_cols,
        axis=1,
        inplace=True)
train.info()

# Find the columns where values are not available
useless_cols = [col for col in train.columns if train[col].isna().all()]
# Drop these columns from the dataframe
train.drop(useless_cols,
        axis=1,
        inplace=True)
train.info()

  res_values = method(rvalues)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 38 columns):
 #   Column                                        Non-Null Count   Dtype 
---  ------                                        --------------   ----- 
 0   channelGrouping                               100000 non-null  object
 1   date                                          100000 non-null  int64 
 2   fullVisitorId                                 100000 non-null  object
 3   sessionId                                     100000 non-null  object
 4   socialEngagementType                          100000 non-null  object
 5   visitId                                       100000 non-null  int64 
 6   visitNumber                                   100000 non-null  int64 
 7   visitStartTime                                100000 non-null  int64 
 8   device.browser                                100000 non-null  object
 9   device.operatingSystem                        100000 non-nul

In [4]:
const_cols = [c for c in train.columns if train[c].nunique(dropna=False)==1 ]
const_cols

['socialEngagementType', 'totals.visits']

In [5]:
print("Variables not in test but in train : ", set(train.columns).difference(set(test.columns)))

Variables not in test but in train :  {'trafficSource.campaignCode', 'totals.transactionRevenue'}


In [6]:
cols_to_drop = const_cols + ['sessionId']

train_df = train.drop(cols_to_drop + ["trafficSource.campaignCode"], axis=1)
test_df = test.drop(cols_to_drop, axis=1)

In [7]:
from sklearn import model_selection, preprocessing, metrics
import datetime

# Impute 0 for missing target values
train_df["totals.transactionRevenue"].fillna(0.0, inplace=True)
train_df["totals.pageviews"].fillna(0.0, inplace=True)
train_df["totals.bounces"].fillna(0.0, inplace=True)
train_df["totals.newVisits"].fillna(0.0, inplace=True)
train_y = train_df["totals.transactionRevenue"].values
train_id = train_df["fullVisitorId"].values
test_id = test_df["fullVisitorId"].values


# label encode the categorical variables and convert the numerical variables to float
cat_cols = ["channelGrouping", "device.browser", 
            "device.deviceCategory", "device.operatingSystem", 
            "geoNetwork.city", "geoNetwork.continent", 
            "geoNetwork.country", "geoNetwork.metro",
            "geoNetwork.networkDomain", "geoNetwork.region", 
            "geoNetwork.subContinent", "trafficSource.adContent", 
            "trafficSource.adwordsClickInfo.adNetworkType", 
            "trafficSource.adwordsClickInfo.gclId", 
            "trafficSource.adwordsClickInfo.page", 
            "trafficSource.adwordsClickInfo.slot", "trafficSource.campaign",
            "trafficSource.keyword", "trafficSource.medium", 
            "trafficSource.referralPath", "trafficSource.source",
            'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.isTrueDirect']
for col in cat_cols:
    print(col)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
    train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
    test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))

train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].astype(float)

num_cols = ["totals.hits", "totals.pageviews", "visitNumber", "visitStartTime", 'totals.bounces',  'totals.newVisits']    
for col in num_cols:
    train_df[col] = train_df[col].astype(float)
    test_df[col] = test_df[col].astype(float)

# Split the train dataset into development and valid based on time 
#train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].astype('float')
dev_df = train_df[train_df['date']<=20170531]
val_df = train_df[train_df['date']>2017531]
dev_y = np.log1p(dev_df["totals.transactionRevenue"].values)
val_y = np.log1p(val_df["totals.transactionRevenue"].values)

dev_X = dev_df[cat_cols + num_cols] 
val_X = val_df[cat_cols + num_cols] 
test_X = test_df[cat_cols + num_cols]

channelGrouping
device.browser
device.deviceCategory
device.operatingSystem
geoNetwork.city
geoNetwork.continent
geoNetwork.country
geoNetwork.metro
geoNetwork.networkDomain
geoNetwork.region
geoNetwork.subContinent
trafficSource.adContent
trafficSource.adwordsClickInfo.adNetworkType
trafficSource.adwordsClickInfo.gclId
trafficSource.adwordsClickInfo.page
trafficSource.adwordsClickInfo.slot
trafficSource.campaign
trafficSource.keyword
trafficSource.medium
trafficSource.referralPath
trafficSource.source
trafficSource.adwordsClickInfo.isVideoAd
trafficSource.isTrueDirect


In [8]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    dev_X, dev_y, random_state=0)

In [9]:
X_train.head()

Unnamed: 0,channelGrouping,device.browser,device.deviceCategory,device.operatingSystem,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.isTrueDirect,totals.hits,totals.pageviews,visitNumber,visitStartTime,totals.bounces,totals.newVisits
85573,1,12,0,17,459,2,190,78,7450,318,12,47,3,8098,6,3,10,626,2,1386,16,1,1,1.0,1.0,1.0,1480437000.0,1.0,1.0
35822,2,12,0,17,317,4,58,0,7970,124,22,47,3,8098,6,3,3,626,0,1386,0,1,0,2.0,2.0,1.0,1493707000.0,0.0,1.0
80607,2,12,1,1,0,2,142,0,3206,0,15,47,3,8098,6,3,3,626,0,1386,0,1,0,5.0,5.0,1.0,1471878000.0,0.0,1.0
875,4,12,0,3,459,2,190,78,0,318,12,47,3,8098,6,3,3,5,5,1386,74,1,1,18.0,12.0,1.0,1472838000.0,0.0,1.0
87560,2,12,1,1,0,2,190,0,4047,0,12,47,3,8098,6,3,3,626,0,1386,0,1,0,1.0,1.0,1.0,1489842000.0,1.0,1.0


In [10]:
reg = GradientBoostingRegressor(random_state=0)
reg.fit(X_train, y_train)
val_pred = reg.predict(X_test[1:2])

In [11]:
reg.score(X_test, y_test)

0.3315152747907183

In [12]:
from sklearn import metrics
pred_val[pred_val<0] = 0
val_pred_df = pd.DataFrame({"fullVisitorId":val_df["fullVisitorId"].values})
val_pred_df["transactionRevenue"] = val_df["totals.transactionRevenue"].values
val_pred_df["PredictedRevenue"] = np.expm1(pred_val)
#print(np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_df["transactionRevenue"].values), np.log1p(val_pred_df["PredictedRevenue"].values))))
val_pred_df = val_pred_df.groupby("fullVisitorId")["transactionRevenue", "PredictedRevenue"].sum().reset_index()
print(np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_df["transactionRevenue"].values), np.log1p(val_pred_df["PredictedRevenue"].values))))

NameError: name 'pred_val' is not defined

In [13]:
from sklearn.model_selection import StratifiedShuffleSplit

train_df["revenue_cat"] = pd.cut(train_df["totals.transactionRevenue"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(train_df, train_df["revenue_cat"]):
    strat_train_set = train_df.loc[train_index]
    strat_test_set = train_df.loc[test_index]
    

#for set_ in (strat_train_set, strat_test_set):
 #   set_.drop("income_cat", axis=1, inplace=True)

strat_train_set = strat_train_set.drop(['revenue_cat'], axis=1)
strat_test_set= strat_test_set.drop(['revenue_cat'], axis=1)



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

We can tell that there appears to be no missing values

In [14]:
payingCustomers = train.loc[train['totals.transactionRevenue'].notna()]
payingCustomers

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.operatingSystem,device.isMobile,device.deviceCategory,geoNetwork.continent,geoNetwork.subContinent,geoNetwork.country,geoNetwork.region,geoNetwork.metro,geoNetwork.city,geoNetwork.networkDomain,totals.visits,totals.hits,totals.pageviews,totals.bounces,totals.newVisits,totals.transactionRevenue,trafficSource.campaign,trafficSource.source,trafficSource.medium,trafficSource.keyword,trafficSource.isTrueDirect,trafficSource.referralPath,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adContent,trafficSource.campaignCode
752,Direct,20160902,6194193421514403509,6194193421514403509_1472843572,Not Socially Engaged,1472843572,1,1472843572,Chrome,Linux,False,desktop,Americas,Northern America,United States,Michigan,Detroit MI,Ann Arbor,(not set),1,11,11,,1,37860000,(not set),(direct),(none),,True,,,,,,,,
753,Organic Search,20160902,5327166854580374902,5327166854580374902_1472844906,Not Socially Engaged,1472844906,3,1472844906,Chrome,Macintosh,False,desktop,Americas,Northern America,United States,New York,New York NY,New York,(not set),1,11,10,,,306670000,(not set),google,organic,(not provided),True,,,,,,,,
799,Referral,20160902,8885051388942907862,8885051388942907862_1472827393,Not Socially Engaged,1472827393,7,1472827393,Chrome,Linux,False,desktop,Americas,Northern America,United States,New York,New York NY,New York,(not set),1,13,11,,,68030000,(not set),mall.googleplex.com,referral,,True,/,,,,,,,
802,Referral,20160902,0185467632009737931,0185467632009737931_1472846398,Not Socially Engaged,1472846398,6,1472846398,Chrome,Windows,False,desktop,Americas,Northern America,United States,California,San Francisco-Oakland-San Jose CA,Mountain View,(not set),1,13,12,,,26250000,(not set),mall.googleplex.com,referral,,True,/,,,,,,,
859,Referral,20160902,3244885836845029978,3244885836845029978_1472824817,Not Socially Engaged,1472824817,4,1472824817,Chrome,Macintosh,False,desktop,Americas,Northern America,United States,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),1,17,14,,,574150000,(not set),mall.googleplex.com,referral,,True,/,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99729,Referral,20161030,3590222967147519334,3590222967147519334_1477882251,Not Socially Engaged,1477882251,3,1477882251,Chrome,Macintosh,False,desktop,Americas,Northern America,United States,Washington,Seattle-Tacoma WA,Seattle,(not set),1,42,28,,,83140000,(not set),mall.googleplex.com,referral,,True,/,,,,,,,
99730,Referral,20161030,6781934558737955524,6781934558737955524_1477852085,Not Socially Engaged,1477852085,3,1477852085,Chrome,Macintosh,False,desktop,Americas,Northern America,United States,Michigan,Detroit MI,Ann Arbor,comcastbusiness.net,1,43,31,,,55970000,(not set),mall.googleplex.com,referral,,True,/,,,,,,,
99733,Referral,20161030,5976063913493194585,5976063913493194585_1477847924,Not Socially Engaged,1477847924,3,1477847924,Chrome,Macintosh,False,desktop,Americas,Northern America,United States,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),1,48,38,,,111880000,(not set),mall.googleplex.com,referral,,,/,,,,,,,
99736,Referral,20161030,1710276641442998015,1710276641442998015_1477873893,Not Socially Engaged,1477873893,2,1477873893,Chrome,Macintosh,False,desktop,Americas,Northern America,United States,New York,New York NY,New York,(not set),1,56,47,,,19990000,(not set),mall.googleplex.com,referral,,,/,,,,,,,


In [15]:
payingCustomers.describe()

Unnamed: 0,date,visitId,visitNumber,visitStartTime
count,1399.0,1399.0,1399.0,1399.0
mean,20165550.0,1485392000.0,3.887777,1485392000.0
std,4715.261,9379675.0,10.389323,9379678.0
min,20160810.0,1470908000.0,1.0,1470908000.0
25%,20161120.0,1479507000.0,1.0,1479507000.0
50%,20161220.0,1481930000.0,2.0,1481930000.0
75%,20170520.0,1494876000.0,4.0,1494876000.0
max,20170800.0,1501652000.0,315.0,1501652000.0


In [16]:
payingCustomers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1399 entries, 752 to 99743
Data columns (total 38 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   channelGrouping                               1399 non-null   object
 1   date                                          1399 non-null   int64 
 2   fullVisitorId                                 1399 non-null   object
 3   sessionId                                     1399 non-null   object
 4   socialEngagementType                          1399 non-null   object
 5   visitId                                       1399 non-null   int64 
 6   visitNumber                                   1399 non-null   int64 
 7   visitStartTime                                1399 non-null   int64 
 8   device.browser                                1399 non-null   object
 9   device.operatingSystem                        1399 non-null   object
 1

## Get the data

In [17]:
#imports
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

import os
import json
from pandas import json_normalize

json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']

def load_df(csv_path='../input/train.csv', nrows=None):
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in json_cols}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in json_cols:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

## Splitting the data

I just don't see how this works with a pipeline..

In [18]:
from sklearn.model_selection import StratifiedShuffleSplit

train["totals.transactionRevenue"] = pd.cut(train["totals.transactionRevenue"],
                               bins=[0., np.inf],
                               labels=[1, 2])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(train, train["totals.transactionRevenue"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

TypeError: '<' not supported between instances of 'float' and 'str'

## Transformers

#### Transformer for removing unwanted features

In [None]:
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator

class FeatureReducer(BaseEstimator, TransformerMixin):
    def __init__(self, features):
        self.features = features
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.drop(self.features, axis=1)

#### Transformers for labeling, converting and imputing the data

In [132]:
class Labeler(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        for col in self.cat_cols:
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(X[col].values.astype('str')))
            X[col] = lbl.transform(list(X[col].values.astype('str')))
        return X

class Floatinator(BaseEstimator, TransformerMixin):
    def __init__(self, num_cols):
        self.num_cols = num_cols
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        for col in self.num_cols:
            X[col] = X[col].astype(float)
        return X
    
class SimplerImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        for col in self.cols:
            X[col].fillna(0.0, inplace=True)
        return X


#### Splitting the data

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

train["totals.transactionRevenue"] = pd.cut(train["totals.transactionRevenue"],
                               bins=[0., np.inf],
                               labels=[1, 2])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(train, train["totals.transactionRevenue"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

TypeError: 'NoneType' object is not subscriptable

TypeError: '<' not supported between instances of 'float' and 'str'

Collect all columns which need to be dropped

In [None]:
useless_cols = [col for col in train.columns 
                if train[col].isna().all() 
                or train[col].eq("not available in demo dataset").all()
                or train[col].nunique(dropna=False)==1]
useless_cols = useless_cols + ["trafficSource.campaignCode"] + ["sessionId"]
useless_cols

['socialEngagementType',
 'totals.visits',
 'trafficSource.campaignCode',
 'sessionId']

#### Declare columns which need to be flattened:

In [None]:
json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']

#### Declare categorical columns

In [None]:
categorical_cols = ["channelGrouping", "device.browser", 
            "device.deviceCategory", "device.operatingSystem", 
            "geoNetwork.city", "geoNetwork.continent", 
            "geoNetwork.country", "geoNetwork.metro",
            "geoNetwork.networkDomain", "geoNetwork.region", 
            "geoNetwork.subContinent", "trafficSource.adContent", 
            "trafficSource.adwordsClickInfo.adNetworkType", 
            "trafficSource.adwordsClickInfo.gclId", 
            "trafficSource.adwordsClickInfo.page", 
            "trafficSource.adwordsClickInfo.slot", "trafficSource.campaign",
            "trafficSource.keyword", "trafficSource.medium", 
            "trafficSource.referralPath", "trafficSource.source",
            'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.isTrueDirect']

### Declare numerical columns

In [162]:
num_cols = ["totals.hits", "totals.pageviews", "visitNumber", "visitStartTime", 'totals.bounces',  'totals.newVisits']

### Imports

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [None]:
for col in cat_cols:
    print(col)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
    train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
    test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))

channelGrouping
device.browser
device.deviceCategory
device.operatingSystem
geoNetwork.city
geoNetwork.continent
geoNetwork.country
geoNetwork.metro
geoNetwork.networkDomain
geoNetwork.region
geoNetwork.subContinent
trafficSource.adContent
trafficSource.adwordsClickInfo.adNetworkType
trafficSource.adwordsClickInfo.gclId
trafficSource.adwordsClickInfo.page
trafficSource.adwordsClickInfo.slot
trafficSource.campaign
trafficSource.keyword
trafficSource.medium
trafficSource.referralPath
trafficSource.source
trafficSource.adwordsClickInfo.isVideoAd
trafficSource.isTrueDirect


In [None]:
prepare_data_pipeline = Pipeline([
    ('flatten', JsonFlattener(json_cols)),
    ('reduce', FeatureReducer(useless_cols))
])

"""label_encoding_pipeline = Pipeline([
    ("encode_labels", ColumnTransformer(
        ("label_transformer", LabelEncoder(), categorical_cols)
    ))
])"""

full_pipeline = Pipeline([
    ("impute_revenue", ColumnTransformer(
        ('revenue_imputer', SimpleImputer(strategy="constant", fill_value=0), "totals.transactionRevenue")
    )),
    ('reduce', FeatureReducer(useless_cols)),
    ("encode_labels", ColumnTransformer(
        ("label_transformer", LabelEncoder(), categorical_cols)
    ))
])

"""
train_pipeline = Pipeline([
    full_pipeline,
    ("impute_revenue", ColumnTransformer([
        ('revenue_imputer', SimpleImputer(strategy="constant", fill_value=0), "totals.transactionRevenue")
    ]))
])"""

## Temp
some_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant", fill_value=0)),
    ('GBR', GradientBoostingRegressor(random_state=0))
])

ValueError: not enough values to unpack (expected 2, got 1)

In [166]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

some_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant", fill_value=0))
])

## Get data

In [235]:
train = load_df(nrows=100000, csv_path="data/train.csv")
test = load_df("data/test.csv", 100000)
pd.set_option('display.max_columns', None)

Loaded train.csv. Shape: (100000, 55)
Loaded test.csv. Shape: (100000, 53)


#### Declare columns which need to be flattened:

In [236]:
json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']

#### Declare categorical columns

In [237]:
categorical_cols = ["channelGrouping", "device.browser", 
            "device.deviceCategory", "device.operatingSystem", 
            "geoNetwork.city", "geoNetwork.continent", 
            "geoNetwork.country", "geoNetwork.metro",
            "geoNetwork.networkDomain", "geoNetwork.region", 
            "geoNetwork.subContinent", "trafficSource.adContent", 
            "trafficSource.adwordsClickInfo.adNetworkType", 
            "trafficSource.adwordsClickInfo.gclId", 
            "trafficSource.adwordsClickInfo.page", 
            "trafficSource.adwordsClickInfo.slot", "trafficSource.campaign",
            "trafficSource.keyword", "trafficSource.medium", 
            "trafficSource.referralPath", "trafficSource.source",
            'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.isTrueDirect']

### Declare numerical columns

In [238]:
num_cols = ["totals.transactionRevenue", "totals.hits", "totals.pageviews", "visitNumber", "visitStartTime", 'totals.bounces',  'totals.newVisits']

## Preprocess data for training

In [239]:
# Data is already flattened

# Remove useless columns
train = FeatureReducer(useless_cols).transform(train)

In [240]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 34 columns):
 #   Column                                        Non-Null Count   Dtype 
---  ------                                        --------------   ----- 
 0   channelGrouping                               100000 non-null  object
 1   date                                          100000 non-null  int64 
 2   fullVisitorId                                 100000 non-null  object
 3   visitId                                       100000 non-null  int64 
 4   visitNumber                                   100000 non-null  int64 
 5   visitStartTime                                100000 non-null  int64 
 6   device.browser                                100000 non-null  object
 7   device.operatingSystem                        100000 non-null  object
 8   device.isMobile                               100000 non-null  bool  
 9   device.deviceCategory                         100000 non-nul

In [241]:
# Impute values
cols_to_impute = [
    "totals.transactionRevenue",
    "totals.pageviews",
    "totals.bounces",
    "totals.newVisits"
]
train = SimplerImputer(cols_to_impute).transform(train)

In [242]:
train = Labeler(cat_cols).transform(train)

In [243]:
train = Floatinator(num_cols).transform(train)

In [244]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 34 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   channelGrouping                               100000 non-null  int64  
 1   date                                          100000 non-null  int64  
 2   fullVisitorId                                 100000 non-null  object 
 3   visitId                                       100000 non-null  int64  
 4   visitNumber                                   100000 non-null  float64
 5   visitStartTime                                100000 non-null  float64
 6   device.browser                                100000 non-null  int64  
 7   device.operatingSystem                        100000 non-null  int64  
 8   device.isMobile                               100000 non-null  bool   
 9   device.deviceCategory                         100

In [245]:
train.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.operatingSystem,device.isMobile,device.deviceCategory,geoNetwork.continent,geoNetwork.subContinent,geoNetwork.country,geoNetwork.region,geoNetwork.metro,geoNetwork.city,geoNetwork.networkDomain,totals.hits,totals.pageviews,totals.bounces,totals.newVisits,totals.transactionRevenue,trafficSource.campaign,trafficSource.source,trafficSource.medium,trafficSource.keyword,trafficSource.isTrueDirect,trafficSource.referralPath,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adContent
0,4,20160902,1131660440785968503,1472830385,1.0,1472830000.0,5,12,False,0,3,21,168,93,0,118,6360,1.0,1.0,1.0,1.0,0.0,0,49,5,5,1,527,4,2,2389,2,1,21
1,4,20160902,377306020877927890,1472880147,1.0,1472880000.0,8,7,False,0,5,1,9,217,52,289,1762,1.0,1.0,1.0,1.0,0.0,0,49,5,5,1,527,4,2,2389,2,1,21
2,4,20160902,3895546263509774583,1472865386,1.0,1472865000.0,5,12,False,0,4,19,151,49,0,145,6597,1.0,1.0,1.0,1.0,0.0,0,49,5,5,1,527,4,2,2389,2,1,21
3,4,20160902,4763447161404445595,1472881213,1.0,1472881000.0,26,6,False,0,3,16,76,217,52,289,6597,1.0,1.0,1.0,1.0,0.0,0,49,5,203,1,527,4,2,2389,2,1,21
4,4,20160902,27294437909732085,1472822600,2.0,1472823000.0,5,1,True,1,4,13,174,217,52,289,6597,1.0,1.0,1.0,0.0,0.0,0,49,5,5,0,527,4,2,2389,2,1,21


## Split data

In [246]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train, 
    train["totals.transactionRevenue"], test_size=0.4, random_state=42)

In [247]:
y_train.head()

40507    0.0
72707    0.0
90912    0.0
28532    0.0
13006    0.0
Name: totals.transactionRevenue, dtype: float64

In [248]:
y_test.describe()

count    4.000000e+04
mean     1.813006e+06
std      3.129121e+07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      2.365500e+09
Name: totals.transactionRevenue, dtype: float64

In [249]:
X_train = X_train.drop('totals.transactionRevenue', axis=1)
gbr = GradientBoostingRegressor(random_state=0)
gbr.fit(X_train, y_train)
gbr.score(X_test.drop('totals.transactionRevenue', axis=1), y_test)

-0.7378455520438922

In [255]:
X_test# = X_test.drop('totals.transactionRevenue', axis=1)
pred = gbr.predict(X_test)

In [257]:
pred = pred.clip(min=0)
pred

array([     0.        , 286493.87063747,      0.        , ...,
            0.        ,      0.        ,      0.        ])

In [259]:
from sklearn.metrics import mean_squared_error as mse
mse(y_test,pred)

1701258857615180.8

In [None]:
y_test.head()

TODO:

Impute missing values
    Set revnue to 0 if not present
scale

 0   channelGrouping                               100000 non-null  object
* 1   date                                          100000 non-null  int64 
- 2   fullVisitorId                                 100000 non-null  object
- 3   sessionId                                     100000 non-null  object
 4   socialEngagementType                          100000 non-null  object
- 5   visitId                                       100000 non-null  int64 
 6   visitNumber                                   100000 non-null  int64 
* 7   visitStartTime                                100000 non-null  int64 
 8   device.browser                                100000 non-null  object
 9   device.operatingSystem                        100000 non-null  object
 10  device.isMobile                               100000 non-null  bool  
 11  device.deviceCategory                         100000 non-null  object
 12  geoNetwork.continent                          100000 non-null  object
 13  geoNetwork.subContinent                       100000 non-null  object
 14  geoNetwork.country                            100000 non-null  object
 15  geoNetwork.region                             100000 non-null  object
 16  geoNetwork.metro                              100000 non-null  object
 17  geoNetwork.city                               100000 non-null  object
* 18  geoNetwork.networkDomain                      100000 non-null  object
 19  totals.visits                                 100000 non-null  object
 20  totals.hits                                   100000 non-null  object
 21  totals.pageviews                              99993 non-null   object
 22  totals.bounces                                48916 non-null   object
 23  totals.newVisits                              77263 non-null   object
 24  totals.transactionRevenue                     1399 non-null    object
 25  trafficSource.campaign                        100000 non-null  object
 26  trafficSource.source                          100000 non-null  object
 27  trafficSource.medium                          100000 non-null  object
 28  trafficSource.keyword                         44218 non-null   object
 29  trafficSource.isTrueDirect                    30454 non-null   object
 30  trafficSource.referralPath                    36473 non-null   object
 31  trafficSource.adwordsClickInfo.page           2574 non-null    object
 32  trafficSource.adwordsClickInfo.slot           2574 non-null    object
 33  trafficSource.adwordsClickInfo.gclId          2625 non-null    object
 34  trafficSource.adwordsClickInfo.adNetworkType  2574 non-null    object
 35  trafficSource.adwordsClickInfo.isVideoAd      2574 non-null    object
 36  trafficSource.adContent                       1325 non-null    object
 37  trafficSource.campaignCode                    1 non-null       object

In [28]:
train["geoNetwork.networkDomain"].head()

0       ttnet.com.tr
1        dodo.net.au
2    unknown.unknown
3    unknown.unknown
4    unknown.unknown
Name: geoNetwork.networkDomain, dtype: object

In [29]:
payingCustomers["totals.transactionRevenue"].head()

752     37860000
753    306670000
799     68030000
802     26250000
859    574150000
Name: totals.transactionRevenue, dtype: object

In [30]:
payingCustomers["totals.transactionRevenue"].describe()

count         1399
unique         969
top       16990000
freq            28
Name: totals.transactionRevenue, dtype: object

In [31]:
payingCustomers.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.operatingSystem,device.isMobile,device.deviceCategory,geoNetwork.continent,geoNetwork.subContinent,geoNetwork.country,geoNetwork.region,geoNetwork.metro,geoNetwork.city,geoNetwork.networkDomain,totals.visits,totals.hits,totals.pageviews,totals.bounces,totals.newVisits,totals.transactionRevenue,trafficSource.campaign,trafficSource.source,trafficSource.medium,trafficSource.keyword,trafficSource.isTrueDirect,trafficSource.referralPath,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adContent,trafficSource.campaignCode
752,Direct,20160902,6194193421514403509,6194193421514403509_1472843572,Not Socially Engaged,1472843572,1,1472843572,Chrome,Linux,False,desktop,Americas,Northern America,United States,Michigan,Detroit MI,Ann Arbor,(not set),1,11,11,,1.0,37860000,(not set),(direct),(none),,True,,,,,,,,
753,Organic Search,20160902,5327166854580374902,5327166854580374902_1472844906,Not Socially Engaged,1472844906,3,1472844906,Chrome,Macintosh,False,desktop,Americas,Northern America,United States,New York,New York NY,New York,(not set),1,11,10,,,306670000,(not set),google,organic,(not provided),True,,,,,,,,
799,Referral,20160902,8885051388942907862,8885051388942907862_1472827393,Not Socially Engaged,1472827393,7,1472827393,Chrome,Linux,False,desktop,Americas,Northern America,United States,New York,New York NY,New York,(not set),1,13,11,,,68030000,(not set),mall.googleplex.com,referral,,True,/,,,,,,,
802,Referral,20160902,185467632009737931,0185467632009737931_1472846398,Not Socially Engaged,1472846398,6,1472846398,Chrome,Windows,False,desktop,Americas,Northern America,United States,California,San Francisco-Oakland-San Jose CA,Mountain View,(not set),1,13,12,,,26250000,(not set),mall.googleplex.com,referral,,True,/,,,,,,,
859,Referral,20160902,3244885836845029978,3244885836845029978_1472824817,Not Socially Engaged,1472824817,4,1472824817,Chrome,Macintosh,False,desktop,Americas,Northern America,United States,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),1,17,14,,,574150000,(not set),mall.googleplex.com,referral,,True,/,,,,,,,


## Deployment

Suggestion for deployment: Write a web API which can receive data like one or more rows from the test dataset and return a prediction for that data. We are thinking the client will send the data as JSON and receive a JSON response.

In order to make this happen, we will need to have a way to transform our data to JSON format, as well as a way to transform it back to a dataframe. 

In [32]:
train.select_dtypes(exclude=["number","bool_","object_"])

0
1
2
3
4
...
99995
99996
99997
99998
99999


In [33]:
train.select_dtypes(np.number).head()

Unnamed: 0,date,visitId,visitNumber,visitStartTime
0,20160902,1472830385,1,1472830385
1,20160902,1472880147,1,1472880147
2,20160902,1472865386,1,1472865386
3,20160902,1472881213,1,1472881213
4,20160902,1472822600,2,1472822600


In [34]:
train.select_dtypes(exclude=["number","bool_","object_"])

0
1
2
3
4
...
99995
99996
99997
99998
99999


In [48]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,RobustScaler  
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

In [55]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown
        

    def fit(self, X, y=None):


            #OneHotEncoding all numerical values
            X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
            n_samples, n_features = X.shape

            #LabelEncoding all categorical values
            self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

            for i in range(n_features):
                le = self._label_encoders_[i]
                Xi = X[:, i]
                if self.categories == 'auto':
                    le.fit(Xi)
                else:
                    valid_mask = np.in1d(Xi, self.categories[i])
                    if not np.all(valid_mask):
                        if self.handle_unknown == 'error':
                            diff = np.unique(Xi[~valid_mask])
                            msg = ("Found unknown categories {0} in column {1}"
                                   " during fit".format(diff, i))
                            raise ValueError(msg)
                    le.classes_ = np.array(np.sort(self.categories[i]))

            self.categories_ = [le.classes_ for le in self._label_encoders_]

            return self

    def transform(self, X):
            X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
            n_samples, n_features = X.shape
            X_int = np.zeros_like(X, dtype=np.int)
            X_mask = np.ones_like(X, dtype=np.bool)

            for i in range(n_features):
                valid_mask = np.in1d(X[:, i], self.categories_[i])

                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(X[~valid_mask, i])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during transform".format(diff, i))
                        raise ValueError(msg)
                    else:
                        X_mask[:, i] = valid_mask
                        X[:, i][~valid_mask] = self.categories_[i][0]
                X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

            if self.encoding == 'ordinal':
                return X_int.astype(self.dtype, copy=False)

            mask = X_mask.ravel()
            n_values = [cats.shape[0] for cats in self.categories_]
            n_values = np.array([0] + n_values)
            indices = np.cumsum(n_values)

            column_indices = (X_int + indices[:-1]).ravel()[mask]
            row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                    n_features)[mask]
            data = np.ones(n_samples * n_features)[mask]

            out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                    shape=(n_samples, indices[-1]),
                                    dtype=self.dtype).tocsr()
            if self.encoding == 'onehot-dense':
                return out.toarray()
            else:
                return out

In [56]:
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,feature_names):
        self.feature_names = feature_names
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X[self.feature_names].values
    

# build pipelines
cat_attribs = ["channelGrouping", "device.browser", 
            "device.deviceCategory", "device.operatingSystem", 
            "geoNetwork.city", "geoNetwork.continent", 
            "geoNetwork.country", "geoNetwork.metro",
            "geoNetwork.networkDomain", "geoNetwork.region", 
            "geoNetwork.subContinent", "trafficSource.adContent", 
            "trafficSource.adwordsClickInfo.adNetworkType", 
            "trafficSource.adwordsClickInfo.gclId", 
            "trafficSource.adwordsClickInfo.page", 
            "trafficSource.adwordsClickInfo.slot", "trafficSource.campaign",
            "trafficSource.keyword", "trafficSource.medium", 
            "trafficSource.referralPath", "trafficSource.source",
            'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.isTrueDirect']

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_attribs = train.select_dtypes(include=numerics)

num_pipeline = Pipeline([
               ('selector',DataFrameSelector(num_attribs)),      
               ('std_scaler',StandardScaler()), 
                ]) 

# build categorical pipeline
cat_pipeline = Pipeline([
                  ('selector',DataFrameSelector(cat_attribs)),
                  ('cat_encoder',CategoricalEncoder(encoding='onehot-dense')),
              ])


# merge all the transforms using "FeatureUnion"
pipelines = FeatureUnion(transformer_list=
                             [ 
                              ('num_pipeline',num_pipeline),
                              ('cat_pipeline',cat_pipeline),
                             ])

In [57]:
train.dtypes

channelGrouping                                 object
date                                             int64
fullVisitorId                                   object
sessionId                                       object
socialEngagementType                            object
visitId                                          int64
visitNumber                                      int64
visitStartTime                                   int64
device.browser                                  object
device.operatingSystem                          object
device.isMobile                                   bool
device.deviceCategory                           object
geoNetwork.continent                            object
geoNetwork.subContinent                         object
geoNetwork.country                              object
geoNetwork.region                               object
geoNetwork.metro                                object
geoNetwork.city                                 object
geoNetwork

In [58]:
train_prepared = pipelines.fit_transform(train)

ValueError: Boolean array expected for the condition, not int64

## Setting up the server

In [59]:
import flask
from flask import request

app = flask.Flask(__name__)

@app.route('/', methods=['GET'])
def home():
    return "<h1>API for Google store revenue predictions</h1>"

@app.route('/', methods=['POST'])
def predict():
    data = request.get_json()
    data = prepare(pd.read_json(data)) # Prepares the data for a prediction
    pred = model.predict(data)         
    res = prepare_result(pred, data)   # Prepares the result by constructing a dataframe of {id, predicted_revenue}
    return app.response_class(
        response=res,
        status=200,
        mimetype='application/json'
    )
    

app.run()

IndentationError: expected an indented block (<ipython-input-59-8f08ef0fb041>, line 14)

### TODO/Checklist

* Create pipeline
    * Create transformers for:
        * Remove useless columns
        * Impute missing values
        * Transform total revenue to natural log of total revenue
        
        
* Test the model to make sure it performs well-ish
        
* Store model to a file using pickle or joblib
    
    

* Create API - The API will receive raw data in the same format as our initial CSV files. It will then have to:
    * Remove missing columns
    * impute missing values
    * any other preprocessing
    * Make prediction
    * Return a JSON object containing customer ID and transactionrevenue. 
* Test API
   
We might start by only accepting single row data and then expand to allow multiple row data. 

### If we have time:
* Display data in nice ways to gain insights
* Try different models
* Deploy API