In [1]:
import pandas as pd
import numpy as np

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Garbage collector
import gc
gc.enable()

In [2]:
# Import the dataframes without JSON features and useless features
train_raw_df = pd.read_csv("cleaned_train(2).csv",
    dtype={'date': str, 'fullVisitorId': str}, nrows=None)
test_raw_df = pd.read_csv("cleaned_test(2).csv",
    dtype={'date': str, 'fullVisitorId': str}, nrows=None)
train_raw_df.shape, test_raw_df.shape

((903653, 33), (804684, 32))

## First preprocessing

### Cast string categorical features to lower case

In [None]:
train_raw_df.info()

In [3]:
string_features = ['channelGrouping', 'device.browser', 'device.deviceCategory',
                  'device.operatingSystem', 'geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country',
                  'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
                  'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
                  'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.gclId',
                  'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign', 'trafficSource.referralPath',
                  'geoNetwork.subContinent','trafficSource.medium', 'trafficSource.source', 'trafficSource.keyword']

for col in string_features:
    train_raw_df[col] = train_raw_df[col].str.lower()
    test_raw_df[col] = test_raw_df[col].str.lower()

### NOTE: the next step is optional

### Inspect disguised nans

In [None]:
# check how many categories are present per column (before nans substitution)
for c in train_raw_df.columns:
    print(c, len(np.unique(train_raw_df[c].astype(str))))

In [None]:
from collections import Counter
Counter(train_raw_df['channelGrouping'])

In [None]:
train_raw_df['channelGrouping'].unique()

### Convert disguised nans

In [4]:
nan_list = [
    "(not set)",
    "not available in demo dataset",
    "not.configured",
    "(not provided)",
    "unknown.unknown",
    "/",
    "(Other)",
    "(other)",
    "(none)"
]

nan_dict = {nl:np.nan for nl in nan_list}

# convert all "disguised" missing values to nans
def miss_to_nan(df):
    df.replace(nan_dict, inplace=True) # convert disguised NaNs
    df.dropna(axis=1, how='all', inplace=True)
    return df

In [None]:
'''
import datetime

# convert date from string to datetime
def date_conv(df):
    df['date'] = df['date'].apply(lambda x: datetime.date(int(str(x)[:4]), int(str(x)[4:6]), int(str(x)[6:])))
    return df

# convert all "disguised" missing values to nans
def miss_to_nan(df):
    df.replace(nan_dict, inplace=True) # convert disguised NaNs
    df.dropna(axis=1, how='all', inplace=True)
    return df

# exec the two previous functions
def first_preprocessing(df):
#     df = date_conv(df)
    df = miss_to_nan(df)
    return df
'''

### Checkpoint

In [5]:
# apply the preprocessing up to this point and save into copies
train_df = miss_to_nan(train_raw_df)
test_df = miss_to_nan(test_raw_df)

### Fill nans with zeroes in target column

In [6]:
train_df['totals.transactionRevenue'].fillna(0, inplace=True)

### NOTE: the next step is optional

### Inspect data

In [None]:
# check the nan condition in totals.newVisits
from itertools import compress
sum(list(compress(train_df['visitNumber'] > 1, train_df['totals.newVisits'].isnull()))) == train_df['totals.newVisits'].isnull().sum()

In [None]:
# check the nan condition in totals.newVisits
print(sum(train_df['totals.newVisits'].isnull()))
sum(list(compress(train_df['totals.newVisits'].isnull(), train_df['visitNumber'] > 1)))

In [None]:
# check if there is difference between nan and (none) in df['trafficSource.medium']
print(sum(train_df['trafficSource.medium'].isnull()))
sum(list(compress(train_df['trafficSource.medium'].isnull(), train_df['totals.transactionRevenue'] == 0)))

### c'è un valore con revenue !!!!!

In [None]:
#same shit for trafficSource.source
print(sum(train_df['trafficSource.source'].isnull()))
sum(list(compress(train_df['trafficSource.source'].isnull(), train_df['totals.transactionRevenue'] == 0)))

In [None]:
train_df.isnull().sum()

In [17]:
#check again number of categories in each variable (eg column)
for c in train_df.columns:
    print(c, len(np.unique(train_df[c].astype(str))))

channelGrouping 8
fullVisitorId 714167
sessionId 902755
visitId 886303
visitNumber 384
visitStartTime 887159
device.browser 12
device.deviceCategory 3
device.isMobile 2
device.operatingSystem 9
geoNetwork.city 73
geoNetwork.continent 6
geoNetwork.country 69
geoNetwork.metro 19
geoNetwork.networkDomain 82
geoNetwork.region 61
geoNetwork.subContinent 20
totals.bounces 2
totals.hits 274
totals.newVisits 2
totals.pageviews 213
totals.transactionRevenue 5333
trafficSource.adContent 5
trafficSource.adwordsClickInfo.adNetworkType 3
trafficSource.adwordsClickInfo.gclId 17775
trafficSource.adwordsClickInfo.page 9
trafficSource.adwordsClickInfo.slot 3
trafficSource.campaign 5
trafficSource.isTrueDirect 2
trafficSource.keyword 8
trafficSource.medium 6
trafficSource.referralPath 31
trafficSource.source 23


In [None]:
for i in train_df.columns:    
    if len(train_df[i].unique()) <= 100:
        print(i + '\n')
        print(train_df[i].unique())
        print('\n')

## Second preprocessing

### Missing values filling

In [7]:
def fill_nans(df):
    
    
    df['channelGrouping'].fillna('Organic Search', inplace=True) # few nans, a lot of "Organic Search"
    df['device.browser'].fillna('Chrome', inplace=True) # few nans, a lot of "Chrome"
    df['device.operatingSystem'].fillna('unknown', inplace=True) # no predominance of any value, quite a bit of nans, keep separated category
    df['totals.pageviews'].fillna(1, inplace=True) # many 1s, nans do not bring any revenue -> nans become 1s
    df['trafficSource.medium'].fillna('unknown', inplace=True) # seems to be the same
    df['trafficSource.source'].fillna('unknown', inplace = True) # seems to be the same
    
    
    df['geoNetwork.continent'].fillna('unknown', inplace=True)# keep separate category
    df['geoNetwork.country'].fillna('unknown', inplace=True)# keep separate category
    df['geoNetwork.subContinent'].fillna('unknown', inplace=True)# keep separate category

    
    # Features with huge amout of nans
    df['geoNetwork.city'].fillna('unknown', inplace=True)
    df['geoNetwork.metro'].fillna('unknown', inplace=True)
    df['geoNetwork.networkDomain'].fillna('unknown', inplace=True)
    df['geoNetwork.region'].fillna('unknown', inplace=True)
    df['totals.bounces'].fillna(0, inplace=True) # nan and 1
    df['trafficSource.adContent'].fillna('unknown', inplace=True)
    df['trafficSource.adwordsClickInfo.adNetworkType'].fillna('unknown', inplace=True)
    df['trafficSource.adwordsClickInfo.gclId'].fillna('unknown', inplace=True)
#     df['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True, inplace=True) # nan and False
    df['trafficSource.adwordsClickInfo.page'].fillna(0, inplace=True) # 0 is not cointained in the set
    df['trafficSource.adwordsClickInfo.slot'].fillna('unknown', inplace=True)
    df['trafficSource.campaign'].fillna('unknown', inplace=True)
    df['trafficSource.isTrueDirect'].fillna(False, inplace=True) # nan and True
    df['trafficSource.keyword'].fillna('unknown', inplace=True)
    df['trafficSource.referralPath'].fillna('unknown', inplace=True)
    df['totals.newVisits'].fillna(0, inplace=True)# totals.newVisits is always == nan when visitNumber > 1, we keep it for now but it will be dropped eventually

    return df

In [8]:
train_df = fill_nans(train_df)
test_df = fill_nans(test_df)

In [19]:
train_df.isnull().sum()

channelGrouping                                 0
fullVisitorId                                   0
sessionId                                       0
visitId                                         0
visitNumber                                     0
visitStartTime                                  0
device.browser                                  0
device.deviceCategory                           0
device.isMobile                                 0
device.operatingSystem                          0
geoNetwork.city                                 0
geoNetwork.continent                            0
geoNetwork.country                              0
geoNetwork.metro                                0
geoNetwork.networkDomain                        0
geoNetwork.region                               0
geoNetwork.subContinent                         0
totals.bounces                                  0
totals.hits                                     0
totals.newVisits                                0


### NOTE: the next step is optional (may improve performances but it has to be tested)

### Cleaning rare categories

In [9]:
# Actually we have no longer missing values, and we can manage the problem of the rare categories
# We group them in an overfeature
def clear_rare_categories(df, feature, limit = 900):

    vc = df[feature].value_counts()
    
    common = vc > limit
    common = set(common.index[common].values)
    print("Set", sum(vc <= limit), feature, "categories to 'other';", end=" ")
    
    df.loc[df[feature].map(lambda x: x not in common), feature] = 'other'
    print("now there are", df[feature].nunique(), "categories")
    
    return df

In [10]:
# to be chacked better!!!

# There are features that we don't want to modify
raw_features = [
"channelGrouping",
"device.browser",
"device.deviceCategory",
"device.isMobile",
"device.operatingSystem",
"geoNetwork.city",
"geoNetwork.continent",
"geoNetwork.country",
"geoNetwork.metro",
"geoNetwork.networkDomain",
"geoNetwork.region",
"geoNetwork.subContinent",
"totals.bounces",
"totals.newVisits",
"trafficSource.adContent",
"trafficSource.adwordsClickInfo.adNetworkType",
# "trafficSource.adwordsClickInfo.isVideoAd",
"trafficSource.adwordsClickInfo.slot",
"trafficSource.campaign",
"trafficSource.isTrueDirect",
"trafficSource.keyword",
"trafficSource.medium",
"trafficSource.referralPath",
"trafficSource.source"
]


for feat in raw_features:
    print("\nTRAIN:\n")
    train_df = clear_rare_categories(train_df, feat)
    print("\nTEST:\n")
    test_df = clear_rare_categories(test_df, feat)
    
#     if(train_df[feat].nunique() != test_df[feat].nunique()):
#         print("\n\n\nWARNING:\n")



TRAIN:

Set 1 channelGrouping categories to 'other'; now there are 8 categories

TEST:

Set 1 channelGrouping categories to 'other'; now there are 8 categories

TRAIN:

Set 43 device.browser categories to 'other'; now there are 12 categories

TEST:

Set 96 device.browser categories to 'other'; now there are 14 categories

TRAIN:

Set 0 device.deviceCategory categories to 'other'; now there are 3 categories

TEST:

Set 0 device.deviceCategory categories to 'other'; now there are 3 categories

TRAIN:

Set 0 device.isMobile categories to 'other'; now there are 2 categories

TEST:

Set 0 device.isMobile categories to 'other'; now there are 2 categories

TRAIN:

Set 12 device.operatingSystem categories to 'other'; now there are 9 categories

TEST:

Set 15 device.operatingSystem categories to 'other'; now there are 8 categories

TRAIN:

Set 576 geoNetwork.city categories to 'other'; now there are 73 categories

TEST:

Set 653 geoNetwork.city categories to 'other'; now there are 79 categorie

In [16]:
for i in train_df.columns:    
    if len(train_df[i].unique()) <= 100:
        print(i + '\n')
        print(train_df[i].unique())
        print('\n')

channelGrouping

[3 6 5 0 1 2 7 4]


device.browser

[ 2  4 12  5  9  3  7  6  8 10 13  1]


device.deviceCategory

[0 1 2]


device.isMobile

[False  True]


device.operatingSystem

[7 4 3 0 2 1 5 6 8]


geoNetwork.city

[31 82 44 57 59 76 13  6 33  8 41  1 50 36 75 83 21 46  7 73 51 30 20 54
 79 55 28 81 84 42 63 70 74 58 72 52 18 11 80 39 78  3 71  5 62 49 66 25
 24  2 64 48 19 35 77 22 17 34 27 26  4 69 15 45 29 68 37  9 12 85 10 47
  0]


geoNetwork.continent

[2 4 3 1 0 5]


geoNetwork.country

[62  2 55 27 65 31 42  3 37 26 21  8 12 51  1 45 22 10 60 25 34 16 59 48
 39  6 54 11 29 44 23 35 41 66  4 32 15 57 64 58 46 43 24 69 56 50 40 47
 63 30 52 67  7 14 53 13  0 20 19  9 38 49 36 61 68 17 33  5 18]


geoNetwork.metro

[17  2 15  4 18  3 16 20 12 23  1  9 11 21 10  6 24  0 14]


geoNetwork.networkDomain

[64 33 66 17  5 12 79 63 67 22  6 41 35 82 30  0 36 32 43 37 11 48 29 47
 38 58 13 76 75  4 73 14 61 25 71 78 40 62 74  3 54 15 77 39 57 42 70 56
 34 81 83 18 21 49 50 31 27 44

In [18]:
#check again number of categories in each variable (eg column)
for c in train_df.columns:
    print(c, len(np.unique(train_df[c].astype(str))))

channelGrouping 8
fullVisitorId 714167
sessionId 902755
visitId 886303
visitNumber 384
visitStartTime 887159
device.browser 12
device.deviceCategory 3
device.isMobile 2
device.operatingSystem 9
geoNetwork.city 73
geoNetwork.continent 6
geoNetwork.country 69
geoNetwork.metro 19
geoNetwork.networkDomain 82
geoNetwork.region 61
geoNetwork.subContinent 20
totals.bounces 2
totals.hits 274
totals.newVisits 2
totals.pageviews 213
totals.transactionRevenue 5333
trafficSource.adContent 5
trafficSource.adwordsClickInfo.adNetworkType 3
trafficSource.adwordsClickInfo.gclId 17775
trafficSource.adwordsClickInfo.page 9
trafficSource.adwordsClickInfo.slot 3
trafficSource.campaign 5
trafficSource.isTrueDirect 2
trafficSource.keyword 8
trafficSource.medium 6
trafficSource.referralPath 31
trafficSource.source 23


### Drop meaningless columns

In [12]:
meaningless_columns =[]

def drop_meaningless_columns(df, meaningless_columns):
    df.drop(meaningless_columns, axis=1, inplace=True)
    return df

train_df = drop_meaningless_columns(train_df, meaningless_columns)
test_df = drop_meaningless_columns(test_df, meaningless_columns)
train_df.shape, test_df.shape

((903653, 33), (804684, 32))

In [None]:
"""

meaningless_columns = [
    'geoNetwork.city', 'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
    'totals.bounces',
    'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.gclId',
    'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.adwordsClickInfo.page', 'trafficSource.adwordsClickInfo.slot',
    'trafficSource.campaign', 'trafficSource.isTrueDirect', 'trafficSource.keyword', 'trafficSource.referralPath'   
]

#NO DROP VERSION
for i in meaningless_columns:
    if train_df[i].dtype == float:

        train_df[i].fillna(0, inplace=True)
        test_df[i].fillna(0, inplace=True)
    else:
        print(i)        
        train_df[i].fillna('(other)', inplace=True)
        test_df[i].fillna('(other)', inplace=True)    

def drop_meaningless_columns(df, meaningless_columns):
    df.drop(meaningless_columns, axis=1, inplace=True)
    return df

#train_df = drop_meaningless_columns(train_df, meaningless_columns)
#test_df = drop_meaningless_columns(test_df, meaningless_columns)
train_df.shape, test_df.shape

"""

###  Label encoder

In [13]:
from sklearn import preprocessing

for col in string_features:
    print(col)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
    train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
    test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))

channelGrouping
device.browser
device.deviceCategory
device.operatingSystem
geoNetwork.city
geoNetwork.continent
geoNetwork.country
geoNetwork.metro
geoNetwork.networkDomain
geoNetwork.region
geoNetwork.metro
geoNetwork.networkDomain
geoNetwork.region
trafficSource.adContent
trafficSource.adwordsClickInfo.adNetworkType
trafficSource.adwordsClickInfo.gclId
trafficSource.adwordsClickInfo.slot
trafficSource.campaign
trafficSource.referralPath
geoNetwork.subContinent
trafficSource.medium
trafficSource.source
trafficSource.keyword


### Export Dataframes

In [14]:
train_df.to_csv('preprocessed_train(3).csv', index = False)
test_df.to_csv('preprocessed_test(3).csv', index = False)