In [24]:
import pandas as pd
import numpy as np

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Garbage collector
import gc
gc.enable()

In [25]:
# Import the dataframes without JSON features and useless features
train_raw_df = pd.read_csv("cleaned_train(2).csv",
    dtype={'date': str, 'fullVisitorId': str}, nrows=None)
test_raw_df = pd.read_csv("cleaned_test(2).csv",
    dtype={'date': str, 'fullVisitorId': str}, nrows=None)
train_raw_df.shape, test_raw_df.shape

((903653, 33), (804684, 32))

## First preprocessing

### Cast string categorical features to lower case

In [26]:
train_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 33 columns):
channelGrouping                                 903653 non-null object
fullVisitorId                                   903653 non-null object
sessionId                                       903653 non-null object
visitId                                         903653 non-null int64
visitNumber                                     903653 non-null int64
visitStartTime                                  903653 non-null int64
device.browser                                  903653 non-null object
device.deviceCategory                           903653 non-null object
device.isMobile                                 903653 non-null bool
device.operatingSystem                          903653 non-null object
geoNetwork.city                                 903653 non-null object
geoNetwork.continent                            903653 non-null object
geoNetwork.country                        

In [27]:
string_features = ['channelGrouping', 'device.browser', 'device.deviceCategory',
                  'device.operatingSystem', 'geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country',
                  'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
                  'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
                  'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.gclId',
                  'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign', 'trafficSource.referralPath',
                  'geoNetwork.subContinent','trafficSource.medium', 'trafficSource.source', 'trafficSource.keyword']

for col in string_features:
    train_raw_df[col] = train_raw_df[col].str.lower()
    test_raw_df[col] = test_raw_df[col].str.lower()

### NOTE: the next step is optional

### Inspect disguised nans

In [28]:
# check how many categories are present per column (before nans substitution)
for c in train_raw_df.columns:
    print(c, len(np.unique(train_raw_df[c].astype(str))))

channelGrouping 8
fullVisitorId 714167
sessionId 902755
visitId 886303
visitNumber 384
visitStartTime 887159
device.browser 54
device.deviceCategory 3
device.isMobile 2
device.operatingSystem 20
geoNetwork.city 649
geoNetwork.continent 6
geoNetwork.country 222
geoNetwork.metro 94
geoNetwork.networkDomain 28064
geoNetwork.region 376
geoNetwork.subContinent 23
totals.bounces 2
totals.hits 274
totals.newVisits 2
totals.pageviews 214
totals.transactionRevenue 5333
trafficSource.adContent 43
trafficSource.adwordsClickInfo.adNetworkType 3
trafficSource.adwordsClickInfo.gclId 17775
trafficSource.adwordsClickInfo.page 9
trafficSource.adwordsClickInfo.slot 3
trafficSource.campaign 10
trafficSource.isTrueDirect 2
trafficSource.keyword 3397
trafficSource.medium 7
trafficSource.referralPath 1474
trafficSource.source 380


In [29]:
from collections import Counter
Counter(train_raw_df['channelGrouping'])

Counter({'organic search': 381561,
         'referral': 104838,
         'paid search': 25326,
         'affiliates': 16403,
         'direct': 143026,
         'display': 6262,
         'social': 226117,
         '(other)': 120})

In [30]:
train_raw_df['channelGrouping'].unique()

array(['organic search', 'referral', 'paid search', 'affiliates',
       'direct', 'display', 'social', '(other)'], dtype=object)

### Convert disguised nans

In [31]:
nan_list = [
    "(not set)",
    "not available in demo dataset",
    "not.configured",
    "(not provided)",
    "unknown.unknown",
    "/",
    "(Other)",
    "(other)",
    "(none)"
]

nan_dict = {nl:np.nan for nl in nan_list}

# convert all "disguised" missing values to nans
def miss_to_nan(df):
    df.replace(nan_dict, inplace=True) # convert disguised NaNs
    df.dropna(axis=1, how='all', inplace=True)
    return df

In [32]:
'''
import datetime

# convert date from string to datetime
def date_conv(df):
    df['date'] = df['date'].apply(lambda x: datetime.date(int(str(x)[:4]), int(str(x)[4:6]), int(str(x)[6:])))
    return df

# convert all "disguised" missing values to nans
def miss_to_nan(df):
    df.replace(nan_dict, inplace=True) # convert disguised NaNs
    df.dropna(axis=1, how='all', inplace=True)
    return df

# exec the two previous functions
def first_preprocessing(df):
#     df = date_conv(df)
    df = miss_to_nan(df)
    return df
'''

'\nimport datetime\n\n# convert date from string to datetime\ndef date_conv(df):\n    df[\'date\'] = df[\'date\'].apply(lambda x: datetime.date(int(str(x)[:4]), int(str(x)[4:6]), int(str(x)[6:])))\n    return df\n\n# convert all "disguised" missing values to nans\ndef miss_to_nan(df):\n    df.replace(nan_dict, inplace=True) # convert disguised NaNs\n    df.dropna(axis=1, how=\'all\', inplace=True)\n    return df\n\n# exec the two previous functions\ndef first_preprocessing(df):\n#     df = date_conv(df)\n    df = miss_to_nan(df)\n    return df\n'

### Checkpoint

In [33]:
# apply the preprocessing up to this point and save into copies
train_df = miss_to_nan(train_raw_df)
test_df = miss_to_nan(test_raw_df)

### Fill nans with zeroes in target column

In [34]:
train_df['totals.transactionRevenue'].fillna(0, inplace=True)

### NOTE: the next step is optional

### Inspect data

In [35]:
# check the nan condition in totals.newVisits
from itertools import compress
sum(list(compress(train_df['visitNumber'] > 1, train_df['totals.newVisits'].isnull()))) == train_df['totals.newVisits'].isnull().sum()

True

In [11]:
# check the nan condition in totals.newVisits
print(sum(train_df['totals.newVisits'].isnull()))
sum(list(compress(train_df['totals.newVisits'].isnull(), train_df['visitNumber'] > 1)))

200593


200593

In [12]:
# check if there is difference between nan and (none) in df['trafficSource.medium']
print(sum(train_df['trafficSource.medium'].isnull()))
sum(list(compress(train_df['trafficSource.medium'].isnull(), train_df['totals.transactionRevenue'] == 0)))

### c'è un valore con revenue !!!!!

120


119

In [13]:
#same shit for trafficSource.source
print(sum(train_df['trafficSource.source'].isnull()))
sum(list(compress(train_df['trafficSource.source'].isnull(), train_df['totals.transactionRevenue'] == 0)))

69


69

In [14]:
train_df.isnull().sum()

channelGrouping                                      0
date                                                 0
fullVisitorId                                        0
sessionId                                            0
visitId                                              0
visitNumber                                          0
visitStartTime                                       0
device.browser                                       8
device.deviceCategory                                0
device.isMobile                                      0
device.operatingSystem                            4695
geoNetwork.city                                 542491
geoNetwork.continent                              1468
geoNetwork.country                                1468
geoNetwork.metro                                709995
geoNetwork.networkDomain                        390996
geoNetwork.region                               536056
geoNetwork.subContinent                           1468
totals.bou

In [15]:
#check again number of categories in each variable (eg column)
for c in train_df.columns:
    print(c, len(np.unique(train_df[c].astype(str))))

channelGrouping 8
date 366
fullVisitorId 714167
sessionId 902755
visitId 886303
visitNumber 384
visitStartTime 887159
device.browser 54
device.deviceCategory 3
device.isMobile 2
device.operatingSystem 20
geoNetwork.city 648
geoNetwork.continent 6
geoNetwork.country 222
geoNetwork.metro 93
geoNetwork.networkDomain 28062
geoNetwork.region 375
geoNetwork.subContinent 23
totals.bounces 2
totals.hits 274
totals.newVisits 2
totals.pageviews 214
totals.transactionRevenue 5333
trafficSource.adContent 45
trafficSource.adwordsClickInfo.adNetworkType 3
trafficSource.adwordsClickInfo.gclId 17775
trafficSource.adwordsClickInfo.isVideoAd 2
trafficSource.adwordsClickInfo.page 9
trafficSource.adwordsClickInfo.slot 3
trafficSource.campaign 10
trafficSource.isTrueDirect 2
trafficSource.keyword 3659
trafficSource.medium 7
trafficSource.referralPath 1475
trafficSource.source 380


In [36]:
for i in train_df.columns:    
    if len(train_df[i].unique()) <= 100:
        print(i + '\n')
        print(train_df[i].unique())
        print('\n')

channelGrouping

['Organic Search' 'Referral' 'Paid Search' 'Affiliates' 'Direct' 'Display'
 'Social' nan]


device.browser

['Chrome' 'Firefox' 'UC Browser' 'Internet Explorer' 'Safari' 'Edge'
 'Opera Mini' 'Opera' 'BlackBerry' 'Safari (in-app)' 'Coc Coc'
 'Mozilla Compatible Agent' 'ADM' 'MRCHROME' 'Amazon Silk' 'YaBrowser'
 'Android Webview' 'Puffin' 'Nokia Browser' 'Maxthon' 'Nintendo Browser'
 'Android Browser' 'Lunascape' 'IE with Chrome Frame' 'ThumbSniper'
 'LYF_LS_4002_12' 'Mozilla' 'osee2unifiedRelease' 'NokiaE52-1' 'Iron'
 '[Use default User-agent string] LIVRENPOCHE' 'LYF_LS_4002_11' 'M5'
 'Android Runtime' 'Apple-iPhone7C2' 'SeaMonkey' 'Konqueror' 'Seznam'
 'Changa 99695759' 'no-ua' 'MQQBrowser' 'Nichrome' 'HTC802t_TD'
 'DASH_JR_3G' 'DoCoMo' 'subjectAgent: NoticiasBoom' 'YE' 'User Agent' '0'
 'Hisense M20-M_LTE' 'Reddit' 'TCL P500M' 'CSM Click']


device.deviceCategory

['desktop' 'mobile' 'tablet']


device.isMobile

[False  True]


device.operatingSystem

['Windows' 'Mac

## Second preprocessing

### Missing values filling

In [36]:
def fill_nans(df):
    
    
    df['channelGrouping'].fillna('Organic Search', inplace=True) # few nans, a lot of "Organic Search"
    df['device.browser'].fillna('Chrome', inplace=True) # few nans, a lot of "Chrome"
    df['device.operatingSystem'].fillna('unknown', inplace=True) # no predominance of any value, quite a bit of nans, keep separated category
    df['totals.pageviews'].fillna(1, inplace=True) # many 1s, nans do not bring any revenue -> nans become 1s
    df['trafficSource.medium'].fillna('unknown', inplace=True) # seems to be the same
    df['trafficSource.source'].fillna('unknown', inplace = True) # seems to be the same
    
    
    df['geoNetwork.continent'].fillna('unknown', inplace=True)# keep separate category
    df['geoNetwork.country'].fillna('unknown', inplace=True)# keep separate category
    df['geoNetwork.subContinent'].fillna('unknown', inplace=True)# keep separate category

    
    # Features with huge amout of nans
    df['geoNetwork.city'].fillna('unknown', inplace=True)
    df['geoNetwork.metro'].fillna('unknown', inplace=True)
    df['geoNetwork.networkDomain'].fillna('unknown', inplace=True)
    df['geoNetwork.region'].fillna('unknown', inplace=True)
    df['totals.bounces'].fillna(0, inplace=True) # nan and 1
    df['trafficSource.adContent'].fillna('unknown', inplace=True)
    df['trafficSource.adwordsClickInfo.adNetworkType'].fillna('unknown', inplace=True)
    df['trafficSource.adwordsClickInfo.gclId'].fillna('unknown', inplace=True)
#     df['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True, inplace=True) # nan and False
    df['trafficSource.adwordsClickInfo.page'].fillna(0, inplace=True) # 0 is not cointained in the set
    df['trafficSource.adwordsClickInfo.slot'].fillna('unknown', inplace=True)
    df['trafficSource.campaign'].fillna('unknown', inplace=True)
    df['trafficSource.isTrueDirect'].fillna(False, inplace=True) # nan and True
    df['trafficSource.keyword'].fillna('unknown', inplace=True)
    df['trafficSource.referralPath'].fillna('unknown', inplace=True)
    df['totals.newVisits'].fillna(0, inplace=True)# totals.newVisits is always == nan when visitNumber > 1, we keep it for now but it will be dropped eventually

    return df

In [37]:
train_df = fill_nans(train_df)
test_df = fill_nans(test_df)

In [38]:
train_df.isnull().sum()

channelGrouping                                 0
fullVisitorId                                   0
sessionId                                       0
visitId                                         0
visitNumber                                     0
visitStartTime                                  0
device.browser                                  0
device.deviceCategory                           0
device.isMobile                                 0
device.operatingSystem                          0
geoNetwork.city                                 0
geoNetwork.continent                            0
geoNetwork.country                              0
geoNetwork.metro                                0
geoNetwork.networkDomain                        0
geoNetwork.region                               0
geoNetwork.subContinent                         0
totals.bounces                                  0
totals.hits                                     0
totals.newVisits                                0


### NOTE: the next step is optional (may improve performances but it has to be tested)

### Cleaning rare categories

In [39]:
# Actually we have no longer missing values, and we can manage the problem of the rare categories
# We group them in an overfeature
def clear_rare_categories(df, feature, limit = 900):

    vc = df[feature].value_counts()
    
    common = vc > limit
    common = set(common.index[common].values)
    print("Set", sum(vc <= limit), feature, "categories to 'other';", end=" ")
    
    df.loc[df[feature].map(lambda x: x not in common), feature] = 'other'
    print("now there are", df[feature].nunique(), "categories")
    
    return df

In [40]:
# to be chacked better!!!

# There are features that we don't want to modify
raw_features = [
"channelGrouping",
"device.browser",
"device.deviceCategory",
"device.isMobile",
"device.operatingSystem",
"geoNetwork.city",
"geoNetwork.continent",
"geoNetwork.country",
"geoNetwork.metro",
"geoNetwork.networkDomain",
"geoNetwork.region",
"geoNetwork.subContinent",
"totals.bounces",
"totals.hits",
"totals.newVisits",
"totals.pageviews",
"trafficSource.adContent",
"trafficSource.adwordsClickInfo.adNetworkType",
"trafficSource.adwordsClickInfo.gclId",
# "trafficSource.adwordsClickInfo.isVideoAd",
"trafficSource.adwordsClickInfo.page",
"trafficSource.adwordsClickInfo.slot",
"trafficSource.campaign",
"trafficSource.isTrueDirect",
"trafficSource.keyword",
"trafficSource.medium",
"trafficSource.referralPath",
"trafficSource.source"
]


for feat in raw_features:
    print("\nTRAIN:\n")
    train_df = clear_rare_categories(train_df, feat)
    print("\nTEST:\n")
    test_df = clear_rare_categories(test_df, feat)
    
#     if(train_df[feat].nunique() != test_df[feat].nunique()):
#         print("\n\n\nWARNING:\n")



TRAIN:

Set 1 channelGrouping categories to 'other'; now there are 8 categories

TEST:

Set 1 channelGrouping categories to 'other'; now there are 8 categories

TRAIN:

Set 43 device.browser categories to 'other'; now there are 12 categories

TEST:

Set 96 device.browser categories to 'other'; now there are 14 categories

TRAIN:

Set 0 device.deviceCategory categories to 'other'; now there are 3 categories

TEST:

Set 0 device.deviceCategory categories to 'other'; now there are 3 categories

TRAIN:

Set 0 device.isMobile categories to 'other'; now there are 2 categories

TEST:

Set 0 device.isMobile categories to 'other'; now there are 2 categories

TRAIN:

Set 12 device.operatingSystem categories to 'other'; now there are 9 categories

TEST:

Set 15 device.operatingSystem categories to 'other'; now there are 8 categories

TRAIN:

Set 576 geoNetwork.city categories to 'other'; now there are 73 categories

TEST:

Set 653 geoNetwork.city categories to 'other'; now there are 79 categorie

In [17]:
#check again number of categories in each variable (eg column)
for c in train_df.columns:
    print(c, len(np.unique(train_df[c].astype(str))))

channelGrouping 7
date 366
fullVisitorId 714167
sessionId 902755
visitId 886303
visitNumber 384
visitStartTime 887159
device.browser 12
device.deviceCategory 3
device.isMobile 2
device.operatingSystem 9
geoNetwork.city 73
geoNetwork.continent 6
geoNetwork.country 69
geoNetwork.metro 19
geoNetwork.networkDomain 82
geoNetwork.region 61
geoNetwork.subContinent 20
totals.bounces 2
totals.hits 34
totals.newVisits 2
totals.pageviews 29
totals.transactionRevenue 5333
trafficSource.adContent 5
trafficSource.adwordsClickInfo.adNetworkType 3
trafficSource.adwordsClickInfo.gclId 2
trafficSource.adwordsClickInfo.isVideoAd 2
trafficSource.adwordsClickInfo.page 3
trafficSource.adwordsClickInfo.slot 3
trafficSource.campaign 5
trafficSource.isTrueDirect 2
trafficSource.keyword 8
trafficSource.medium 6
trafficSource.referralPath 31
trafficSource.source 23


### Drop meaningless columns

In [41]:
meaningless_columns =[]

def drop_meaningless_columns(df, meaningless_columns):
    df.drop(meaningless_columns, axis=1, inplace=True)
    return df

train_df = drop_meaningless_columns(train_df, meaningless_columns)
test_df = drop_meaningless_columns(test_df, meaningless_columns)
train_df.shape, test_df.shape

((903653, 33), (804684, 32))

In [20]:
"""

meaningless_columns = [
    'geoNetwork.city', 'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
    'totals.bounces',
    'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.gclId',
    'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.adwordsClickInfo.page', 'trafficSource.adwordsClickInfo.slot',
    'trafficSource.campaign', 'trafficSource.isTrueDirect', 'trafficSource.keyword', 'trafficSource.referralPath'   
]

#NO DROP VERSION
for i in meaningless_columns:
    if train_df[i].dtype == float:

        train_df[i].fillna(0, inplace=True)
        test_df[i].fillna(0, inplace=True)
    else:
        print(i)        
        train_df[i].fillna('(other)', inplace=True)
        test_df[i].fillna('(other)', inplace=True)    

def drop_meaningless_columns(df, meaningless_columns):
    df.drop(meaningless_columns, axis=1, inplace=True)
    return df

#train_df = drop_meaningless_columns(train_df, meaningless_columns)
#test_df = drop_meaningless_columns(test_df, meaningless_columns)
train_df.shape, test_df.shape

"""

"\n\nmeaningless_columns = [\n    'geoNetwork.city', 'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',\n    'totals.bounces',\n    'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.gclId',\n    'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.adwordsClickInfo.page', 'trafficSource.adwordsClickInfo.slot',\n    'trafficSource.campaign', 'trafficSource.isTrueDirect', 'trafficSource.keyword', 'trafficSource.referralPath'   \n]\n\n#NO DROP VERSION\nfor i in meaningless_columns:\n    if train_df[i].dtype == float:\n\n        train_df[i].fillna(0, inplace=True)\n        test_df[i].fillna(0, inplace=True)\n    else:\n        print(i)        \n        train_df[i].fillna('(other)', inplace=True)\n        test_df[i].fillna('(other)', inplace=True)    \n\ndef drop_meaningless_columns(df, meaningless_columns):\n    df.drop(meaningless_columns, axis=1, inplace=True)\n    return df\n\n#train_df = drop_meaningless_co

###  Label encoder

In [42]:
from sklearn import preprocessing

for col in string_features:
    print(col)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
    train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
    test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))

channelGrouping
device.browser
device.deviceCategory
device.operatingSystem
geoNetwork.city
geoNetwork.continent
geoNetwork.country
geoNetwork.metro
geoNetwork.networkDomain
geoNetwork.region
geoNetwork.metro
geoNetwork.networkDomain
geoNetwork.region
trafficSource.adContent
trafficSource.adwordsClickInfo.adNetworkType
trafficSource.adwordsClickInfo.gclId
trafficSource.adwordsClickInfo.slot
trafficSource.campaign
trafficSource.referralPath
geoNetwork.subContinent
trafficSource.medium
trafficSource.source
trafficSource.keyword


### Export Dataframes

In [43]:
train_df.to_csv('preprocessed_train(3).csv', index = False)
test_df.to_csv('preprocessed_test(3).csv', index = False)