In [1]:
import gc
import pandas as pd
import numpy as np

# Random seed for reproducibility
seed = 202
np.random.seed(seed)  # seeds NumPy's legacy global random generator

# Ignore warnings
# NOTE(review): this filter silences every warning category for the rest of the
# session, including deprecation warnings raised by pandas/numpy.
import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the pre-cleaned train/test CSVs (JSON features and useless features
# already removed). 'date' and 'fullVisitorId' are read as strings.
train_raw_df = pd.read_csv(
    "cleaned_train(2).csv",
    dtype={'date': str, 'fullVisitorId': str},
    nrows=None,
)
test_raw_df = pd.read_csv(
    "cleaned_test(2).csv",
    dtype={'date': str, 'fullVisitorId': str},
    nrows=None,
)
(train_raw_df.shape, test_raw_df.shape)

((903653, 34), (804684, 33))

### Inspect disguised NaNs

In [3]:
# Count the distinct values of every column (cast to str) before the NaN substitution
for col_name in train_raw_df.columns:
    n_distinct = len(np.unique(train_raw_df[col_name].astype(str)))
    print(col_name, n_distinct)

channelGrouping 8
date 366
fullVisitorId 714167
visitId 886303
visitNumber 384
visitStartTime 887159
device.browser 54
device.deviceCategory 3
device.isMobile 2
device.operatingSystem 20
geoNetwork.city 649
geoNetwork.continent 6
geoNetwork.country 222
geoNetwork.metro 94
geoNetwork.networkDomain 28064
geoNetwork.region 376
geoNetwork.subContinent 23
totals.bounces 2
totals.hits 274
totals.newVisits 2
totals.pageviews 214
totals.transactionRevenue 5333
trafficSource.adContent 45
trafficSource.adwordsClickInfo.adNetworkType 3
trafficSource.adwordsClickInfo.gclId 17775
trafficSource.adwordsClickInfo.isVideoAd 2
trafficSource.adwordsClickInfo.page 9
trafficSource.adwordsClickInfo.slot 3
trafficSource.campaign 10
trafficSource.isTrueDirect 2
trafficSource.keyword 3660
trafficSource.medium 7
trafficSource.referralPath 1476
trafficSource.source 380


In [8]:
# Show the distinct values found in the channelGrouping column
train_raw_df['channelGrouping'].unique()

array(['Organic Search', 'Referral', 'Paid Search', 'Affiliates',
       'Direct', 'Display', 'Social', '(Other)'], dtype=object)

In [9]:
from collections import Counter
# Count how many rows carry each channelGrouping value
Counter(train_raw_df['channelGrouping'])

Counter({'Organic Search': 381561,
         'Referral': 104838,
         'Paid Search': 25326,
         'Affiliates': 16403,
         'Direct': 143026,
         'Display': 6262,
         'Social': 226117,
         '(Other)': 120})

### Convert disguised NaNs and date to datetime

In [8]:
# Placeholder strings that are treated as "disguised" missing values
nan_list = [
    "(not set)",
    "not available in demo dataset",
    "not.configured",
    "(not provided)",
    "unknown.unknown",
    "/",
]

# Replacement table: every placeholder string maps to NaN
nan_dict = dict.fromkeys(nan_list, np.nan)

In [9]:
import datetime

# convert date from string to datetime
def date_conv(df):
    """Convert the ``date`` column from ``YYYYMMDD`` text to ``datetime.date``.

    The frame is modified in place and also returned. Values that are already
    ``datetime.date`` instances are passed through unchanged, so applying the
    function again to an already-converted frame (for example when the cell
    is re-run) is a no-op instead of raising ``ValueError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``date`` holding ``datetime.date`` values.

    Raises
    ------
    ValueError
        If an unconverted value cannot be split into a valid year/month/day.
    """
    def _to_date(value):
        # Already converted on a previous call: keep as is.
        if isinstance(value, datetime.date):
            return value
        text = str(value)
        # Fixed-width layout: 4 digits of year, 2 of month, remainder is the day.
        return datetime.date(int(text[:4]), int(text[4:6]), int(text[6:]))

    df['date'] = df['date'].apply(_to_date)
    return df

# convert all "disguised" missing values to nans
def miss_to_nan(df):
    """Turn placeholder strings into NaN and drop columns that end up empty.

    Operates in place on ``df`` and returns the same object.
    """
    # Substitute every key of the module-level ``nan_dict`` with its NaN value.
    df.replace(to_replace=nan_dict, inplace=True)
    # Discard the columns in which every entry is NaN.
    df.dropna(axis='columns', how='all', inplace=True)
    return df

# exec the two previous functions
def first_preprocessing(df):
    """Run the first preprocessing stage on a copy of ``df``.

    The stage applies ``date_conv`` followed by ``miss_to_nan``. Both helpers
    modify the frame they receive, so the input is copied first: the caller's
    raw frame stays untouched and the stage can be re-applied to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, preprocessed frame.
    """
    processed = df.copy()
    processed = date_conv(processed)
    processed = miss_to_nan(processed)
    return processed

### Checkpoint

In [10]:
# Run the first preprocessing stage on the train split, then on the test split
train_df, test_df = (
    first_preprocessing(raw_frame)
    for raw_frame in (train_raw_df, test_raw_df)
)

In [12]:
# fill nans with zeroes in target column
# The filled column is assigned back explicitly: fillna(inplace=True) on a
# selected column is chained assignment, which pandas deprecates and which
# leaves train_df unchanged when Copy-on-Write is active.
train_df['totals.transactionRevenue'] = train_df['totals.transactionRevenue'].fillna(0)

In [20]:
# check the nan condition in totals.newVisits:
# among the rows where totals.newVisits is NaN, count those with visitNumber > 1
# and compare with the total number of NaNs (True => every NaN row has visitNumber > 1)
from itertools import compress
sum(list(compress(train_df['visitNumber'] > 1, train_df['totals.newVisits'].isnull()))) == train_df['totals.newVisits'].isnull().sum()

True

In [21]:
# check the nan condition in totals.newVisits:
# print the total number of NaNs, then count the NaNs among rows with visitNumber > 1
print(sum(train_df['totals.newVisits'].isnull()))
sum(list(compress(train_df['totals.newVisits'].isnull(), train_df['visitNumber'] > 1)))

200593


200593

In [22]:
# check if there is difference between nan and (none) in df['trafficSource.medium']:
# print the total number of NaNs, then count the NaNs among rows with zero revenue
print(sum(train_df['trafficSource.medium'].isnull()))
sum(list(compress(train_df['trafficSource.medium'].isnull(), train_df['totals.transactionRevenue'] == 0)))

### there is one value with revenue!

120


119

In [23]:
# same check for trafficSource.source:
# print the total number of NaNs, then count the NaNs among rows with zero revenue
print(sum(train_df['trafficSource.source'].isnull()))
sum(list(compress(train_df['trafficSource.source'].isnull(), train_df['totals.transactionRevenue'] == 0)))

69


69

In [24]:
# Count the missing values in each column
train_df.isnull().sum()

channelGrouping                                      0
date                                                 0
fullVisitorId                                        0
visitId                                              0
visitNumber                                          0
visitStartTime                                       0
device.browser                                       8
device.deviceCategory                                0
device.isMobile                                      0
device.operatingSystem                            4695
geoNetwork.city                                 542491
geoNetwork.continent                              1468
geoNetwork.country                                1468
geoNetwork.metro                                709995
geoNetwork.networkDomain                        390996
geoNetwork.region                               536056
geoNetwork.subContinent                           1468
totals.bounces                                  453023
totals.hit

In [25]:
# Re-count the distinct values (cast to str) of each variable, i.e. column
for col_name in train_df.columns:
    n_distinct = len(np.unique(train_df[col_name].astype(str)))
    print(col_name, n_distinct)

channelGrouping 8
date 366
fullVisitorId 714167
visitId 886303
visitNumber 384
visitStartTime 887159
device.browser 54
device.deviceCategory 3
device.isMobile 2
device.operatingSystem 20
geoNetwork.city 648
geoNetwork.continent 6
geoNetwork.country 222
geoNetwork.metro 93
geoNetwork.networkDomain 28062
geoNetwork.region 375
geoNetwork.subContinent 23
totals.bounces 2
totals.hits 274
totals.newVisits 2
totals.pageviews 214
totals.transactionRevenue 5333
trafficSource.adContent 45
trafficSource.adwordsClickInfo.adNetworkType 3
trafficSource.adwordsClickInfo.gclId 17775
trafficSource.adwordsClickInfo.isVideoAd 2
trafficSource.adwordsClickInfo.page 9
trafficSource.adwordsClickInfo.slot 3
trafficSource.campaign 10
trafficSource.isTrueDirect 2
trafficSource.keyword 3659
trafficSource.medium 7
trafficSource.referralPath 1475
trafficSource.source 380


In [26]:
# Print the distinct values of every column that has at most 100 of them
for col_name in train_df.columns:
    distinct_values = train_df[col_name].unique()
    if len(distinct_values) > 100:
        continue
    print(col_name + '\n')
    print(distinct_values)
    print('\n')

channelGrouping

['Organic Search' 'Referral' 'Paid Search' 'Affiliates' 'Direct' 'Display'
 'Social' '(Other)']


device.browser

['Chrome' 'Firefox' 'UC Browser' 'Internet Explorer' 'Safari' 'Edge'
 'Opera Mini' 'Opera' 'BlackBerry' 'Safari (in-app)' 'Coc Coc'
 'Mozilla Compatible Agent' 'ADM' 'MRCHROME' 'Amazon Silk' 'YaBrowser'
 'Android Webview' 'Puffin' 'Nokia Browser' 'Maxthon' 'Nintendo Browser'
 'Android Browser' 'Lunascape' 'IE with Chrome Frame' 'ThumbSniper'
 'LYF_LS_4002_12' 'Mozilla' 'osee2unifiedRelease' 'NokiaE52-1' 'Iron'
 '[Use default User-agent string] LIVRENPOCHE' nan 'LYF_LS_4002_11' 'M5'
 'Android Runtime' 'Apple-iPhone7C2' 'SeaMonkey' 'Konqueror' 'Seznam'
 'Changa 99695759' 'no-ua' 'MQQBrowser' 'Nichrome' 'HTC802t_TD'
 'DASH_JR_3G' 'DoCoMo' 'subjectAgent: NoticiasBoom' 'YE' 'User Agent' '0'
 'Hisense M20-M_LTE' 'Reddit' 'TCL P500M' 'CSM Click']


device.deviceCategory

['desktop' 'mobile' 'tablet']


device.isMobile

[False  True]


device.operatingSystem

['Win

In [13]:
# # -> may need to be re-evaluated
# _ -> ok
# #? -> I don't remember

def fill_nans(df):
    """Impute the remaining NaNs column by column with fixed replacement values.

    Each filled column is assigned back to ``df`` explicitly. Calling
    ``fillna(inplace=True)`` on a selected column is chained assignment:
    pandas deprecates it, and with Copy-on-Write it leaves ``df`` unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all the columns listed below; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    KeyError
        If one of the listed columns is missing from ``df``.
    """
    fill_values = {
        # few nans, a lot of "Chrome"
        'device.browser': 'Chrome',
        # no predominance of any value, quite a bit of nans, keep separated category
        'device.operatingSystem': '(other)',
        # many 1s, nans do not bring any revenue -> nans become 1s
        'totals.pageviews': 1,
        # seems to be the same
        'trafficSource.medium': '(none)',
        # seems to be the same
        'trafficSource.source': '(other)',
        # keep separate category
        'geoNetwork.continent': '(other)',
        'geoNetwork.country': '(other)',
        'geoNetwork.subContinent': '(other)',
        # totals.newVisits is always == nan when visitNumber > 1,
        # we keep it for now but it will be dropped eventually
        'totals.newVisits': 0,
    }

    for column, value in fill_values.items():
        df[column] = df[column].fillna(value)

    # TODO: verify whether better assumptions can be made
    return df

In [14]:
# Impute the remaining NaNs in the train split, then in the test split
train_df, test_df = (fill_nans(frame) for frame in (train_df, test_df))

### Drop meaningless columns

In [31]:
# Count the missing values in each column
train_df.isnull().sum()

channelGrouping                                      0
date                                                 0
fullVisitorId                                        0
visitId                                              0
visitNumber                                          0
visitStartTime                                       0
device.browser                                       0
device.deviceCategory                                0
device.isMobile                                      0
device.operatingSystem                               0
geoNetwork.city                                 542491
geoNetwork.continent                                 0
geoNetwork.country                                   0
geoNetwork.metro                                709995
geoNetwork.networkDomain                        390996
geoNetwork.region                               536056
geoNetwork.subContinent                              0
totals.bounces                                  453023
totals.hit

In [16]:
# Columns removed from both splits before export
meaningless_columns = [
    # geoNetwork.*
    'geoNetwork.city',
    'geoNetwork.metro',
    'geoNetwork.networkDomain',
    'geoNetwork.region',
    # totals.*
    'totals.bounces',
    # trafficSource.*
    'trafficSource.adContent',
    'trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.gclId',
    'trafficSource.adwordsClickInfo.isVideoAd',
    'trafficSource.adwordsClickInfo.page',
    'trafficSource.adwordsClickInfo.slot',
    'trafficSource.campaign',
    'trafficSource.isTrueDirect',
    'trafficSource.keyword',
    'trafficSource.referralPath',
]

def drop_meaningless_columns(df, meaningless_columns):
    """Remove the given columns from ``df`` in place and return the same frame.

    Raises ``KeyError`` when any of the requested columns is not present.
    """
    df.drop(columns=meaningless_columns, inplace=True)
    return df

# Drop the listed columns from the train split, then from the test split, and show both shapes
train_df, test_df = (
    drop_meaningless_columns(frame, meaningless_columns)
    for frame in (train_df, test_df)
)
(train_df.shape, test_df.shape)

((903653, 19), (804684, 18))

In [34]:
# Count the missing values in each column
train_df.isnull().sum()

channelGrouping              0
date                         0
fullVisitorId                0
visitId                      0
visitNumber                  0
visitStartTime               0
device.browser               0
device.deviceCategory        0
device.isMobile              0
device.operatingSystem       0
geoNetwork.continent         0
geoNetwork.country           0
geoNetwork.subContinent      0
totals.hits                  0
totals.newVisits             0
totals.pageviews             0
totals.transactionRevenue    0
trafficSource.medium         0
trafficSource.source         0
dtype: int64

### Export Dataframes

In [18]:
# Write both preprocessed splits to CSV without the index column
for frame, csv_name in (
    (train_df, 'preprocessed_train(3).csv'),
    (test_df, 'preprocessed_test(3).csv'),
):
    frame.to_csv(csv_name, index=False)