In [1]:
import gc
import pandas as pd
import numpy as np

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

In [2]:
# Import the dataframes without JSON features and useless features
train_raw_df = pd.read_csv("cleaned_train(2).csv",
    dtype={'date': str, 'fullVisitorId': str}, nrows=None)
test_raw_df = pd.read_csv("cleaned_test(2).csv",
    dtype={'date': str, 'fullVisitorId': str}, nrows=None)
train_raw_df.shape, test_raw_df.shape

((903653, 34), (804684, 33))

### Inspect disguised Nans

In [3]:
train_raw_df

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,...,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20160902,1131660440785968503,1472830385,1,1472830385,Chrome,desktop,False,Windows,...,,,,,(not set),,(not provided),organic,,google
1,Organic Search,20160902,377306020877927890,1472880147,1,1472880147,Firefox,desktop,False,Macintosh,...,,,,,(not set),,(not provided),organic,,google
2,Organic Search,20160902,3895546263509774583,1472865386,1,1472865386,Chrome,desktop,False,Windows,...,,,,,(not set),,(not provided),organic,,google
3,Organic Search,20160902,4763447161404445595,1472881213,1,1472881213,UC Browser,desktop,False,Linux,...,,,,,(not set),,google + online,organic,,google
4,Organic Search,20160902,27294437909732085,1472822600,2,1472822600,Chrome,mobile,True,Android,...,,,,,(not set),True,(not provided),organic,,google
5,Organic Search,20160902,2938943183656635653,1472807194,1,1472807194,Chrome,desktop,False,Windows,...,,,,,(not set),,(not provided),organic,,google
6,Organic Search,20160902,1905672039242460897,1472817241,1,1472817241,Chrome,desktop,False,Windows,...,,,,,(not set),,(not provided),organic,,google
7,Organic Search,20160902,537222803633850821,1472812602,1,1472812602,Chrome,desktop,False,Windows,...,,,,,(not set),,(not provided),organic,,google
8,Organic Search,20160902,4445454811831400414,1472805784,1,1472805784,Internet Explorer,desktop,False,Windows,...,,,,,(not set),,(not provided),organic,,google
9,Organic Search,20160902,9499785259412240342,1472812272,1,1472812272,Firefox,desktop,False,Windows,...,,,,,(not set),,(not provided),organic,,google


In [4]:
train_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 34 columns):
channelGrouping                                 903653 non-null object
date                                            903653 non-null object
fullVisitorId                                   903653 non-null object
visitId                                         903653 non-null int64
visitNumber                                     903653 non-null int64
visitStartTime                                  903653 non-null int64
device.browser                                  903653 non-null object
device.deviceCategory                           903653 non-null object
device.isMobile                                 903653 non-null bool
device.operatingSystem                          903653 non-null object
geoNetwork.city                                 903653 non-null object
geoNetwork.continent                            903653 non-null object
geoNetwork.country                        

In [5]:
#check number of nans before substituting
train_raw_df.isnull().sum()

channelGrouping                                      0
date                                                 0
fullVisitorId                                        0
visitId                                              0
visitNumber                                          0
visitStartTime                                       0
device.browser                                       0
device.deviceCategory                                0
device.isMobile                                      0
device.operatingSystem                               0
geoNetwork.city                                      0
geoNetwork.continent                                 0
geoNetwork.country                                   0
geoNetwork.metro                                     0
geoNetwork.networkDomain                             0
geoNetwork.region                                    0
geoNetwork.subContinent                              0
totals.bounces                                       0
totals.hit

In [6]:
#check how many categories are present per column (before nans substitution)
for c in train_raw_df.columns:
    print(c, len(np.unique(train_raw_df[c].astype(str))))

channelGrouping 8
date 366
fullVisitorId 714167
visitId 886303
visitNumber 384
visitStartTime 887159
device.browser 54
device.deviceCategory 3
device.isMobile 2
device.operatingSystem 20
geoNetwork.city 649
geoNetwork.continent 6
geoNetwork.country 222
geoNetwork.metro 94
geoNetwork.networkDomain 28064
geoNetwork.region 376
geoNetwork.subContinent 23
totals.bounces 2
totals.hits 274
totals.newVisits 2
totals.pageviews 214
totals.transactionRevenue 5333
trafficSource.adContent 45
trafficSource.adwordsClickInfo.adNetworkType 3
trafficSource.adwordsClickInfo.gclId 17775
trafficSource.adwordsClickInfo.isVideoAd 2
trafficSource.adwordsClickInfo.page 9
trafficSource.adwordsClickInfo.slot 3
trafficSource.campaign 10
trafficSource.isTrueDirect 2
trafficSource.keyword 3660
trafficSource.medium 7
trafficSource.referralPath 1476
trafficSource.source 380


In [9]:
### TODO
# totals.bounces, totals.newVisits : i Nan sono 0
# trafficSource.adContent: ci sono valori categorici uguali scritti in maniera diversa ma frega cazzi tanto la deletiamo
# formattare ogni colonna a lower case e a string


# inserire parser fatto da riccardo

In [6]:
train_raw_df['fullVisitorId'].unique()

array(['1131660440785968503', '377306020877927890', '3895546263509774583',
       ..., '5123779100307500332', '7231728964973959842',
       '5744576632396406899'], dtype=object)

In [7]:
from collections import Counter
Counter(train_raw_df['channelGrouping'])

Counter({'Organic Search': 381561,
         'Referral': 104838,
         'Paid Search': 25326,
         'Affiliates': 16403,
         'Direct': 143026,
         'Display': 6262,
         'Social': 226117,
         '(Other)': 120})

### Convert disguised Nans and date to datetime

In [9]:
nan_list = [
    "(not set)",
    "not available in demo dataset",
    "not.configured",
    "(not provided)",
    "unknown.unknown",
    "/"
]

nan_dict = {nl:np.nan for nl in nan_list}

In [10]:
import datetime

#convert date from str to datetime
def date_conv(df):
    df['date'] = df['date'].apply(lambda x: datetime.date(int(str(x)[:4]), int(str(x)[4:6]), int(str(x)[6:])))
    return df

#convert all "disguised" missing values to nans
def miss_to_nan(df):
    df.replace(nan_dict, inplace=True) # convert disguised NaNs
    df.dropna(axis=1, how='all', inplace=True)
    return df

#exec the two previous functions
def first_preprocessing(df):
    df = date_conv(df)
    df = miss_to_nan(df)
    return df

### Checkpoint

In [11]:
#apply the preprocessing up to this point and save into copies
train_df = first_preprocessing(train_raw_df)
test_df = first_preprocessing(test_raw_df)

In [35]:
#fill nans with zeroes in target column
train_df['totals.transactionRevenue'].fillna(0, inplace=True)

In [36]:
#check the nan condition in totals.newVisits
from itertools import compress
sum(i == True for i in list(compress(train_df['visitNumber'] > 1, train_df['totals.newVisits'].isnull()))) == train_df['totals.newVisits'].isnull().sum()

True

In [40]:
#check if there is difference between nan and (none) in df['trafficSource.medium']
print(sum(train_df['trafficSource.medium'].isnull()))
sum(i == True for i in list(compress(train_df['trafficSource.medium'].isnull(), train_df['totals.transactionRevenue'] != 0)))

120


120

In [None]:
#same shit for trafficSource.source
print(sum(train_df['trafficSource.source'].isnull()))
sum(i == True for i in list(compress(train_df['trafficSource.medium'].isnull(), train_df['totals.transactionRevenue'] != 0)))

In [45]:
train_df.isnull().sum()

channelGrouping                                      0
date                                                 0
fullVisitorId                                        0
visitId                                              0
visitNumber                                          0
visitStartTime                                       0
device.browser                                       0
device.deviceCategory                                0
device.isMobile                                      0
device.operatingSystem                               0
geoNetwork.city                                 542491
geoNetwork.continent                                 0
geoNetwork.country                                   0
geoNetwork.metro                                709995
geoNetwork.networkDomain                        390996
geoNetwork.region                               536056
geoNetwork.subContinent                              0
totals.bounces                                       0
totals.hit

In [14]:
#check again number of categories in each variable (eg column)
for c in train_df.columns:
    print(c, len(np.unique(train_df[c].astype(str))))

channelGrouping 8
date 366
fullVisitorId 714167
visitId 886303
visitNumber 384
visitStartTime 887159
device.browser 54
device.deviceCategory 3
device.isMobile 2
device.operatingSystem 20
geoNetwork.city 648
geoNetwork.continent 6
geoNetwork.country 222
geoNetwork.metro 93
geoNetwork.networkDomain 28062
geoNetwork.region 375
geoNetwork.subContinent 23
totals.bounces 2
totals.hits 274
totals.newVisits 2
totals.pageviews 214
totals.transactionRevenue 5333
trafficSource.adContent 45
trafficSource.adwordsClickInfo.adNetworkType 3
trafficSource.adwordsClickInfo.gclId 17775
trafficSource.adwordsClickInfo.isVideoAd 2
trafficSource.adwordsClickInfo.page 9
trafficSource.adwordsClickInfo.slot 3
trafficSource.campaign 10
trafficSource.isTrueDirect 2
trafficSource.keyword 3659
trafficSource.medium 7
trafficSource.referralPath 1475
trafficSource.source 380


In [46]:
for i in train_df.columns:    
    if len(train_df[i].unique()) <= 400:
        print(i + '\n')
        print(train_df[i].unique())
        print('\n')

channelGrouping

['Organic Search' 'Referral' 'Paid Search' 'Affiliates' 'Direct' 'Display'
 'Social' '(Other)']


date

[datetime.date(2016, 9, 2) datetime.date(2017, 1, 26)
 datetime.date(2017, 6, 23) datetime.date(2017, 3, 12)
 datetime.date(2017, 2, 3) datetime.date(2016, 8, 11)
 datetime.date(2017, 6, 13) datetime.date(2017, 1, 13)
 datetime.date(2016, 11, 18) datetime.date(2016, 11, 10)
 datetime.date(2016, 11, 12) datetime.date(2017, 6, 21)
 datetime.date(2016, 11, 23) datetime.date(2017, 5, 1)
 datetime.date(2017, 6, 11) datetime.date(2017, 6, 30)
 datetime.date(2016, 12, 13) datetime.date(2017, 5, 19)
 datetime.date(2017, 1, 11) datetime.date(2017, 6, 24)
 datetime.date(2016, 10, 14) datetime.date(2017, 5, 29)
 datetime.date(2017, 5, 15) datetime.date(2017, 4, 20)
 datetime.date(2016, 8, 31) datetime.date(2017, 8, 1)
 datetime.date(2017, 2, 28) datetime.date(2016, 12, 9)
 datetime.date(2016, 9, 17) datetime.date(2017, 6, 17)
 datetime.date(2016, 12, 16) datetime.date(2016, 8, 

visitNumber

[  1   2   3   5  11   4  57   6  56   7  20   8  15   9  25  24  14  89
 136  13  85  10 105  22  21  48  18  26  17  83  38  84  27  42 100  31
  16  30  59  50  19  51  29 160  52  12  63  23  47  49  28  34 178  88
  33 317  93 140  35  41  91  92  74 309  43  61  46  45  32 108  86 195
  36 236  72 162 235  90  76  77  94  96  73  78  97  87 304 106 107 389
  68  98 132  58 266 267 221  39 121 143 142 174 175 189 191  55 190 104
 245 204  37  82 137 206 101  62 110 156 158 157 159 138 109 194 193 147
 254  71  53 144 145 146  80 253  99 134 135 102  44 130  95  75 103 315
  70  69 117 163 154 153 283  65 262 263 295 297 298 219  64 296 155 141
 169 170  60 207  54 187 133 118 264  67  66 126  40 122 325 326 197 167
 196 188 279 185 230 161 186 278 139 234 233 168 115 114 119 150 149 148
  79 306 123 131 171 124 129 258 112 218 183 199 261 259 260 200 220 152
 151 111 202 203 205 173 125 198 280 281 177 182 127 120 116  81 373 113
 212 213 226 225 348 224 128 176 269 2


[   1.    2.    3.    4.    5.    6.    7.    8.    9.   10.   11.   12.
   13.   14.   15.   16.   17.   18.   20.   19.   21.   25.   22.   26.
   23.   24.   30.   27.   29.   28.   32.   31.   34.   33.   38.   35.
   40.   36.   37.   39.   43.   45.   47.   52.   44.   50.   46.   49.
   51.   56.   59.   71.   57.   82.   84.   92.   90.   41.   79.  138.
   48.   69.  112.   42.   65.   60.   63.   68.   58.   74.  165.   53.
   54.   55.   61.   70.   80.   78.   72.  188.   73.   62.   75.   86.
  197.  148.   67.  102.   64.  103.  144.   81.  119.  104.  128.   87.
  111.   66.   83.  135.   94.   91.   99.   77.  118.  202.  127.   93.
  106.  155.  117.   76.   89.   85.  143.  105.  115.   95.  110.  137.
  120.  116.  208.  126.  400.  101.  108.  333.   97.   96.   98.  125.
  113.  327.  154.  121.  183.  123.  141.  224.   88.  195.  343.  156.
  193.  100.  150.  122.  145.  174.  147.  131.  169.  164.  249.  114.
  189.  223.  124.  139.  341.  469.  309.  162.  

In [47]:
for i in train_df['trafficSource.source'].unique():
    print(sum(train_df['trafficSource.source'] == i))

400788
3356
260
2983
1530
2097
16172
16411
143028
66416
4669
55
1813
6
1457
83
528
175
24
5686
388
277
3
222
49
212602
795
2296
1546
3365
2022
356
11
419
742
1480
1529
10
207
142
146
143
3
57
17
46
524
181
47
5
97
3
101
1063
22
110
185
1025
95
364
3
270
94
13
194
2
20
5
61
85
148
26
80
1
4
23
35
6
2
7
1
27
2
1
14
52
37
49
6
9
3
38
1
23
1
29
10
4
25
23
1
18
3
45
1
8
41
12
56
6
4
3
6
37
22
11
80
28
4
7
4
7
13
14
50
3
14
3
3
0
34
22
8
1
2
6
5
30
7
19
104
19
35
2
5
26
17
19
2
2
4
1
9
24
18
7
5
5
2
126
10
31
12
12
45
5
7
33
4
2
4
6
7
39
2
23
14
5
2
1
1
5
1
3
2
18
4
1
6
85
2
2
9
6
1
13
11
5
15
1
2
3
1
11
8
3
5
7
2
8
1
2
1
6
3
6
1
3
2
4
1
3
4
9
1
1
1
8
1
10
2
2
1
1
1
3
30
1
1
1
2
4
3
3
1
1
1
3
3
1
5
8
4
2
1
3
4
1
1
1
1
2
3
2
1
1
1
1
2
1
1
5
1
2
1
1
7
1
1
7
1
1
1
1
2
3
1
1
1
3
2
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
5
1
1
1
3
1
3
1
5
1
1
1
2
1
1
1
2
1
1
1
1
2
1
1
1
1
2
1
1
1
1
1
2
2
1
1
1
2
1
1
2
1
2
1
2
1
2
1
1
1
1
1
4
1
1
1
1
1
1
1
1
1
1
1
1
1
3
1
1
1
1
1


In [41]:
# # -> possibile da rivalutare
# _ -> ok
# #? -> non mi ricordo

def fill_nans(df):
    
    
    df['device.browser'].fillna('Chrome', inplace=True) #few nans, a lot of "Chrome"
    df['device.operatingSystem'].fillna('(Other)', inplace=True) #no predominance of any value, quite a bit of nans, keep separated category
    df['totals.pageviews'].fillna(1, inplace=True) #many 1s, nans do not bring any revenue -> nans become 1s
    df['trafficSource.medium'].fillna('(none)', inplace=True) #seems to be the same
    df['trafficSource.source'].fillna('other', inplace = True) #seems to be the same
    
    df['geoNetwork.continent'].fillna('(Other)', inplace=True)# keep separate category
    df['geoNetwork.country'].fillna('(Other)', inplace=True)# keep separate category
    df['geoNetwork.subContinent'].fillna('(Other)', inplace=True)# keep separate category

    
    df['totals.newVisits'].fillna(0, inplace=True)# totals.newVisits is always == nan when visitNumber > 1, we keep it for now but it will be dropped eventually
    return df

In [44]:
train_df = fill_nans(train_df)