In [1]:
import pandas as pd
import numpy as np

# Random seed for reproducibility
seed = 202
np.random.seed(seed)  # seeds NumPy's global random generator

# Ignore warnings
# NOTE(review): the 'ignore' filter applies to every warning category, so
# pandas/NumPy deprecation warnings are hidden as well.
import warnings
warnings.simplefilter('ignore')

# Garbage collector
import gc
gc.enable()  # turn on automatic garbage collection

In [2]:
# Import the dataframes without JSON features and useless features
# 'date' and 'fullVisitorId' are forced to str; presumably so the long numeric
# visitor IDs are not parsed as numbers -- TODO confirm.
train_raw_df = pd.read_csv("cleaned_train_v2(2).csv",
    dtype={'date': str, 'fullVisitorId': str}, nrows=None)
test_raw_df = pd.read_csv("cleaned_test_v2(2).csv",
    dtype={'date': str, 'fullVisitorId': str}, nrows=None)
# Show both shapes as the cell output.
train_raw_df.shape, test_raw_df.shape

((1708337, 38), (401589, 38))

## First preprocessing

### NOTE: the next step is optional

### Inspect disguised nans

In [None]:
# Number of distinct values per column, with every value viewed as a string
# (run before the disguised NaNs are substituted).
for c in train_raw_df.columns:
    print(c, len(train_raw_df[c].astype(str).unique()))

In [None]:
from collections import Counter
# Tally how often each channelGrouping value occurs in the raw training frame.
Counter(train_raw_df['channelGrouping'])

In [None]:
# Distinct raw values found in the AdWords slot column.
train_raw_df['trafficSource.adwordsClickInfo.slot'].unique()

### Convert disguised nans

In [3]:
# Placeholder strings that the raw data uses where a value is actually missing.
nan_list = [
    "(not set)",
    "not available in demo dataset",
    "not.configured",
    "(not provided)",
    "unknown.unknown",
    "/",
    "(Other)",
    "(other)",
    "(none)",
    "nan"
]

# Mapping handed to DataFrame.replace: every placeholder becomes a real NaN.
nan_dict = {nl:np.nan for nl in nan_list}

# convert all "disguised" missing values to nans
def miss_to_nan(df):
    """Return a cleaned copy of ``df`` with disguised missing values as NaN.

    Every value listed in ``nan_dict`` is replaced by NaN, then columns that
    contain nothing but NaN are dropped.

    The input frame is NOT modified: the result is a new DataFrame. The
    previous implementation mutated ``df`` in place and returned that same
    object, so the "raw" frame and the "cleaned" frame were one and the same
    and the raw data could not be inspected or re-processed afterwards.

    Note that the all-NaN column drop is decided per frame, so two frames
    passed separately may end up with different column sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        New frame with placeholders converted and empty columns removed.
    """
    converted = df.replace(nan_dict)  # returns a new frame; df is untouched
    return converted.dropna(axis=1, how='all')

### Checkpoint

In [4]:
# Run the disguised-NaN conversion on both raw frames and bind the returned
# frames to the working names used from here on.
train_df, test_df = miss_to_nan(train_raw_df), miss_to_nan(test_raw_df)

### Fill nans with zeroes in target column

In [5]:
# A missing transaction revenue is treated as zero revenue.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on the selected column acts on a temporary object
# and leaves train_df unchanged once pandas Copy-on-Write is active.
train_df['totals.transactionRevenue'] = train_df['totals.transactionRevenue'].fillna(0)

### NOTE: the next step is optional

### Inspect data

In [None]:
# Sanity check on totals.newVisits: True when every row with a missing
# totals.newVisits also has visitNumber > 1.
from itertools import compress  # import retained so the name stays available to other cells
int((train_df['totals.newVisits'].isnull() & (train_df['visitNumber'] > 1)).sum()) == train_df['totals.newVisits'].isnull().sum()

In [None]:
# Print how many totals.newVisits values are missing, then show how many of the
# rows with visitNumber > 1 have it missing.
print(train_df['totals.newVisits'].isnull().sum())
int((train_df['totals.newVisits'].isnull() & (train_df['visitNumber'] > 1)).sum())

In [None]:
# Is a missing trafficSource.medium any different from "(none)"?
# Print the number of missing values, then show how many of the zero-revenue
# rows have it missing.
print(train_df['trafficSource.medium'].isnull().sum())
int((train_df['trafficSource.medium'].isnull() & (train_df['totals.transactionRevenue'] == 0)).sum())

### Note: there is one row with a missing `trafficSource.medium` that has revenue!

In [None]:
# Same check for trafficSource.source: missing values overall, then missing
# values among the zero-revenue rows.
print(train_df['trafficSource.source'].isnull().sum())
int((train_df['trafficSource.source'].isnull() & (train_df['totals.transactionRevenue'] == 0)).sum())

In [None]:
# Missing-value count for every column of train_df.
train_df.isnull().sum()

In [None]:
# Re-count the distinct values of each variable (i.e. column), with every
# value viewed as a string.
for c in train_df.columns:
    print(c, len(train_df[c].astype(str).unique()))

In [None]:
# For every column with at most 100 distinct values, print the column name
# followed by those values.
for i in train_df.columns:
    distinct = train_df[i].unique()
    if len(distinct) > 100:
        continue
    print(i + '\n')
    print(distinct)
    print('\n')

## Second preprocessing

### Missing values filling

In [6]:
def fill_nans(df):
    """Fill the remaining missing values of ``df`` with per-column defaults.

    Each column listed below gets a fixed replacement for its NaNs. The frame
    is modified in place and also returned, so both ``fill_nans(df)`` and
    ``df = fill_nans(df)`` keep working.

    Every column is filled with ``df[col] = df[col].fillna(value)`` rather than
    ``df[col].fillna(value, inplace=True)``: the in-place call acts on a
    temporary column selection and does not update ``df`` once pandas
    Copy-on-Write is active. Columns that are absent from ``df`` (for example
    because they were dropped earlier as entirely empty) are skipped instead
    of raising ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose NaNs should be filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filling.
    """
    fill_values = {
        'channelGrouping': 'Organic Search',  # few nans, a lot of "Organic Search"
        'device.browser': 'Chrome',  # few nans, a lot of "Chrome"
        'device.operatingSystem': 'unknown',  # no predominant value, quite a few nans: keep a separate category
        'totals.pageviews': 1,  # many 1s, nans do not bring any revenue -> nans become 1s
        'trafficSource.medium': 'unknown',  # seems to be the same
        'trafficSource.source': 'unknown',  # seems to be the same

        # keep a separate category
        'geoNetwork.continent': 'unknown',
        'geoNetwork.country': 'unknown',
        'geoNetwork.subContinent': 'unknown',

        # Features with a huge amount of nans
        'geoNetwork.city': 'unknown',
        'geoNetwork.metro': 'unknown',
        'geoNetwork.networkDomain': 'unknown',
        'geoNetwork.region': 'unknown',
        'totals.bounces': 0,  # nan and 1
        'trafficSource.adContent': 'unknown',
        'trafficSource.adwordsClickInfo.adNetworkType': 'unknown',
        'trafficSource.adwordsClickInfo.gclId': 'unknown',
        # 'trafficSource.adwordsClickInfo.isVideoAd' (nan and False) is deliberately not filled
        'trafficSource.adwordsClickInfo.page': 0,  # 0 is not contained in the set
        'trafficSource.adwordsClickInfo.slot': 'unknown',
        'trafficSource.campaign': 'unknown',
        'trafficSource.isTrueDirect': False,  # nan and True
        'trafficSource.keyword': 'unknown',
        'trafficSource.referralPath': 'unknown',
        # totals.newVisits is always nan when visitNumber > 1; kept for now,
        # it will be dropped eventually
        'totals.newVisits': 0,
    }

    for column, value in fill_values.items():
        if column in df.columns:
            # Assign back so the parent frame is really updated.
            df[column] = df[column].fillna(value)

    return df

In [7]:
# Fill the remaining NaNs of both working frames.
train_df, test_df = fill_nans(train_df), fill_nans(test_df)

In [8]:
# Missing-value count for every column of train_df.
train_df.isnull().sum()

channelGrouping                                 0
fullVisitorId                                   0
sessionId                                       0
visitId                                         0
visitNumber                                     0
visitStartTime                                  0
device.browser                                  0
device.deviceCategory                           0
device.isMobile                                 0
device.operatingSystem                          0
geoNetwork.city                                 0
geoNetwork.continent                            0
geoNetwork.country                              0
geoNetwork.metro                                0
geoNetwork.networkDomain                        0
geoNetwork.region                               0
geoNetwork.subContinent                         0
totals.bounces                                  0
totals.hits                                     0
totals.newVisits                                0


### Cast numerical features to float

In [None]:
# Disabled step: every line of this cast is commented out, so running the cell
# changes nothing.
# float_features = ['visitId', 'visitNumber', 'visitStartTime', 'totals.bounces', 'totals.hits',
#                  'totals.newVisits', 'totals.pageviews', 'trafficSource.adwordsClickInfo.page',
#                  'trafficSource.isTrueDirect']

# for col in float_features:
#     train_df[col] = train_df[col].astype(float)
#     test_df[col] = test_df[col].astype(float)

### Cast string categorical features to lower case

In [10]:
# Categorical columns that hold text and are normalised to lower case.
string_features = ['channelGrouping', 'sessionId', 'device.browser', 'device.deviceCategory',
                  'device.operatingSystem', 'geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country',
                  'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region', 'geoNetwork.subContinent',
                  'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.gclId',
                  'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign', 'trafficSource.referralPath',
                  'trafficSource.medium', 'trafficSource.source', 'trafficSource.keyword']

# Cast to str BEFORE lower-casing. With the reverse order, any element that is
# not a string (e.g. a numeric-looking keyword parsed as a number) becomes NaN
# under the .str accessor and is then stored as the literal text 'nan', and a
# column with a non-object dtype makes .str raise. One cast per frame is
# enough; the second astype(str) that used to follow was a no-op.
for col in string_features:
    train_df[col] = train_df[col].astype(str).str.lower()
    test_df[col] = test_df[col].astype(str).str.lower()

In [9]:
# Preview the first rows of train_df.
train_df.head()

Unnamed: 0,channelGrouping,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,...,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,1131660440785968503,1131660440785968503_1472830385,1472830385,1,1472830385,Chrome,desktop,False,Windows,...,unknown,unknown,0.0,unknown,unknown,False,unknown,organic,unknown,google
1,Organic Search,377306020877927890,377306020877927890_1472880147,1472880147,1,1472880147,Firefox,desktop,False,Macintosh,...,unknown,unknown,0.0,unknown,unknown,False,unknown,organic,unknown,google
2,Organic Search,3895546263509774583,3895546263509774583_1472865386,1472865386,1,1472865386,Chrome,desktop,False,Windows,...,unknown,unknown,0.0,unknown,unknown,False,unknown,organic,unknown,google
3,Organic Search,4763447161404445595,4763447161404445595_1472881213,1472881213,1,1472881213,UC Browser,desktop,False,Linux,...,unknown,unknown,0.0,unknown,unknown,False,google + online,organic,unknown,google
4,Organic Search,27294437909732085,27294437909732085_1472822600,1472822600,2,1472822600,Chrome,mobile,True,Android,...,unknown,unknown,0.0,unknown,unknown,True,unknown,organic,unknown,google


### NOTE: the next step is optional (it may improve performance, but this still has to be tested)

### Cleaning rare categories

In [None]:
# With the missing values already handled, rare categories are dealt with next:
# they are merged into one catch-all category.
def clear_rare_categories(df, feature, limit=900):
    """Replace infrequent values of one column with the label 'other'.

    A value is kept when it occurs more than ``limit`` times in
    ``df[feature]``; every other value in that column is overwritten with
    'other'. A short summary is printed. ``df`` is modified in place and
    returned.
    """
    counts = df[feature].value_counts()
    frequent = counts.index[counts > limit]
    print("Set", int((counts <= limit).sum()), feature, "categories to 'other';", end=" ")

    is_rare = ~df[feature].isin(frequent)
    df.loc[is_rare, feature] = 'other'
    print("now there are", df[feature].nunique(), "categories")

    return df

In [None]:
# TODO: this selection still has to be checked more carefully.

# Features whose rare categories are merged into 'other'.
raw_features = [
"channelGrouping",
"device.browser",
"device.deviceCategory",
"device.isMobile",
"device.operatingSystem",
"geoNetwork.city",
"geoNetwork.continent",
"geoNetwork.country",
"geoNetwork.metro",
"geoNetwork.networkDomain",
"geoNetwork.region",
"geoNetwork.subContinent",
"totals.bounces",
"totals.newVisits",
"trafficSource.adContent",
"trafficSource.adwordsClickInfo.adNetworkType",
# "trafficSource.adwordsClickInfo.isVideoAd",
"trafficSource.adwordsClickInfo.slot",
"trafficSource.campaign",
"trafficSource.isTrueDirect",
"trafficSource.keyword",
"trafficSource.medium",
"trafficSource.referralPath",
"trafficSource.source"
]

# Which categories count as "rare" is decided on the training frame only and
# then applied to the test frame. Thresholding each frame separately with the
# same absolute limit lets a category stay as-is in train while becoming
# 'other' in test (or the reverse), so the two frames would no longer share
# one encoding for that feature.
for feat in raw_features:
    print("\nTRAIN:\n")
    train_df = clear_rare_categories(train_df, feat)

    print("\nTEST:\n")
    kept_categories = train_df[feat].unique()  # includes 'other' when something was merged
    to_other = ~test_df[feat].isin(kept_categories)
    print("Set", test_df.loc[to_other, feat].nunique(), feat, "categories to 'other';", end=" ")
    test_df.loc[to_other, feat] = 'other'
    print("now there are", test_df[feat].nunique(), "categories")


In [None]:
# For every column with at most 100 distinct values, print the column name
# followed by those values.
for i in train_df.columns:
    distinct = train_df[i].unique()
    if len(distinct) > 100:
        continue
    print(i + '\n')
    print(distinct)
    print('\n')

In [None]:
# Re-count the distinct values of each variable (i.e. column), with every
# value viewed as a string.
for c in train_df.columns:
    print(c, len(train_df[c].astype(str).unique()))

### Drop meaningless columns

In [9]:
# Columns to discard; the list is currently empty.
meaningless_columns = []

def drop_meaningless_columns(df, meaningless_columns):
    """Remove the given columns from ``df`` in place and return that same frame."""
    df.drop(columns=meaningless_columns, inplace=True)
    return df

# Drop the selected columns from both frames, then show the resulting shapes.
train_df, test_df = (
    drop_meaningless_columns(train_df, meaningless_columns),
    drop_meaningless_columns(test_df, meaningless_columns),
)
train_df.shape, test_df.shape

((903653, 33), (804684, 32))

In [None]:
"""

meaningless_columns = [
    'geoNetwork.city', 'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
    'totals.bounces',
    'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.gclId',
    'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.adwordsClickInfo.page', 'trafficSource.adwordsClickInfo.slot',
    'trafficSource.campaign', 'trafficSource.isTrueDirect', 'trafficSource.keyword', 'trafficSource.referralPath'   
]

#NO DROP VERSION
for i in meaningless_columns:
    if train_df[i].dtype == float:

        train_df[i].fillna(0, inplace=True)
        test_df[i].fillna(0, inplace=True)
    else:
        print(i)        
        train_df[i].fillna('(other)', inplace=True)
        test_df[i].fillna('(other)', inplace=True)    

def drop_meaningless_columns(df, meaningless_columns):
    df.drop(meaningless_columns, axis=1, inplace=True)
    return df

#train_df = drop_meaningless_columns(train_df, meaningless_columns)
#test_df = drop_meaningless_columns(test_df, meaningless_columns)
train_df.shape, test_df.shape

"""

### NOTE: the next step is optional

###  Label encoder

In [None]:
from sklearn import preprocessing

# Integer-encode every string feature. Each encoder is fitted on the train and
# test values together, so both frames share one label mapping per column.
for col in string_features:
    print(col)
    train_values = list(train_df[col].values.astype('str'))
    test_values = list(test_df[col].values.astype('str'))
    lbl = preprocessing.LabelEncoder()
    lbl.fit(train_values + test_values)
    train_df[col] = lbl.transform(train_values)
    test_df[col] = lbl.transform(test_values)

channelGrouping
sessionId


### Export Dataframes

In [10]:
# Write both preprocessed frames to CSV, leaving the index out of the files.
train_df.to_csv('preprocessed_train(3).csv', index=False)
test_df.to_csv('preprocessed_test(3).csv', index=False)