# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import json

## Import

In [2]:
%%time
train_frame = pd.read_csv("./input/train_flattened.csv")



Wall time: 3min 57s


## Examine Columns
#### Category
`category` is a pandas dtype that stores each distinct value once and represents the column as integer codes pointing at those values. This reduces the memory footprint and speeds up operations such as grouping and comparison. We want to examine the features and see which ones can be converted from the `object` dtype to `category`.

In [3]:
# Let pandas render up to 50 columns so the wide frame is shown in full.
pd.set_option("display.max_columns", 50)

# Preview the first eight rows of the training data.
train_frame.head(n=8)

Unnamed: 0,channelGrouping,customDimensions,date,fullVisitorId,hits,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactionRevenue,totals.transactions,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,3162355547410993243,"[{'hitNumber': '1', 'time': '0', 'hour': '17',...",1508198450,1,1508198450,Firefox,desktop,False,Windows,not available in demo dataset,Europe,Germany,not available in demo dataset,(not set),not available in demo dataset,Western Europe,1.0,1,1.0,1.0,1.0,,,,,,,,,,,(not set),,,water bottle,organic,,google
1,Referral,"[{'index': '4', 'value': 'North America'}]",20171016,8934116514970143966,"[{'hitNumber': '1', 'time': '0', 'hour': '10',...",1508176307,6,1508176307,Chrome,desktop,False,Chrome OS,Cupertino,Americas,United States,San Francisco-Oakland-San Jose CA,(not set),California,Northern America,,2,,2.0,2.0,28.0,,,,,,,,,,(not set),,,,referral,/a/google.com/transportation/mtv-services/bike...,sites.google.com
2,Direct,"[{'index': '4', 'value': 'North America'}]",20171016,7992466427990357681,"[{'hitNumber': '1', 'time': '0', 'hour': '17',...",1508201613,1,1508201613,Chrome,mobile,True,Android,not available in demo dataset,Americas,United States,not available in demo dataset,windjammercable.net,not available in demo dataset,Northern America,,2,1.0,2.0,1.0,38.0,,,,,,,,,,(not set),,True,,(none),,(direct)
3,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,9075655783635761930,"[{'hitNumber': '1', 'time': '0', 'hour': '9', ...",1508169851,1,1508169851,Chrome,desktop,False,Windows,not available in demo dataset,Asia,Turkey,not available in demo dataset,unknown.unknown,not available in demo dataset,Western Asia,,2,1.0,2.0,1.0,1.0,,,,,,,,,,(not set),,,(not provided),organic,,google
4,Organic Search,"[{'index': '4', 'value': 'Central America'}]",20171016,6960673291025684308,"[{'hitNumber': '1', 'time': '0', 'hour': '14',...",1508190552,1,1508190552,Chrome,desktop,False,Windows,not available in demo dataset,Americas,Mexico,not available in demo dataset,prod-infinitum.com.mx,not available in demo dataset,Central America,,2,1.0,2.0,1.0,52.0,,,,,,,,,,(not set),,,(not provided),organic,,google
5,Referral,"[{'index': '4', 'value': 'North America'}]",20171016,166277907528479249,"[{'hitNumber': '1', 'time': '0', 'hour': '16',...",1508196701,1,1508196701,Chrome,desktop,False,Macintosh,San Francisco,Americas,United States,San Francisco-Oakland-San Jose CA,unknown.unknown,California,Northern America,,2,1.0,2.0,2.0,12.0,,,,,,,,,,(not set),,,,(none),/offer/2145,(direct)
6,Referral,"[{'index': '4', 'value': 'EMEA'}]",20171016,8349655975937271469,"[{'hitNumber': '1', 'time': '0', 'hour': '4', ...",1508152478,1,1508152478,Chrome,desktop,False,Macintosh,London,Europe,United Kingdom,London,(not set),England,Northern Europe,,2,1.0,2.0,1.0,9.0,,,,,,,,,,(not set),,,,referral,/a/google.com/nest-vision/dropcam-field-tester...,sites.google.com
7,Organic Search,[],20171016,1332629902468998662,"[{'hitNumber': '1', 'time': '0', 'hour': '19',...",1508206208,1,1508206208,Chrome,desktop,False,Windows,not available in demo dataset,Europe,Denmark,not available in demo dataset,fullrate.ninja,not available in demo dataset,Northern Europe,,2,1.0,2.0,1.0,15.0,,,,,,,,,,(not set),,,(not provided),organic,,google


Looking at the data, it appears that several features can be converted to categories.

In [5]:
# Columns of the training frame to convert to the pandas ``category`` dtype.
cat_feats = [
    "channelGrouping",
    "device.browser",
    "device.deviceCategory",
    "device.operatingSystem",
    "geoNetwork.city",
    "geoNetwork.continent",
    "geoNetwork.country",
    "geoNetwork.metro",
    "geoNetwork.networkDomain",
    "geoNetwork.region",
    "geoNetwork.subContinent",
    "trafficSource.adContent",
    "trafficSource.adwordsClickInfo.adNetworkType",
    "trafficSource.adwordsClickInfo.gclId",
    "trafficSource.adwordsClickInfo.page",
    "trafficSource.adwordsClickInfo.slot",
    "trafficSource.campaign",
    "trafficSource.campaignCode",
    "trafficSource.keyword",
    "trafficSource.medium",
    "trafficSource.referralPath",
    "trafficSource.source",
    "trafficSource.adwordsClickInfo.isVideoAd",
    "trafficSource.isTrueDirect",
]

# Replace each listed column, one at a time, with its categorical version.
for column in cat_feats:
    train_frame[column] = train_frame[column].astype("category")
    

### customDimensions

In [6]:
# Show the first few raw values of the customDimensions column.
train_frame["customDimensions"].head()

0               [{'index': '4', 'value': 'EMEA'}]
1      [{'index': '4', 'value': 'North America'}]
2      [{'index': '4', 'value': 'North America'}]
3               [{'index': '4', 'value': 'EMEA'}]
4    [{'index': '4', 'value': 'Central America'}]
Name: customDimensions, dtype: object

Much of what is here is already visible in the geoNetwork features, and according to the dataset documentation it appears to be just browser-based session settings.

> **customDimensions** - This section contains any user-level or session-level custom dimensions that are set for a session. This is a repeated field and has an entry for each dimension that is set.

In [7]:
# Remove the customDimensions column from the training frame.
train_frame = train_frame.drop(columns=["customDimensions"])

### Hits

In [8]:
# Look at the raw "hits" value stored for the row labelled 0.
train_frame.loc[0, "hits"]

"[{'hitNumber': '1', 'time': '0', 'hour': '17', 'minute': '0', 'isInteraction': True, 'isEntrance': True, 'isExit': True, 'referer': 'https://www.google.co.uk/search?q=water+bottle&ie=utf-8&num=100&oe=utf-8&hl=en&gl=GB&uule=w+CAIQIFISCamRx0IRO1oCEXoliDJDoPjE&glp=1&gws_rd=cr&fg=1', 'page': {'pagePath': '/google+redesign/bags/water+bottles+and+tumblers', 'hostname': 'shop.googlemerchandisestore.com', 'pageTitle': 'Water Bottles & Tumblers | Drinkware | Google Merchandise Store', 'pagePathLevel1': '/google+redesign/', 'pagePathLevel2': '/bags/', 'pagePathLevel3': '/water+bottles+and+tumblers', 'pagePathLevel4': ''}, 'transaction': {'currencyCode': 'USD'}, 'item': {'currencyCode': 'USD'}, 'appInfo': {'screenName': 'shop.googlemerchandisestore.com/google+redesign/bags/water+bottles+and+tumblers', 'landingScreenName': 'shop.googlemerchandisestore.com/google+redesign/bags/water+bottles+and+tumblers', 'exitScreenName': 'shop.googlemerchandisestore.com/google+redesign/bags/water+bottles+and+tum

Wow, that's a ton of information! How much of it is useful in the time available? Probably not much, so we drop the column.

In [9]:
# Remove the hits column from the training frame.
train_frame = train_frame.drop(columns=["hits"])

## Equalize Test & Train Columns

In [11]:
# Load the flattened test sessions.
# fullVisitorId is an identifier, not a quantity: letting pandas infer a numeric
# dtype drops any leading zeros, so the column is read as text instead.
test_frame = pd.read_csv(
    "./input/test_flattened.csv",
    dtype={"fullVisitorId": str},
)

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
# Compare the column names of both frames as sets.
train_cols = set(train_frame.columns)
test_cols = set(test_frame.columns)

# Columns present in only one of the two frames.
eq_train = train_cols - test_cols
eq_test = test_cols - train_cols

print("Variables not in test but in train : ", eq_train)
print("Variables not in train but in test : ", eq_test)

Variables not in test but in train :  {'trafficSource.campaignCode'}
Variables not in train but in test :  {'customDimensions', 'hits'}


In [13]:
# Remove the columns that exist in only one frame so that train and test end up
# with the same set of columns.
# sorted() turns each set into a deterministic list of labels, and
# errors="ignore" keeps this cell safe to re-run after the columns are gone.
train_frame = train_frame.drop(columns=sorted(eq_train), errors="ignore")
test_frame = test_frame.drop(columns=sorted(eq_test), errors="ignore")

# Sanity check: both frames must now expose the same columns.
assert set(train_frame.columns) == set(test_frame.columns), "train/test columns still differ"

In [14]:
# Write both preprocessed frames to disk without the row index.
# CSV is plain text and carries no dtype information, so dtypes have to be
# re-applied by whoever reads these files.
for frame, path in (
    (train_frame, "./input/train_flat_pre.csv"),
    (test_frame, "./input/test_flat_pre.csv"),
):
    frame.to_csv(path, index=False)