In [1]:
import pandas as pd
import numpy as np

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Garbage collector
import gc
gc.enable()

# https://www.kaggle.com/sudalairajkumar/simple-exploration-baseline-ga-customer-revenue

### Import dataframes

In [2]:
# Import the dataframes without JSON features
train_raw_df = pd.read_csv("toDF_train_v2(1).csv",
    dtype={'fullVisitorId': str}, nrows=None)
test_raw_df = pd.read_csv("toDF_test_v2(1).csv",
    dtype={'fullVisitorId': str}, nrows=None)
train_raw_df.shape, test_raw_df.shape

((1708337, 59), (401589, 58))

### Number of visitors and common visiotrs

In [3]:
print("Number of unique visitors in train set : ",train_raw_df.fullVisitorId.nunique(), " out of rows : ",train_raw_df.shape[0])
print("Number of unique visitors in test set : ",test_raw_df.fullVisitorId.nunique(), " out of rows : ",test_raw_df.shape[0])
print("Number of common visitors in train and test set : ",len(set(train_raw_df.fullVisitorId.unique()).intersection(set(test_raw_df.fullVisitorId.unique()))))

Number of unique visitors in train set :  1323730  out of rows :  1708337
Number of unique visitors in test set :  296530  out of rows :  401589
Number of common visitors in train and test set :  2759


### Columns with constant values

In [4]:
const_cols = [c for c in train_raw_df.columns if train_raw_df[c].nunique(dropna=False)==1]
const_cols

['socialEngagementType',
 'device_browserSize',
 'device_browserVersion',
 'device_flashVersion',
 'device_language',
 'device_mobileDeviceBranding',
 'device_mobileDeviceInfo',
 'device_mobileDeviceMarketingName',
 'device_mobileDeviceModel',
 'device_mobileInputSelector',
 'device_operatingSystemVersion',
 'device_screenColors',
 'device_screenResolution',
 'geoNetwork_cityId',
 'geoNetwork_latitude',
 'geoNetwork_longitude',
 'geoNetwork_networkLocation',
 'totals_visits',
 'trafficSource_adwordsClickInfo.criteriaParameters']

## Checkpoint

### Drop constant columns

In [5]:
def drop_constant_columns(df, features):
    df.drop(features, axis=1, inplace=True)
    return df

train_df = drop_constant_columns(train_raw_df, const_cols)
test_df = drop_constant_columns(test_raw_df, const_cols)
train_df.shape, test_df.shape

((1708337, 40), (401589, 39))

### Features differences between train and test

In [9]:
print("Variables not in test but in train : ", set(train_df.columns).difference(set(test_df.columns)))

Variables not in test but in train :  set()


### Drop trafficSource.campaignCode and date (from both)

In [10]:
train_df.drop(['trafficSource_campaignCode', 'date'], axis=1, inplace=True)
test_df.drop(['date'], axis=1, inplace=True)

KeyError: "['trafficSource_campaignCode' 'date'] not found in axis"

In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1708337 entries, 0 to 1708336
Data columns (total 38 columns):
channelGrouping                                 object
customDimensions                                object
fullVisitorId                                   object
visitId                                         int64
visitNumber                                     int64
visitStartTime                                  int64
device_browser                                  object
device_deviceCategory                           object
device_isMobile                                 bool
device_operatingSystem                          object
geoNetwork_city                                 object
geoNetwork_continent                            object
geoNetwork_country                              object
geoNetwork_metro                                object
geoNetwork_networkDomain                        object
geoNetwork_region                               object
geoNetwork_s

In [12]:
train_df.head()

Unnamed: 0,channelGrouping,customDimensions,fullVisitorId,visitId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,...,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source
0,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",3162355547410993243,1508198450,1,1508198450,Firefox,desktop,False,Windows,...,,,,,(not set),,water bottle,organic,,google
1,Referral,"[{'index': '4', 'value': 'North America'}]",8934116514970143966,1508176307,6,1508176307,Chrome,desktop,False,Chrome OS,...,,,,,(not set),,,referral,/a/google.com/transportation/mtv-services/bike...,sites.google.com
2,Direct,"[{'index': '4', 'value': 'North America'}]",7992466427990357681,1508201613,1,1508201613,Chrome,mobile,True,Android,...,,,,,(not set),True,,(none),,(direct)
3,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",9075655783635761930,1508169851,1,1508169851,Chrome,desktop,False,Windows,...,,,,,(not set),,(not provided),organic,,google
4,Organic Search,"[{'index': '4', 'value': 'Central America'}]",6960673291025684308,1508190552,1,1508190552,Chrome,desktop,False,Windows,...,,,,,(not set),,(not provided),organic,,google


In [13]:
test_df.head()

Unnamed: 0,channelGrouping,customDimensions,fullVisitorId,visitId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,...,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source
0,Organic Search,"[{'index': '4', 'value': 'APAC'}]",7460955084541987166,1526099341,2,1526099341,Chrome,mobile,True,Android,...,,,,,(not set),True,(not provided),organic,(not set),google
1,Direct,"[{'index': '4', 'value': 'North America'}]",460252456180441002,1526064483,166,1526064483,Chrome,desktop,False,Macintosh,...,,,,,(not set),True,(not set),(none),(not set),(direct)
2,Organic Search,"[{'index': '4', 'value': 'North America'}]",3461808543879602873,1526067157,2,1526067157,Chrome,desktop,False,Chrome OS,...,,,,,(not set),True,(not provided),organic,(not set),google
3,Direct,"[{'index': '4', 'value': 'North America'}]",975129477712150630,1526107551,4,1526107551,Chrome,mobile,True,iOS,...,,,,,(not set),True,(not set),(none),(not set),(direct)
4,Organic Search,"[{'index': '4', 'value': 'North America'}]",8381672768065729990,1526060254,1,1526060254,Internet Explorer,tablet,True,Windows,...,,,,,(not set),,(not provided),organic,(not set),google


### Export Dataframes

In [14]:
train_df.to_csv('cleaned_train_v2(2).csv', index = False)
test_df.to_csv('cleaned_test_v2(2).csv', index = False)