In [2]:
import pandas as pd
import numpy as np

# Suppress every warning raised from this point on
import warnings
warnings.simplefilter('ignore')

# Seed NumPy's global random generator so reruns draw the same numbers
seed = 202
np.random.seed(seed)

# https://www.kaggle.com/sudalairajkumar/simple-exploration-baseline-ga-customer-revenue

### Import dataframes

In [3]:
# Read the train and test CSVs (the versions without JSON features).
# fullVisitorId is forced to str - presumably so the long numeric IDs are not
# parsed as numbers (TODO confirm); nrows=None reads every row.
read_options = {'dtype': {'fullVisitorId': str}, 'nrows': None}
train_raw_df = pd.read_csv("toDF_train(1).csv", **read_options)
test_raw_df = pd.read_csv("toDF_test(1).csv", **read_options)
train_raw_df.shape, test_raw_df.shape

((903653, 55), (804684, 53))

### Number of visitors and common visitors

In [4]:
# Distinct visitor IDs per split, and the IDs that occur in both splits
train_ids = train_raw_df["fullVisitorId"]
test_ids = test_raw_df["fullVisitorId"]
shared_ids = set(train_ids.unique()) & set(test_ids.unique())

print("Number of unique visitors in train set : ", train_ids.nunique(), " out of rows : ", len(train_raw_df))
print("Number of unique visitors in test set : ", test_ids.nunique(), " out of rows : ", len(test_raw_df))
print("Number of common visitors in train and test set : ", len(shared_ids))

Number of unique visitors in train set :  714167  out of rows :  903653
Number of unique visitors in test set :  617242  out of rows :  804684
Number of common visitors in train and test set :  7679


### Columns with constant values

In [5]:
# Count distinct values per train column (NaN counts as a value) and keep the
# names of the columns that hold exactly one distinct value.
unique_counts = train_raw_df.nunique(dropna=False)
const_cols = unique_counts.index[unique_counts == 1].tolist()
const_cols

['socialEngagementType',
 'device.browserSize',
 'device.browserVersion',
 'device.flashVersion',
 'device.language',
 'device.mobileDeviceBranding',
 'device.mobileDeviceInfo',
 'device.mobileDeviceMarketingName',
 'device.mobileDeviceModel',
 'device.mobileInputSelector',
 'device.operatingSystemVersion',
 'device.screenColors',
 'device.screenResolution',
 'geoNetwork.cityId',
 'geoNetwork.latitude',
 'geoNetwork.longitude',
 'geoNetwork.networkLocation',
 'totals.visits',
 'trafficSource.adwordsClickInfo.criteriaParameters']

## Checkpoint

### Drop constant columns

In [6]:
def drop_constant_columns(df, features):
    """Return a copy of ``df`` without the columns listed in ``features``.

    The input frame is left untouched. The previous in-place version modified
    the caller's frame, so the "raw" and "cleaned" names pointed at the same
    object and running the cell a second time raised ``KeyError`` because the
    columns were already gone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove columns from.
    features : list-like of str
        Column labels to drop; each one must exist in ``df``.

    Returns
    -------
    pandas.DataFrame
        A new frame containing every column of ``df`` except ``features``.

    Raises
    ------
    KeyError
        If any label in ``features`` is not a column of ``df``.
    """
    # drop() without inplace=True builds and returns a new DataFrame
    return df.drop(columns=features)

# Remove the constant columns (found on the train set) from both frames,
# then display the resulting shapes.
train_df, test_df = [
    drop_constant_columns(raw_frame, const_cols)
    for raw_frame in (train_raw_df, test_raw_df)
]
train_df.shape, test_df.shape

((903653, 36), (804684, 34))

### Feature differences between train and test

In [7]:
# Column names present in train_df but absent from test_df
train_only_columns = set(train_df.columns) - set(test_df.columns)
print("Variables not in test but in train : ", train_only_columns)

Variables not in test but in train :  {'totals.transactionRevenue', 'trafficSource.campaignCode'}


### Drop trafficSource.campaignCode from train_df

In [8]:
# Remove the train-only 'trafficSource.campaignCode' column from train_df.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for the already-missing column.
train_df = train_df.drop(columns=['trafficSource.campaignCode'], errors='ignore')

# Variant that also dropped the session identifiers (kept for reference, not executed):
# train_df.drop(['trafficSource.campaignCode', 'sessionId', 'visitId'], axis=1, inplace=True)
# test_df.drop(['sessionId', 'visitId'], axis=1, inplace=True)

In [9]:
# Print the column names, non-null counts and dtypes of the cleaned train frame
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 35 columns):
channelGrouping                                 903653 non-null object
date                                            903653 non-null int64
fullVisitorId                                   903653 non-null object
sessionId                                       903653 non-null object
visitId                                         903653 non-null int64
visitNumber                                     903653 non-null int64
visitStartTime                                  903653 non-null int64
device.browser                                  903653 non-null object
device.deviceCategory                           903653 non-null object
device.isMobile                                 903653 non-null bool
device.operatingSystem                          903653 non-null object
geoNetwork.city                                 903653 non-null object
geoNetwork.continent                       

In [10]:
# Preview the first five rows of the cleaned train frame
train_df.head(n=5)

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,...,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,1472830385,1,1472830385,Chrome,desktop,False,...,,,,,(not set),,(not provided),organic,,google
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,1472880147,1,1472880147,Firefox,desktop,False,...,,,,,(not set),,(not provided),organic,,google
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,1472865386,1,1472865386,Chrome,desktop,False,...,,,,,(not set),,(not provided),organic,,google
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,1472881213,1,1472881213,UC Browser,desktop,False,...,,,,,(not set),,google + online,organic,,google
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,1472822600,2,1472822600,Chrome,mobile,True,...,,,,,(not set),True,(not provided),organic,,google


In [11]:
# Preview the first five rows of the cleaned test frame
test_df.head(n=5)

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,...,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20171016,6167871330617112363,6167871330617112363_1508151024,1508151024,2,1508151024,Chrome,desktop,False,...,,,,,(not set),True,(not provided),organic,,google
1,Organic Search,20171016,643697640977915618,0643697640977915618_1508175522,1508175522,1,1508175522,Chrome,desktop,False,...,,,,,(not set),,(not provided),organic,,google
2,Organic Search,20171016,6059383810968229466,6059383810968229466_1508143220,1508143220,1,1508143220,Chrome,desktop,False,...,,,,,(not set),,(not provided),organic,,google
3,Organic Search,20171016,2376720078563423631,2376720078563423631_1508193530,1508193530,1,1508193530,Safari,mobile,True,...,,,,,(not set),,(not provided),organic,,google
4,Organic Search,20171016,2314544520795440038,2314544520795440038_1508217442,1508217442,1,1508217442,Safari,desktop,False,...,,,,,(not set),,(not provided),organic,,google


### Export Dataframes

In [12]:
# Write both cleaned frames to CSV, leaving out the row index
for frame, out_path in (
    (train_df, 'cleaned_train(2).csv'),
    (test_df, 'cleaned_test(2).csv'),
):
    frame.to_csv(out_path, index=False)