In [1]:
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import os

In [2]:
def load_df(csv_path='../input/train.csv', nrows=None):
    JSON_COLUMNS = [ 'device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [3]:
training_sample_dataset = pd.read_csv("C:\\Sridhar\\AI_ML\\Algorithms\\datasets\\ga-customer-revenue-prediction\\training_sample.csv")

In [4]:
training_sample_dataset.shape

(40000, 14)

In [5]:
test_sample_dataset = pd.read_csv("C:\\Sridhar\\AI_ML\\Algorithms\\datasets\\ga-customer-revenue-prediction\\test_sample.csv")

In [6]:
test_sample_dataset.shape

(40000, 14)

In [7]:
 def memoryManagement(base_dataset):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    df=base_dataset
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))    
    return df

In [8]:
training_sample = memoryManagement(training_sample_dataset)

Memory usage of dataframe is 4.27 MB
Memory usage after optimization is: 5.15 MB
Decreased by -20.5%


In [9]:
test_sample = memoryManagement(test_sample_dataset)

Memory usage of dataframe is 4.27 MB
Memory usage after optimization is: 3.93 MB
Decreased by 7.9%


In [46]:
training_sample_dataset_1 = training_sample_dataset[0:20000]
test_sample_dataset_1 = test_sample_dataset[0:20000]

In [29]:
training_sample = training_sample_dataset[0:20000]
test_sample = test_sample[0:20000]

In [35]:
training_sample_dataset_1.to_csv("C:\\Sridhar\\AI_ML\\Algorithms\\datasets\\ga-customer-revenue-prediction\\df_training.csv")

In [36]:
training_sample_dataset_1 = load_df("C:\\Sridhar\\AI_ML\\Algorithms\\datasets\\ga-customer-revenue-prediction\\df_training.csv")

Loaded df_training.csv. Shape: (20000, 58)


In [32]:
training_sample_dataset_1.head()

Unnamed: 0.1,Unnamed: 0,channelGrouping,customDimensions,date,device,fullVisitorId,geoNetwork,hits,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,0,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,"{""browser"": ""Firefox"", ""browserVersion"": ""not ...",3162355547410993243,"{""continent"": ""Europe"", ""subContinent"": ""Weste...","[{'hitNumber': '1', 'time': '0', 'hour': '17',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508198450,1,1508198450
1,1,Referral,"[{'index': '4', 'value': 'North America'}]",20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",8934116514970143966,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '10',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""referralPath"": ""/a/google.com/transportation...",1508176307,6,1508176307
2,2,Direct,"[{'index': '4', 'value': 'North America'}]",20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",7992466427990357681,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '17',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""campaign"": ""(not set)"", ""source"": ""(direct)""...",1508201613,1,1508201613
3,3,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",9075655783635761930,"{""continent"": ""Asia"", ""subContinent"": ""Western...","[{'hitNumber': '1', 'time': '0', 'hour': '9', ...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508169851,1,1508169851
4,4,Organic Search,"[{'index': '4', 'value': 'Central America'}]",20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6960673291025684308,"{""continent"": ""Americas"", ""subContinent"": ""Cen...","[{'hitNumber': '1', 'time': '0', 'hour': '14',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508190552,1,1508190552


In [37]:
training_sample_dataset_1.drop(["customDimensions","hits","Unnamed: 0"],axis=1,inplace=True)

KeyError: "['customDimensions' 'hits'] not found in axis"

In [34]:
training_sample_dataset_1.head()

Unnamed: 0,channelGrouping,date,device,fullVisitorId,geoNetwork,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,Organic Search,20171016,"{""browser"": ""Firefox"", ""browserVersion"": ""not ...",3162355547410993243,"{""continent"": ""Europe"", ""subContinent"": ""Weste...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508198450,1,1508198450
1,Referral,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",8934116514970143966,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""referralPath"": ""/a/google.com/transportation...",1508176307,6,1508176307
2,Direct,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",7992466427990357681,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""campaign"": ""(not set)"", ""source"": ""(direct)""...",1508201613,1,1508201613
3,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",9075655783635761930,"{""continent"": ""Asia"", ""subContinent"": ""Western...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508169851,1,1508169851
4,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6960673291025684308,"{""continent"": ""Americas"", ""subContinent"": ""Cen...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508190552,1,1508190552


In [47]:
test_sample_dataset_1.to_csv("C:\\Sridhar\\AI_ML\\Algorithms\\datasets\\ga-customer-revenue-prediction\\df_test.csv")

In [48]:
test_sample_dataset_1 = load_df("C:\\Sridhar\\AI_ML\\Algorithms\\datasets\\ga-customer-revenue-prediction\\df_test.csv")

Loaded df_test.csv. Shape: (20000, 61)


In [41]:
test_sample_dataset_1.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,channelGrouping,customDimensions,date,fullVisitorId,hits,socialEngagementType,visitId,visitNumber,...,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,0,0,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20180511,7.460955e+18,"[{'hitNumber': '1', 'time': '0', 'hour': '21',...",Not Socially Engaged,1526099341,2,...,,,,,(not set),True,(not provided),organic,(not set),google
1,1,1,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,4.6025244e+17,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,1526064483,166,...,,,,,(not set),True,(not set),(none),(not set),(direct)
2,2,2,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,3.4618085e+18,"[{'hitNumber': '1', 'time': '0', 'hour': '12',...",Not Socially Engaged,1526067157,2,...,,,,,(not set),True,(not provided),organic,(not set),google
3,3,3,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,9.7512944e+17,"[{'hitNumber': '1', 'time': '0', 'hour': '23',...",Not Socially Engaged,1526107551,4,...,,,,,(not set),True,(not set),(none),(not set),(direct)
4,4,4,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,8.381673e+18,"[{'hitNumber': '1', 'time': '0', 'hour': '10',...",Not Socially Engaged,1526060254,1,...,,,,,(not set),,(not provided),organic,(not set),google


In [49]:
test_sample_dataset_1.drop(["Unnamed: 0","Unnamed: 0.1","customDimensions","hits"],axis=1,inplace=True)

In [43]:
test_sample_dataset_1.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,device.browserVersion,...,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20180511,7.460955e+18,Not Socially Engaged,1526099341,2,1526099341,Chrome,not available in demo dataset,not available in demo dataset,...,,,,,(not set),True,(not provided),organic,(not set),google
1,Direct,20180511,4.6025244e+17,Not Socially Engaged,1526064483,166,1526064483,Chrome,not available in demo dataset,not available in demo dataset,...,,,,,(not set),True,(not set),(none),(not set),(direct)
2,Organic Search,20180511,3.4618085e+18,Not Socially Engaged,1526067157,2,1526067157,Chrome,not available in demo dataset,not available in demo dataset,...,,,,,(not set),True,(not provided),organic,(not set),google
3,Direct,20180511,9.7512944e+17,Not Socially Engaged,1526107551,4,1526107551,Chrome,not available in demo dataset,not available in demo dataset,...,,,,,(not set),True,(not set),(none),(not set),(direct)
4,Organic Search,20180511,8.381673e+18,Not Socially Engaged,1526060254,1,1526060254,Internet Explorer,not available in demo dataset,not available in demo dataset,...,,,,,(not set),,(not provided),organic,(not set),google


In [51]:
training_sample_dataset_1.shape,test_sample_dataset_1.shape

((20000, 58), (20000, 57))

In [45]:
training_sample_dataset_1.columns

Index(['Unnamed: 0', 'channelGrouping', 'date', 'fullVisitorId',
       'socialEngagementType', 'visitId', 'visitNumber', 'visitStartTime',
       'device.browser', 'device.browserSize', 'device.browserVersion',
       'device.deviceCategory', 'device.flashVersion', 'device.isMobile',
       'device.language', 'device.mobileDeviceBranding',
       'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName',
       'device.mobileDeviceModel', 'device.mobileInputSelector',
       'device.operatingSystem', 'device.operatingSystemVersion',
       'device.screenColors', 'device.screenResolution', 'geoNetwork.city',
       'geoNetwork.cityId', 'geoNetwork.continent', 'geoNetwork.country',
       'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.metro',
       'geoNetwork.networkDomain', 'geoNetwork.networkLocation',
       'geoNetwork.region', 'geoNetwork.subContinent', 'totals.bounces',
       'totals.hits', 'totals.newVisits', 'totals.pageviews',
       'totals.sessionQualityDim

In [52]:
(training_sample_dataset_1.isna().sum()/training_sample_dataset_1.shape[0])*100

Unnamed: 0                                            0.000
channelGrouping                                       0.000
date                                                  0.000
fullVisitorId                                         0.000
socialEngagementType                                  0.000
visitId                                               0.000
visitNumber                                           0.000
visitStartTime                                        0.000
device.browser                                        0.000
device.browserSize                                    0.000
device.browserVersion                                 0.000
device.deviceCategory                                 0.000
device.flashVersion                                   0.000
device.isMobile                                       0.000
device.language                                       0.000
device.mobileDeviceBranding                           0.000
device.mobileDeviceInfo                 

In [53]:
(test_sample_dataset_1.isna().sum()/test_sample_dataset_1.shape[0])*100

channelGrouping                                       0.000
date                                                  0.000
fullVisitorId                                         0.000
socialEngagementType                                  0.000
visitId                                               0.000
visitNumber                                           0.000
visitStartTime                                        0.000
device.browser                                        0.000
device.browserSize                                    0.000
device.browserVersion                                 0.000
device.deviceCategory                                 0.000
device.flashVersion                                   0.000
device.isMobile                                       0.000
device.language                                       0.000
device.mobileDeviceBranding                           0.000
device.mobileDeviceInfo                               0.000
device.mobileDeviceMarketingName        

In [26]:
training_sample_dataset_1[training_sample_dataset_1['totals.transactionRevenue']!='NaN']

Unnamed: 0,channelGrouping,date,fullVisitorId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,device.browserVersion,...,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20171016,3162355547410993243,Not Socially Engaged,1508198450,1,1508198450,Firefox,not available in demo dataset,not available in demo dataset,...,,,,,(not set),,water bottle,organic,,google
1,Referral,20171016,8934116514970143966,Not Socially Engaged,1508176307,6,1508176307,Chrome,not available in demo dataset,not available in demo dataset,...,,,,,(not set),,,referral,/a/google.com/transportation/mtv-services/bike...,sites.google.com
2,Direct,20171016,7992466427990357681,Not Socially Engaged,1508201613,1,1508201613,Chrome,not available in demo dataset,not available in demo dataset,...,,,,,(not set),True,,(none),,(direct)
3,Organic Search,20171016,9075655783635761930,Not Socially Engaged,1508169851,1,1508169851,Chrome,not available in demo dataset,not available in demo dataset,...,,,,,(not set),,(not provided),organic,,google
4,Organic Search,20171016,6960673291025684308,Not Socially Engaged,1508190552,1,1508190552,Chrome,not available in demo dataset,not available in demo dataset,...,,,,,(not set),,(not provided),organic,,google
5,Referral,20171016,0166277907528479249,Not Socially Engaged,1508196701,1,1508196701,Chrome,not available in demo dataset,not available in demo dataset,...,,,,,(not set),,,(none),/offer/2145,(direct)
6,Referral,20171016,8349655975937271469,Not Socially Engaged,1508152478,1,1508152478,Chrome,not available in demo dataset,not available in demo dataset,...,,,,,(not set),,,referral,/a/google.com/nest-vision/dropcam-field-tester...,sites.google.com
7,Organic Search,20171016,1332629902468998662,Not Socially Engaged,1508206208,1,1508206208,Chrome,not available in demo dataset,not available in demo dataset,...,,,,,(not set),,(not provided),organic,,google
8,Organic Search,20171016,632878546807742341,Not Socially Engaged,1508207516,1,1508207516,Chrome,not available in demo dataset,not available in demo dataset,...,,,,,(not set),,(not provided),organic,,google
9,Organic Search,20171016,1259490915281096752,Not Socially Engaged,1508165159,2,1508165159,Safari,not available in demo dataset,not available in demo dataset,...,,,,,(not set),True,(not provided),organic,,google


In [94]:
def null_values(base_dataset):
    print(base_dataset.isna().sum())
    # null value percentage     
    null_value_table=(base_dataset.isna().sum()/base_dataset.shape[0])*100
    # null value percentage beyond threshold drop , else treat the columns 
    
    retained_columns=null_value_table[null_value_table<int(input())].index
    #if any variable as null value greater than input(like 30% of the data) 
    #value than those variable are consider as drop
    drop_columns=null_value_table[null_value_table>int(input())].index
    base_dataset.drop(drop_columns,axis=1,inplace=True)
    len(base_dataset.isna().sum().index)
    cont=base_dataset.describe().columns
    cat=[i for i in base_dataset.columns if i not in base_dataset.describe().columns]
    for i in cat:
        base_dataset[i].fillna(base_dataset[i].value_counts().index[0],inplace=True)
    for i in cont:
        base_dataset[i].fillna(base_dataset[i].median(),inplace=True)
    print(base_dataset.isna().sum())
    return base_dataset,cat,cont

In [95]:
train_dataset,train_cat,train_continuos = null_values(training_sample_dataset_1)

channelGrouping                                         0
date                                                    0
fullVisitorId                                           0
socialEngagementType                                    0
visitId                                                 0
visitNumber                                             0
visitStartTime                                          0
device.browser                                          0
device.browserSize                                      0
device.browserVersion                                   0
device.deviceCategory                                   0
device.flashVersion                                     0
device.isMobile                                         0
device.language                                         0
device.mobileDeviceBranding                             0
device.mobileDeviceInfo                                 0
device.mobileDeviceMarketingName                        0
device.mobileD

In [96]:
test_dataset,test_cat,test_continuos = null_values(test_sample_dataset_1)

channelGrouping                                         0
date                                                    0
fullVisitorId                                           0
socialEngagementType                                    0
visitId                                                 0
visitNumber                                             0
visitStartTime                                          0
device.browser                                          0
device.browserSize                                      0
device.browserVersion                                   0
device.deviceCategory                                   0
device.flashVersion                                     0
device.isMobile                                         0
device.language                                         0
device.mobileDeviceBranding                             0
device.mobileDeviceInfo                                 0
device.mobileDeviceMarketingName                        0
device.mobileD

In [97]:
train_dataset.shape

(5000, 42)

In [98]:
test_dataset.shape

(5000, 46)

In [None]:
train_dataset['total.r']