In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import gc
import sys
import math

from pandas.io.json import json_normalize
from datetime import datetime

import os
# print(os.listdir("../input"))

In [3]:
gc.enable()

In [4]:
features = ['channelGrouping', 'date', 'fullVisitorId', 'visitId',\
       'visitNumber', 'visitStartTime', 'device.browser',\
       'device.deviceCategory', 'device.isMobile', 'device.operatingSystem',\
       'geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country',\
       'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',\
       'geoNetwork.subContinent', 'totals.bounces', 'totals.hits',\
       'totals.newVisits', 'totals.pageviews', 'totals.transactionRevenue',\
       'trafficSource.adContent', 'trafficSource.campaign',\
       'trafficSource.isTrueDirect', 'trafficSource.keyword',\
       'trafficSource.medium', 'trafficSource.referralPath',\
       'trafficSource.source', 'customDimensions']

In [5]:
def load_df(csv_path):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    ans = pd.DataFrame()
    dfs = pd.read_csv(csv_path, sep=',',
            converters={column: json.loads for column in JSON_COLUMNS}, 
            dtype={'fullVisitorId': 'str'}, # Important!!
            chunksize=100000)
    for df in dfs:
        df.reset_index(drop=True, inplace=True)
        for column in JSON_COLUMNS:
            column_as_df = json_normalize(df[column])
            column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
            df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)

        #print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
        use_df = df[features]
        del df
        gc.collect()
        ans = pd.concat([ans, use_df], axis=0).reset_index(drop=True)
        #print(ans.shape)
    return ans

In [6]:
train = load_df("C:\\Sridhar\\AI_ML\\Algorithms\\datasets\\ga-customer-revenue-prediction\\training_sample.csv")

In [7]:
test = load_df("C:\\Sridhar\\AI_ML\\Algorithms\\datasets\\ga-customer-revenue-prediction\\test_sample.csv")

In [8]:
print('train date:', min(train['date']), 'to', max(train['date']))
print('test date:', min(test['date']), 'to', max(test['date']))

train date: 20160811 to 20180415
test date: 20180502 to 20181015


In [9]:
 def memoryManagement(base_dataset):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    df=base_dataset
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))    
    return df

In [10]:
train = memoryManagement(train)

Memory usage of dataframe is 8.89 MB
Memory usage after optimization is: 3.79 MB
Decreased by 57.4%


In [11]:
test = memoryManagement(test)

Memory usage of dataframe is 8.89 MB
Memory usage after optimization is: 3.73 MB
Decreased by 58.0%


In [14]:
for c in train.columns.values:
    if c not in test.columns.values:
        print(c)

In [15]:
train['totals.transactionRevenue'].fillna(0, inplace=True)

ValueError: fill value must be in categories

In [29]:
for i in train['totals.transactionRevenue'].values:
#     print(i)
    if i == 'NaN':
        print(i)
        train['totals.transactionRevenue'] = 0
    

In [27]:
train['totals.transactionRevenue']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
5        NaN
6        NaN
7        NaN
8        NaN
9        NaN
10       NaN
11       NaN
12       NaN
13       NaN
14       NaN
15       NaN
16       NaN
17       NaN
18       NaN
19       NaN
20       NaN
21       NaN
22       NaN
23       NaN
24       NaN
25       NaN
26       NaN
27       NaN
28       NaN
29       NaN
        ... 
39970    NaN
39971    NaN
39972    NaN
39973    NaN
39974    NaN
39975    NaN
39976    NaN
39977    NaN
39978    NaN
39979    NaN
39980    NaN
39981    NaN
39982    NaN
39983    NaN
39984    NaN
39985    NaN
39986    NaN
39987    NaN
39988    NaN
39989    NaN
39990    NaN
39991    NaN
39992    NaN
39993    NaN
39994    NaN
39995    NaN
39996    NaN
39997    NaN
39998    NaN
39999    NaN
Name: totals.transactionRevenue, Length: 40000, dtype: category
Categories (323, object): [100000000, 103240000, 103360000, 10370000, ..., 95970000, 96550000, 96770000, 9980000]