<h1>Customer Revenue Prediction</h1>
References:<br>
https://www.kaggle.com/plasticgrammer/customer-revenue-prediction-v2-playground<br>
https://www.kaggle.com/julian3833/1-quick-start-read-csv-and-flatten-json-fields<br>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
from pandas.io.json import json_normalize
from datetime import datetime

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# Same directory that the CSV paths further down ('../input/train_v2.csv',
# '../input/test_v2.csv') point at.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

FileNotFoundError: [Errno 2] No such file or directory: '../input'

In [None]:
# Columns retained for modelling; dotted names are the flattened JSON fields.
features = [
    # plain CSV columns
    'channelGrouping',
    'date',
    'fullVisitorId',
    'visitId',
    'visitNumber',
    'visitStartTime',
    # device.*
    'device.operatingSystem',
    # geoNetwork.*
    'geoNetwork.city',
    'geoNetwork.country',
    'geoNetwork.metro',
    'geoNetwork.networkDomain',
    'geoNetwork.region',
    # totals.*
    'totals.hits',
    'totals.pageviews',
    'totals.transactionRevenue',
    # trafficSource.*
    'trafficSource.adContent',
    'trafficSource.isTrueDirect',
    'trafficSource.referralPath',
    'trafficSource.source',
]

In [None]:
#DataIterator for batch learning
class DataIterator():
    """Read a sessions CSV chunk by chunk and return numeric model features.

    The file is opened with ``pd.read_csv(..., iterator=True)``; every call to
    :meth:`getChunk` pulls the next rows, expands each JSON column into
    ``"<column>.<key>"`` columns, keeps the selected columns and converts them
    to numbers.

    Parameters
    ----------
    filePath : str
        CSV file to read. The columns named in ``JSON_COLUMNS`` are parsed
        with ``json.loads``; ``fullVisitorId`` is read as text.
    columns : list of str, optional
        Columns to keep after flattening. When omitted, the notebook-level
        ``features`` list is used (looked up when ``getChunk`` runs).

    Notes
    -----
    Category codes live in the class-level ``_category_codes`` dict, so every
    iterator created from this class shares them: a value seen in the training
    file gets the same integer when it shows up in the test file. Codes are
    assigned in order of first appearance, which for the first chunk processed
    matches what ``pd.factorize`` would return.
    """

    # column name -> {category value -> integer code}; shared across instances.
    _category_codes = {}

    def __init__(self, filePath='../input/train_v2.csv', columns=None):
        self.JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
        self.columns = columns
        self.iter = pd.read_csv(
            filePath,
            converters={column: json.loads for column in self.JSON_COLUMNS},
            dtype={'fullVisitorId': 'str'},  # ids stay text, never numbers
            iterator=True,
        )

        print(f'Loaded {os.path.basename(filePath)}.')

    def getChunk(self, chunkSize):
        """Return the next ``chunkSize`` rows as an encoded feature frame.

        Processing steps:
          * JSON columns are flattened.
          * ``totals.transactionRevenue`` -> ``log1p`` of the value (missing = 0).
          * ``totals.hits`` / ``totals.pageviews`` -> int (missing = 1).
          * ``trafficSource.isTrueDirect`` -> bool (missing = False).
          * ``date`` (``%Y%m%d``) -> ``_month`` and ``_day``; ``date`` and
            ``visitId`` are dropped.
          * every remaining text column except ``fullVisitorId`` -> integer
            category code, with missing values encoded as ``'unknown'``.

        Raises
        ------
        StopIteration
            Propagated from pandas when the file has no rows left.
        KeyError
            If a selected non-JSON column is absent from the file.
        """
        df = self._flatten_json(self.iter.get_chunk(chunkSize))

        selected = list(features if self.columns is None else self.columns)

        # A JSON key that no row of this chunk carries produces no column at
        # all. Create it as all-missing so the defaults below apply instead of
        # a KeyError. Plain CSV columns are still required to exist.
        json_prefixes = tuple(f'{column}.' for column in self.JSON_COLUMNS)
        for name in selected:
            if name not in df.columns and name.startswith(json_prefixes):
                df[name] = pd.Series(np.nan, index=df.index, dtype=object)

        # Explicit copy: the assignments below must not write into a slice.
        df = df[selected].copy()

        if 'totals.transactionRevenue' in df.columns:
            revenue = pd.to_numeric(df['totals.transactionRevenue']).fillna(0)
            df['totals.transactionRevenue'] = np.log1p(revenue.astype(float))

        # NOTE(review): hits is treated like pageviews (a count). It was
        # previously left as text and therefore label-encoded - confirm that
        # a numeric feature is what is wanted here.
        for name in ('totals.hits', 'totals.pageviews'):
            if name in df.columns:
                df[name] = pd.to_numeric(df[name]).fillna(1).astype(int)

        if 'trafficSource.isTrueDirect' in df.columns:
            flag = df['trafficSource.isTrueDirect']
            # NaN is truthy, so mask it out explicitly: missing means False.
            df['trafficSource.isTrueDirect'] = flag.notna() & flag.astype(bool)

        if 'date' in df.columns:
            parsed = pd.to_datetime(df['date'].astype(str), format='%Y%m%d')
            df['_month'] = parsed.dt.month.astype('int64')
            df['_day'] = parsed.dt.day.astype('int64')
        df = df.drop(columns=['date', 'visitId'], errors='ignore')

        for name, dtype in df.dtypes.items():
            if name == 'fullVisitorId':
                continue
            if dtype == object or pd.api.types.is_string_dtype(dtype):
                df[name] = self._encode(name, df[name].fillna('unknown'))

        return df

    def _flatten_json(self, df):
        """Replace each JSON column with one column per key, named '<column>.<key>'."""
        for column in self.JSON_COLUMNS:
            flat = pd.json_normalize(df[column].tolist())
            flat.columns = [f"{column}.{subcolumn}" for subcolumn in flat.columns]
            # Chunks after the first do not start at index 0; realign by hand.
            flat.index = df.index
            df = df.drop(columns=column).join(flat)
        return df

    @classmethod
    def _encode(cls, column, values):
        """Map ``values`` to integer codes, reusing codes already issued for ``column``."""
        codes = cls._category_codes.setdefault(column, {})
        for value in pd.unique(values):
            # len(codes) is evaluated before insertion: next free code.
            codes.setdefault(value, len(codes))
        return values.map(codes).astype('int64')
    
    

In [None]:
# Take the first 100000 rows of each file. The training iterator is built
# first and its chunk is read before the test iterator is created.
train_data = DataIterator(filePath='../input/train_v2.csv')
train = train_data.getChunk(chunkSize=100000)

test_data = DataIterator(filePath='../input/test_v2.csv')
test = test_data.getChunk(chunkSize=100000)

In [None]:
# Visitor ids are kept aside; they are not used as model features.
train_id = train['fullVisitorId']
test_id = test['fullVisitorId']

# Select the target rather than pop() it: pop() removes the column from
# train/test in place, so running this cell a second time raised a KeyError.
Y_train = train['totals.transactionRevenue']
Y_test = test['totals.transactionRevenue']

# Features = everything except the id and the target.
X_train = train.drop(columns=['fullVisitorId', 'totals.transactionRevenue'])
X_test = test.drop(columns=['fullVisitorId', 'totals.transactionRevenue'])

In [None]:
# One line per kind of array: feature matrices first, then targets,
# each printed as "<train shape> <test shape>".
for train_part, test_part in ((X_train, X_test), (Y_train, Y_test)):
    print(train_part.shape, test_part.shape)

In [None]:
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.linear_model import SGDRegressor

In [None]:
# LightGBM hyper-parameters shared by every fold.
params={'learning_rate': 0.01,
        'objective':'regression',
        'metric':'rmse',
        'num_leaves': 31,
        'verbose': 1,
        'random_state':42,
        'bagging_fraction': 0.6,
        # LightGBM ignores bagging_fraction unless bagging_freq is non-zero.
        'bagging_freq': 1,
        'feature_fraction': 0.6
       }

# Group by visitor so that all sessions of one visitor land in the same fold.
folds = GroupKFold(n_splits=5)

oof_preds = np.zeros(X_train.shape[0])   # out-of-fold predictions for the training rows
sub_preds = np.zeros(X_test.shape[0])    # test predictions, averaged over the folds
for fold_, (trn_, val_) in enumerate(folds.split(X_train, Y_train, groups=train_id)):
    trn_x, trn_y = X_train.iloc[trn_], Y_train.iloc[trn_]
    val_x, val_y = X_train.iloc[val_], Y_train.iloc[val_]

    reg = lgb.LGBMRegressor(**params, n_estimators=3000)
    # Early stopping and logging go through callbacks: the fit() keywords
    # early_stopping_rounds= / verbose= were removed in LightGBM 4.
    reg.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=500),
        ],
    )

    oof_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    sub_preds += reg.predict(X_test, num_iteration=reg.best_iteration_) / folds.n_splits

# Session-level validation score over all folds (log1p revenue scale).
oof_rmse = np.sqrt(np.mean((oof_preds - Y_train.values) ** 2))
print(f'OOF RMSE: {oof_rmse:.5f}')

pred = sub_preds

In [None]:
# Plot feature importance of `reg`, i.e. the model fitted in the last CV fold.
feature_importance = reg.feature_importances_
# Scale to 0-100 relative to the strongest feature; the guard avoids 0/0
# when the model made no splits at all.
feature_importance = 100.0 * (feature_importance / max(feature_importance.max(), 1))

# Keep the (up to) 30 most important features, weakest first so the strongest
# bar ends up on top. A plain negative slice also works when there are fewer
# than 30 features; `len(...) - 30` went negative in that case and silently
# dropped the strongest-but-one entries from the chart.
sorted_idx = np.argsort(feature_importance)[-30:]
pos = np.arange(sorted_idx.shape[0]) + .5

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(pos, feature_importance[sorted_idx], align='center')
ax.set_yticks(pos)
ax.set_yticklabels(X_train.columns[sorted_idx])
ax.set_xlabel('Relative Importance')
ax.set_title('Variable Importance')
plt.show()

In [None]:
# One row per test session: visitor id plus the model output (log1p scale).
submission = pd.DataFrame({'fullVisitorId':test_id, 'PredictedLogRevenue':pred})

# Undo the log1p transform, then replace negative and missing amounts with 0.
raw_revenue = np.expm1(submission["PredictedLogRevenue"])
submission["PredictedLogRevenue"] = raw_revenue.mask(raw_revenue < 0, 0.0).fillna(0.0)

# Add up the sessions of each visitor and return to the log1p scale.
submission_sum = (
    submission[['fullVisitorId', 'PredictedLogRevenue']]
    .groupby('fullVisitorId')
    .sum()
    .reset_index()
    .assign(PredictedLogRevenue=lambda frame: np.log1p(frame["PredictedLogRevenue"]))
)
submission_sum.to_csv("submission.csv", index=False)
submission_sum.head(20)