In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load packages for modeling

In [None]:
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Root-mean-squared error between two sets of values.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``,
        rounded to 5 decimal places.
    """
    mse = mean_squared_error(y_true, y_pred)
    return round(np.sqrt(mse), 5)

In [None]:
import pandas as pd
import numpy as np
import time
from datetime import datetime
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the cleaned training export; 'fullVisitorId' is passed through the str
# converter so it is kept as text rather than parsed as a number.
df = pd.read_csv(
    '../input/ga-dataset-clean/train_clean.csv',
    converters={'fullVisitorId': str},
)

In [None]:
# Keep only the columns that hold more than one distinct non-null value.
distinct_counts = df.nunique()
columns = [col for col in df.columns if distinct_counts[col] > 1]
df = df[columns]

In [None]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

# Split the dataset by date into training and test sets, and set up the validation data

In [None]:
import datetime
# Parse the 'date' column and keep only the calendar-date part
# (datetime.date objects), which the date-based split compares against.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates.dt.date

In [None]:
# Time-based split: rows dated up to and including 2017-05-31 form the
# training set; rows after that date are held out as the test set.
split_date = datetime.date(2017, 5, 31)
in_train_period = df['date'] <= split_date
in_test_period = df['date'] > split_date

train_df = df[in_train_period]
test_df = df[in_test_period]

In [None]:
def _extract_log_target(frame, target_col='totals.transactionRevenue'):
    """Split ``frame`` into features and a log1p-transformed target.

    Args:
        frame: DataFrame that contains ``target_col``.
        target_col: Name of the revenue column to use as the target.

    Returns:
        A ``(features, log_target)`` tuple. ``features`` is a new frame without
        ``target_col`` (``frame`` itself is not modified); ``log_target`` is
        ``log1p`` of the target with missing values treated as 0.
    """
    revenue = frame[target_col].fillna(0).astype(float)
    # Vectorised log1p instead of a per-element lambda.
    log_target = np.log1p(revenue)
    # drop() returns a new frame, so the frame sliced from df is never mutated.
    return frame.drop(columns=[target_col]), log_target


train_df, target = _extract_log_target(train_df)
test_df, target_test = _extract_log_target(test_df)

In [None]:
# Keep only the columns that take more than one distinct non-null value within
# the training rows, and apply the same column selection to the test rows.
train_distinct = train_df.nunique()
columns = [col for col in train_df.columns if train_distinct[col] > 1]
train_df = train_df[columns]
test_df = test_df[columns]

# Before label encoding, stack train and test so both get consistent labels.
# trn_len records where the training rows end so the frames can be split again.
trn_len = len(train_df)
merged_df = pd.concat([train_df, test_df])

In [None]:
# Label-encode every object/bool column with pd.factorize, leaving the ID and
# the listed calendar columns untouched.
untouched_cols = ['fullVisitorId', 'month', 'day', 'weekday', 'visithour']
for col in merged_df.columns:
    if col in untouched_cols:
        continue
    col_dtype = merged_df[col].dtypes
    if col_dtype == object or col_dtype == bool:
        codes, indexer = pd.factorize(merged_df[col])
        merged_df[col] = codes

In [None]:
# Numeric features: every 'totals.*' column plus a few named columns.
# Everything else in merged_df is treated as categorical.
totals_cols = [name for name in merged_df.columns if 'totals.' in name]
numerics = totals_cols + ['visitNumber', 'mean_hits_per_day', 'fullVisitorId']
categorical_feats = [name for name in merged_df.columns if name not in numerics]

In [None]:
# Cast every categorical feature to int and the visitor ID to float in one pass.
target_dtypes = {col: int for col in categorical_feats}
merged_df = merged_df.astype(target_dtypes)
merged_df['fullVisitorId'] = merged_df['fullVisitorId'].astype(float)

In [None]:
# Undo the concat: the first trn_len rows are the training rows, the rest test.
train_df = merged_df.iloc[:trn_len]
test_df = merged_df.iloc[trn_len:]

In [None]:
 params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 1
    }

In [None]:
# Model inputs: every column except the visitor identifier.
trn_cols = [name for name in train_df.columns if name != 'fullVisitorId']

In [None]:
# Cross-validation splitter: 5 shuffled folds with a fixed seed.
folds = KFold(n_splits=5, shuffle=True, random_state=15)

# Timestamp taken before training starts.
start = time.time()

# Containers filled in by the modelling cells below.
features = train_df[trn_cols].columns.tolist()
oof = np.zeros(train_df.shape[0])
feature_importance_df = pd.DataFrame()

In [None]:
# Only the LAST of the KFold splits is used by the cells below, so this is
# effectively a single hold-out validation rather than full cross-validation.
# Take that split directly instead of rebuilding (and discarding) a pair of
# Datasets for every earlier fold. KFold only needs the number of rows, so the
# frame is passed as-is rather than materialising train_df.values.
splits = list(folds.split(train_df, target))
fold_ = len(splits) - 1  # same value the loop variable used to end on
trn_idx, val_idx = splits[fold_]

# train data
trn_data = lgb.Dataset(train_df.iloc[trn_idx][trn_cols], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
# validation data
val_data = lgb.Dataset(train_df.iloc[val_idx][trn_cols], label=target.iloc[val_idx], categorical_feature=categorical_feats)

# Passed to lgb.train to collect the per-iteration metric history.
evals_result = {}

# Modeling

## 1. LightGBM

In [None]:
# Train with up to num_round boosting rounds, logging every 100 rounds and
# stopping once the validation metric has not improved for 100 rounds.
# NOTE(review): evals_result / verbose_eval / early_stopping_rounds were removed
# from lgb.train in LightGBM 4.0 in favour of callbacks - confirm the installed
# version before upgrading.
num_round = 10000
lgbmodel = lgb.train(
    params=params,
    train_set=trn_data,
    num_boost_round=num_round,
    valid_sets=[trn_data, val_data],
    evals_result=evals_result,
    verbose_eval=100,
    early_stopping_rounds=100,
)

## Results & evaluation

In [None]:
# prediction
# Score both parts of the split at the best iteration found by early stopping.
# Note that oof also receives in-sample predictions for the training rows.
best_iter = lgbmodel.best_iteration
val_features = train_df.iloc[val_idx][trn_cols]
trn_features = train_df.iloc[trn_idx][trn_cols]

oof[val_idx] = lgbmodel.predict(val_features, num_iteration=best_iter)
oof[trn_idx] = lgbmodel.predict(trn_features, num_iteration=best_iter)

val_rmse = rmse(target.iloc[val_idx], oof[val_idx])
trn_rmse = rmse(target.iloc[trn_idx], oof[trn_idx])
print(f"LGB : RMSE val: {val_rmse}  - RMSE train: {trn_rmse}")

In [None]:
# Evaluate the RMSE-trained LightGBM model on the held-out test period.
test_features = test_df[trn_cols]
oof_test = lgbmodel.predict(test_features, num_iteration=lgbmodel.best_iteration)
test_rmse = rmse(target_test, oof_test)
print(f"LGB : RMSE test: {test_rmse}  ")

In [None]:
# Plot the RMSE history recorded in evals_result during training.
ax = lgb.plot_metric(booster=evals_result, metric='rmse')
plt.show()

In [None]:
## feature importance    
# One row per feature for this fold, appended to the running importance table.
fold_importance_df = pd.DataFrame({
    "feature": features,
    "importance": lgbmodel.feature_importance(),
    "fold": fold_ + 1,
})
feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

In [None]:
# Rank features by mean importance and keep at most the top 1000 names.
mean_importance = (
    feature_importance_df[["feature", "importance"]]
    .groupby("feature")
    .mean()
    .sort_values(by="importance", ascending=False)
)
cols = mean_importance.head(1000).index

is_top_feature = feature_importance_df.feature.isin(cols)
best_features = feature_importance_df.loc[is_top_feature]

# Horizontal bar chart of the selected features, saved next to the notebook.
ranked_features = best_features.sort_values(by="importance", ascending=False)
plt.figure(figsize=(14, 10))
sns.barplot(x="importance", y="feature", data=ranked_features)
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

In [None]:
import shap

# SHAP summary of the LightGBM model over the training features.
train_features = train_df[trn_cols]
explainer = shap.TreeExplainer(lgbmodel)
shap_values = explainer.shap_values(train_features)
shap.summary_plot(shap_values, train_features)

In [None]:
import shap

# SHAP summary of the LightGBM model over the test features.
test_features = test_df[trn_cols]
explainer = shap.TreeExplainer(lgbmodel)
shap_values = explainer.shap_values(test_features)
shap.summary_plot(shap_values, test_features)

In [None]:
#### Metric: MAE

In [None]:
# LightGBM settings for the MAE run (note: this rebinds `params`).
params = {
    "objective": "regression",
    "metric": "mae",
    "num_leaves": 40,
    "learning_rate": 0.005,
    "bagging_fraction": 0.6,
    "feature_fraction": 0.6,
    # LightGBM's key is "bagging_freq"; the previous "bagging_frequency" is not
    # a recognised parameter, so bagging (and bagging_fraction) was never used.
    "bagging_freq": 6,
    "bagging_seed": 42,
    "verbosity": -1,
    "seed": 1,
}

# Only the last KFold split is used, so take it directly instead of building
# and discarding Datasets for every earlier fold. KFold only needs the row
# count, so the frame is passed as-is rather than materialising .values.
splits = list(folds.split(train_df, target))
fold_ = len(splits) - 1  # same value the loop variable used to end on
trn_idx, val_idx = splits[fold_]

## train data
trn_data_mae = lgb.Dataset(train_df.iloc[trn_idx][trn_cols], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
## validation data
val_data_mae = lgb.Dataset(train_df.iloc[val_idx][trn_cols], label=target.iloc[val_idx], categorical_feature=categorical_feats)

# Up to num_round rounds, logging every 100 and stopping after 100 rounds
# without improvement on the validation metric.
num_round = 10000
lgbmodel_mae = lgb.train(params, trn_data_mae, num_round, valid_sets=[trn_data_mae, val_data_mae], verbose_eval=100, early_stopping_rounds=100)

In [None]:
# prediction
# Score both parts of the split with the MAE-trained model at its best iteration.
# Note that oof_mae also receives in-sample predictions for the training rows.
best_iter_mae = lgbmodel_mae.best_iteration
oof_mae = np.zeros(train_df.shape[0])
oof_mae[val_idx] = lgbmodel_mae.predict(train_df.iloc[val_idx][trn_cols], num_iteration=best_iter_mae)
oof_mae[trn_idx] = lgbmodel_mae.predict(train_df.iloc[trn_idx][trn_cols], num_iteration=best_iter_mae)

val_rmse_mae = rmse(target.iloc[val_idx], oof_mae[val_idx])
trn_rmse_mae = rmse(target.iloc[trn_idx], oof_mae[trn_idx])
print(f"LGB_MAE : RMSE val: {val_rmse_mae}  - RMSE train: {trn_rmse_mae}")

In [None]:
# Evaluate the MAE-trained model on the held-out test period.
# Use lgbmodel_mae's own best iteration: the previous code passed
# lgbmodel.best_iteration (the RMSE model), truncating this model at an
# iteration count chosen for a different booster.
oof_mae_test = lgbmodel_mae.predict(test_df[trn_cols], num_iteration=lgbmodel_mae.best_iteration)
print(f"LGB_MAE : RMSE test: {rmse(target_test, oof_mae_test )}  ")

Reference: https://www.kaggle.com/fabiendaniel/lgbm-starter?scriptVersionId=5983064

## 2. XGBoost

https://yunyaniu.blog.csdn.net/article/details/103938851

https://blog.csdn.net/qq_26684561/article/details/102574708

https://www.bbsmax.com/A/kvJ33rxwJg/

https://www.jianshu.com/p/5504c1f9e562

https://zhuanlan.zhihu.com/p/64799119

In [None]:
# XGBoost settings (note: this rebinds `params`).
# NOTE(review): 'reg:linear' and 'silent' are deprecated spellings in newer
# XGBoost releases ('reg:squarederror' / 'verbosity') - confirm against the
# installed version before changing them.
params = dict(
    objective='reg:linear',
    eval_metric='rmse',
    eta=0.001,
    max_depth=10,
    subsample=0.6,
    colsample_bytree=0.6,
    alpha=0.001,
    random_state=1,
    silent=True,
)

In [None]:
# Wrap the same train/validation split and the test features as DMatrix objects.
xgb_trn_data = xgb.DMatrix(train_df.iloc[trn_idx][trn_cols], label=target.iloc[trn_idx])
xgb_val_data = xgb.DMatrix(train_df.iloc[val_idx][trn_cols], label=target.iloc[val_idx])
xgb_test = xgb.DMatrix(test_df[trn_cols])

In [None]:
# Train for up to 2000 rounds, logging every 100 and stopping after 100 rounds
# without improvement on the last entry of the watchlist ('valid').
watchlist = [(xgb_trn_data, 'train'), (xgb_val_data, 'valid')]
xgbmodel = xgb.train(
    params=params,
    dtrain=xgb_trn_data,
    num_boost_round=2000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=100,
)

# Predict with the tree limit reported for the best iteration.
best_limit = xgbmodel.best_ntree_limit
y_pred_train = xgbmodel.predict(xgb_trn_data, ntree_limit=best_limit)
y_pred_val = xgbmodel.predict(xgb_val_data, ntree_limit=best_limit)

xgb_val_rmse = rmse(target.iloc[val_idx], y_pred_val)
xgb_trn_rmse = rmse(target.iloc[trn_idx], y_pred_train)
print(f"XGB : RMSE val: {xgb_val_rmse}  - RMSE train: {xgb_trn_rmse}")

## Results & evaluation

In [None]:
# Evaluate the XGBoost model on the held-out test period.
y_xgb_test = xgbmodel.predict(xgb_test, ntree_limit=xgbmodel.best_ntree_limit)
xgb_test_rmse = rmse(target_test, y_xgb_test)
print(f"XGB : RMSE test: {xgb_test_rmse}  ")

In [None]:
from xgboost import XGBRegressor
from xgboost import plot_importance

In [None]:
# Top 10 features using plot_importance's default importance type.
xgb.plot_importance(booster=xgbmodel, max_num_features=10)

In [None]:
# Top 10 features ranked by gain.
xgb.plot_importance(booster=xgbmodel, importance_type='gain', max_num_features=10)

In [None]:
# SHAP summary of the XGBoost model over the test features.
test_features = test_df[trn_cols]
explainer = shap.TreeExplainer(xgbmodel)
shap_values = explainer.shap_values(test_features)
shap.summary_plot(shap_values, test_features)

## 3. Catboost

In [None]:
# CatBoost hyper-parameters, collected in one place and unpacked into the model.
cat_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 10,
    'eval_metric': 'RMSE',
    'random_seed': 1,
    'bagging_temperature': 0.2,
    'od_type': 'Iter',
    'metric_period': 50,
    'od_wait': 20,
}
catmodel = CatBoostRegressor(**cat_params)

# Fit on the same train/validation split as the other models and keep the
# best iteration observed on the validation set.
cat_trn_X, cat_trn_y = train_df.iloc[trn_idx][trn_cols], target.iloc[trn_idx]
cat_val_X, cat_val_y = train_df.iloc[val_idx][trn_cols], target.iloc[val_idx]
catmodel.fit(
    cat_trn_X,
    cat_trn_y,
    eval_set=(cat_val_X, cat_val_y),
    use_best_model=True,
    verbose=True,
)

## Results & evaluation

In [None]:
# CatBoost predictions for the training split, validation split and test set.
# Note: y_pred_train / y_pred_val are reused names from the XGBoost section.
y_pred_train = catmodel.predict(train_df.iloc[trn_idx][trn_cols])
y_pred_val = catmodel.predict(train_df.iloc[val_idx][trn_cols])
y_cat_test = catmodel.predict(test_df[trn_cols])

cat_val_rmse = rmse(target.iloc[val_idx], y_pred_val)
cat_trn_rmse = rmse(target.iloc[trn_idx], y_pred_train)
cat_test_rmse = rmse(target_test, y_cat_test)
print(f"CatB: RMSE val: {cat_val_rmse}  - RMSE train: {cat_trn_rmse}")
print(f"CatB : RMSE test: {cat_test_rmse}  ")

Reference: https://www.kaggle.com/julian3833/2-quick-study-lgbm-xgb-and-catboost-lb-1-66

## 4. Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [None]:
# Random-forest inputs: the full training frame (no fold split) and the test set.
X_train_RF, y_train_RF = train_df[trn_cols], target
X_test_RF, y_test_RF = test_df[trn_cols], target_test

## Results & evaluation

In [None]:
# Seed the forest so results are reproducible across runs; every other model
# in this notebook is seeded, and an unseeded forest gives different scores
# on each execution.
RF = RandomForestRegressor(random_state=1)
RF.fit(X_train_RF, y_train_RF)

# 5-fold CV on the training data; the printed value is the mean of the
# negated MSE scores (so closer to 0 is better).
RF_scores = cross_val_score(RF, X_train_RF, y_train_RF, cv=5, scoring='neg_mean_squared_error')
print(RF_scores.mean())

# Hold-out evaluation on the test period with the model fitted on all training rows.
y_RF_test = RF.predict(X_test_RF)
print("randomforest RMSE : ", np.sqrt(metrics.mean_squared_error(y_test_RF, y_RF_test)))

In [None]:
# Feature importance
# Pair each importance with the column it was computed for. The forest was
# fitted on X_train_RF, so its columns (not df.columns, which still include the
# visitor ID and target) are the ones aligned with RF.feature_importances_.
for name, score in zip(X_train_RF.columns, RF.feature_importances_):
    print(" ", name, " = ", score)