# EDA and LGBM Baseline 📚🤖📊

#### Please give an upvote if you find this useful! (WIP)

### Imports 🗂

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import os
from tqdm import tqdm
import random
import seaborn as sns
import math

import warnings
warnings.filterwarnings('ignore')

### Load in the data ⏳
Load in the pickle file from https://www.kaggle.com/columbia2131/speed-up-reading-csv-to-pickle :)

In [None]:
DATA_PATH = Path('../input/ump-train-picklefile')
SAMPLE_TEST_PATH = Path('../input/ubiquant-market-prediction')
!ls $SAMPLE_TEST_PATH

In [None]:
# Read the full training set from the pickle file in one call.
train_pickle = DATA_PATH / 'train.pkl'
train = pd.read_pickle(train_pickle)

### Basic EDA 📊

In [None]:
# Preview the first rows of the training frame to see its column layout.
train.head()

We can see each data point has:

- `row_id` - A unique identifier for the row

- `time_id` - ID for the time the data was collected. Not all investments have data for all the time IDs

- `investment_id` - ID for each individual investment

- `target` - The value we are trying to predict

- `[f_0:f_299]` - features generated from the investment data at that time ID

In [None]:
# Number of rows in the training frame.
num_data_points = train.shape[0]
print(f'We have {num_data_points} data points')

In [None]:
# Count the distinct investment identifiers present in the data.
investment_ids = train['investment_id']
num_investments = investment_ids.nunique()
print(f'We have {num_investments} unique investments')

In [None]:
# The number of distinct time ids is an upper bound on how many
# observations any single investment can have.
time_ids = train['time_id']
num_time_intervals = time_ids.nunique()
print(f'Each investment has a maximum of {num_time_intervals} time intervals')

In [None]:
# Null count per column first, then summed over all columns.
missing_per_column = train.isnull().sum()
print(f'We have {missing_per_column.sum()} missing values')

(Note: this seems to disagree with the summary statistics shown in the Data tab of the competition.)

### Target and Time_id distribution

In [None]:
# Side-by-side density histograms (with a KDE overlay) of the target and of time_id.
# `histplot` is used instead of `distplot`, which is deprecated and scheduled for
# removal from seaborn; `stat='density'` + `kde=True` gives the same kind of plot.
# bins=50 is intended to mirror the bin cap distplot applied by default.
sns.set_theme()
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
panels = [('target', 'Target Distribution'), ('time_id', 'Time_id Distribution')]
for axis, (column, title) in zip(ax, panels):
    sns.histplot(train[column], bins=50, stat='density', kde=True, ax=axis)
    axis.set_title(title)
# plt.show() is the display call that works with the notebook's inline backend;
# Figure.show() is meant for GUI backends.
plt.show()

### Looking at the distribution of 9 anonymous features

In [None]:
# Features are named f_0 ... f_299, i.e. 300 candidates. (range(299) would
# silently exclude f_299 from ever being sampled.)
NUM_FEATURES = 300
# A dedicated seeded generator makes the nine chosen features identical on
# every run without altering the global `random` state.
sample_features = random.Random(42).sample(range(NUM_FEATURES), 9)
fig, ax = plt.subplots(3, 3, figsize=(18, 18))
# ax.flat walks the 3x3 grid row by row, so the i-th feature lands on ax[i // 3, i % 3].
for axis, sample in zip(ax.flat, sample_features):
    # histplot replaces the deprecated distplot; density scale plus KDE overlay.
    sns.histplot(train[f'f_{sample}'], bins=50, stat='density', kde=True, ax=axis)
    axis.set_title(f'f_{sample} Distribution')
plt.show()

### Target distribution of investment 0, 1 and 2


In [None]:
# One panel per investment id (0, 1, 2): the target values of that
# investment's rows, plotted against the DataFrame index.
sns.set_theme()
fig, ax = plt.subplots(3, 1, figsize=(16, 12))
# Extra lineplot keyword arguments per panel; the first panel keeps the default colour.
line_styles = [{}, {'color': 'r'}, {'color': 'g'}]
for investment_id, (axis, style) in enumerate(zip(ax, line_styles)):
    investment_target = train[train['investment_id'] == investment_id]['target']
    sns.lineplot(data=investment_target, ax=axis, **style).set_title(f'Investment {investment_id}')
fig.show()

#### Looking at correlation between the first 30 anonymous features, time_id and target for investment 0

In [None]:
# Correlation heatmap for a single investment (investment_id == 0).
investment_0 = train[train['investment_id'] == 0]
# With the two id columns dropped, the first 32 columns are kept. Assuming the
# column order listed above (time_id, target, then f_0, f_1, ...), that is
# time_id, target and the first 30 features.
df_30 = investment_0.drop(['row_id', 'investment_id'], axis=1).iloc[:, :32]
del investment_0  # free the intermediate slice; memory is tight in this notebook
corrMatrix = df_30.corr()
fig, ax = plt.subplots(figsize=(15, 8))
# Pass the DataFrame itself rather than corrMatrix.to_numpy(): seaborn then labels
# both axes with the column names instead of bare integer positions.
sns.heatmap(corrMatrix, cmap="YlGnBu", ax=ax)
ax.set_title('Correlation of time_id, target and the first 30 features (investment 0)')
plt.show()

### Basic LightGBM model 🌳

In [None]:
# Due to low memory: clear every user-defined name from the kernel namespace
# (including the `train` DataFrame used for the EDA above) so that memory can be
# reclaimed before training. `-f` skips the interactive confirmation prompt.
# Everything needed from here on is imported / defined again in the next cells.
%reset -f

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import random
import seaborn as sns

from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [None]:
# The paths are declared again because `%reset -f` above removed the earlier definitions.
INPUT_ROOT = Path('../input')
DATA_PATH = INPUT_ROOT / 'ump-train-picklefile'
SAMPLE_TEST_PATH = INPUT_ROOT / 'ubiquant-market-prediction'

In [None]:
# Reload the training data and split it into features / target.
train = pd.read_pickle(DATA_PATH / 'train.pkl')
# row_id and time_id are not used as model inputs; investment_id is kept.
train.drop(columns=['row_id', 'time_id'], inplace=True)
y = train['target']
X = train.drop(columns=['target'])
del train
# shuffle=False preserves row order, so the last 1% of rows becomes the
# validation set (random_state has no effect when nothing is shuffled).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.01, shuffle=False, random_state=42
)
# Drop the unsplit copies to keep memory usage down.
del X, y

In [None]:
# Early stopping and periodic logging are configured through callbacks:
# passing `early_stopping_rounds=` / `verbose=` directly to fit() was deprecated
# in LightGBM 3.3 and removed in 4.0, where it raises a TypeError.
from lightgbm import early_stopping, log_evaluation

model = LGBMRegressor(
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    n_estimators=1400,        # upper bound; early stopping usually ends training sooner
    min_child_samples=1000,
    num_leaves=100,
    max_depth=10,
    learning_rate=0.02,
    subsample=0.8,            # row subsampling ...
    subsample_freq=1,         # ... re-drawn at every boosting iteration
)

model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='rmse',
    callbacks=[
        # Stop when validation RMSE has not improved for 30 consecutive rounds.
        early_stopping(stopping_rounds=30),
        # Print the validation metric every 20 rounds.
        log_evaluation(period=20),
    ],
)

In [None]:
# CatBoostRegressor is not imported anywhere else in the notebook, so without this
# line the cell fails with a NameError.
from catboost import CatBoostRegressor

# In CatBoost the evaluation metric is a constructor option (fit() does not accept
# an `eval_metric` keyword), and the metric is spelled "RMSE".
cat_model = CatBoostRegressor(eval_metric='RMSE')
cat_model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              verbose=20,                 # log every 20 iterations
              early_stopping_rounds=30)   # stop after 30 rounds without improvement

In [None]:
def plotImp(model, X, num=20, fig_size=(40, 20), title='LightGBM Features',
            save_path='lgbm_importances-01.png'):
    """Plot the `num` largest feature importances of a fitted model as a bar chart.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    X : DataFrame whose ``columns`` name the features, in the order the model saw them.
        Only the column names are used.
    num : int, number of top features to show.
    fig_size : (width, height) of the figure in inches.
    title : str, plot title. Defaults to the original hard-coded 'LightGBM Features';
        pass a different title when plotting another model.
    save_path : path the figure is written to, or None to skip saving.
        Defaults to the original hard-coded file name.

    Notes
    -----
    ``sns.set(font_scale=5)`` changes seaborn's global theme, so plots drawn
    after this call are affected too.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': X.columns})
    # Highest importance first, truncated to the requested number of rows.
    top_features = feature_imp.sort_values(by="Value", ascending=False).head(num)

    plt.figure(figsize=fig_size)
    sns.set(font_scale=5)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title(title)
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()

In [None]:
# Show the 20 most important features (plotImp's default `num`) of the LightGBM
# model; X_valid is passed only to supply the feature names via its columns.
plotImp(model, X_valid)

In [None]:
#### Code for a K-fold split (needs `from sklearn.model_selection import KFold`). This can't run on a Kaggle kernel due to memory limits ####

#n_splits = 4
#kf = KFold(n_splits=n_splits, shuffle=True)
#models = []
#for i, (train_index, test_index) in enumerate(kf.split(X, y)):
#    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
#    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
#    
#    model = LGBMRegressor(
#        objective="regression",
#        metric="rmse",
#        boosting_type="gbdt",
#    )
#    
#    model.fit(X_train, y_train,eval_set=[(X_valid, y_valid)], 
#            early_stopping_rounds=100)
#    models.append(model)
#    
#    print(f'Trained {i}/{n_splits} models')

#### Look at sample submission and example test csvs



In [None]:
# Load the example API input and the matching submission template,
# then show the first two rows of each.
csv_names = ('example_test.csv', 'example_sample_submission.csv')
example_test, sample_sub = (pd.read_csv(SAMPLE_TEST_PATH / name) for name in csv_names)
for frame in (example_test, sample_sub):
    display(frame.head(2))

In [None]:
# Competition inference loop: the environment yields one (test batch, submission
# template) pair at a time and expects env.predict() to be called for each pair.
import ubiquant

env = ubiquant.make_env()
iter_test = env.iter_test()
# Use exactly the columns (and column order) the models were trained on, rather
# than relying on "everything except row_id" matching the training features.
feature_columns = X_valid.columns
for (test_df, sample_prediction_df) in iter_test:
    features = test_df[feature_columns]
    # Simple average of the two models. An estimator object is not callable,
    # so CatBoost must be queried through .predict() just like LightGBM.
    pred = (model.predict(features) + cat_model.predict(features)) / 2
    sample_prediction_df['target'] = pred
    env.predict(sample_prediction_df)