In [1]:
import os
import gc  #This is garbage collector 
import sys #System 
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pickle

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline
sns.set_style('darkgrid')
pd.set_option('display.max_columns', 140)

from sklearn.preprocessing import StandardScaler as scale
from sklearn.decomposition import PCA
from sklearn.cluster import k_means

In [19]:
df = pd.read_csv("train.csv")
df_backup = df

#Dataset length 
org_len = len(df)

#Saving memory by changing the dtype
for i in df:
    if df[i].dtype == np.float64:
        if (((df[i] < .0001) & (df[i] > -.0001)).mean()) < .001:
            df[i] = df[i].astype(np.float32)
            gc.collect()
            

df.date = df.date.astype(np.int16)
df.ts_id = df.ts_id.astype(np.int32)
df.feature_0 = df.feature_0.astype(np.int32)

#Sorting with respect to date
df.sort_values(by = ['date','ts_id'],inplace = True)

#Create a action column - 1 if the resp is >0 and 0 if resp < 0 
df['action'] = np.where(df['resp']>0 , 1 , 0 )
df.action = df.action.astype("category")



In [20]:
#Chosing the features which have 10% of the data missing - we can remove these features 
df = df_backup 
high_null_list = []
low_null_list = []
null_values = []
for i in df.columns.values:
    null_sum = df[i].isnull().sum()
    if null_sum > 0.1 * org_len:
        high_null_list.append(i)
        null_values.append([i,null_sum])
    elif null_sum < 0.1 * org_len and null_sum > 0 : 
        low_null_list.append(i)

#Dropping the values which have too many nans 
df = df.drop(columns = high_null_list, axis=1)
len(df.columns.values)

# We need to check for the distribution of the remaining features and the number of outilers 
# before we impute the missing values using mean. 

df_backup_pre_fillna = df 

df_mean = df.mean()
#Imputing the low null list features with mean values 
df.fillna(df.mean(),inplace = True)

print("Number of features with null values:",np.sum(df.isna().sum()>0))

Number of features with null values: 0


# PCA

In [1]:
#df_pre_pca = df
#df_pre_pca.to_csv("Byproducts/df_pre_pca.csv")

#df_pre_pca = pd.read_csv("Byproducts/df_pre_pca.csv")
df = df_pre_pca



df_feature = df.loc[:, df.columns.str.contains('feature')]

#First step is to standardize the features 
scaler = scale()
scaler.fit(df_feature)
df_pca = pd.DataFrame(scaler.transform(df_feature), columns = df_feature.columns.values)

#Step 2 is to find the principal components from the transformed data. 
n_components = 30

pca = PCA(n_components = n_components)
pc_names = ['PC'+str(i) for i in range(n_components)]
df_pca = pd.DataFrame(pca.fit_transform(df_pca), columns = pc_names  )

df_pca_backup = df_pca  

df_pca = pd.concat([df_pca, df[df.columns.values[:7]], df[df.columns.values[-2:]]], axis = 1 )

#Saving all files 

# df_pca.to_csv("Byproducts/df_pca.csv")

# with open('Byproducts/high_null_list', 'wb') as fp:
#     pickle.dump(high_null_list, fp)

# with open('Byproducts/pca_object', 'wb') as fp:
#     pickle.dump(pca, fp)

with open('Byproducts/scalar_object', 'wb') as fp:
    pickle.dump(scaler, fp)


NameError: name 'df_pre_pca' is not defined

In [2]:
df_pca = pd.read_csv("Byproducts/df_pca.csv")

X_train = df_pca.loc[:, df_pca.columns.str.contains('PC')]
y_train = df_pca.loc[:, 'action']


from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import optuna
from optuna.samplers import TPESampler

print("Loaded data")

sampler = TPESampler(seed=666)

def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 2, 12)
    n_estimators = trial.suggest_int("n_estimators", 2, 600)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    subsample = trial.suggest_uniform('subsample', 0.0001, 1.0)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.0000001, 1)
    
    model = XGBClassifier(
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=666,
        tree_method='hist'
    )
    return model

def objective(trial):
    model = create_model(trial)
    model.fit(X_train, y_train)
    score = accuracy_score(
        y_train, 
        model.predict(X_train)
    )
    return score

params1 = {
    'max_depth': 8, 
    'n_estimators': 500, 
    'learning_rate': 0.01, 
    'subsample': 0.9, 
    'tree_method': 'hist',
    'random_state': 666,
    'n_jobs': 4 
}

params3 = {
    'max_depth': 10, 
    'n_estimators': 500, 
    'learning_rate': 0.03, 
    'subsample': 0.9, 
    'colsample_bytree': 0.7,
    'tree_method': 'hist',
    'random_state': 666,
    'n_jobs': 4 
}


model1 = XGBClassifier(**params1)
model1.fit(X_train, y_train)

print("Built model 1")

model3 = XGBClassifier(**params3)
model3.fit(X_train, y_train)

print("Built model 3")

model1.predict(X_train[:100])

array([1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [18]:
with open('Byproducts/model1_xgboost', 'wb') as fp:
    pickle.dump(model1, fp)

with open('Byproducts/model3_xgboost', 'wb') as fp:
    pickle.dump(model3, fp)


In [1]:
df_pre_pca

NameError: name 'df_pre_pca' is not defined

# Jane street prediction

In [3]:
#Reading inputs 
df_pre_pca = pd.read_csv("Byproducts/df_pre_pca.csv")

with open ('Byproducts/pca_object', 'rb') as fp:
    pca = pickle.load(fp)
    
with open ('Byproducts/high_null_list', 'rb') as fp:
    high_null_list = pickle.load(fp)
    
with open ('Byproducts/scalar_object', 'rb') as fp:
    scaler = pickle.load(fp)
    
n_components = 30 
pc_names = ['PC'+str(i) for i in range(n_components)]

In [4]:
import janestreet

env = janestreet.make_env()
iter_test = env.iter_test()


In [5]:
cnt = 0 
for (test_df, sample_prediction_df) in iter_test:
    X_test = test_df.loc[:, test_df.columns.str.contains('feature')]

    #preprocess X_test
    X_test = X_test.drop(columns = high_null_list, axis=1)
    X_test.fillna(df_pre_pca.mean(), inplace = True)
    X_test = scaler.transform(X_test[X_test.columns.values])
    X_test = pd.DataFrame(pca.transform(X_test), columns = pc_names  )


    y_preds = model1.predict(X_test) + model3.predict(X_test)
    if y_preds == 2:
        y_preds = np.array([1])
    else:
        y_preds = np.array([0])

    sample_prediction_df.action = y_preds
    env.predict(sample_prediction_df)
    cnt = cnt + 1


In [12]:
with open('Outputs/env', 'wb') as fp:
    pickle.dump(env.predictions, fp)


In [13]:
pd.DataFrame(env.predictions).to_csv("Outputs/predictions.csv")

ValueError: Must pass 2-d input. shape=(15219, 1, 1)

In [14]:
predictions  = env.predictions

In [16]:
pd.DataFrame(env.predictions)

ValueError: Must pass 2-d input. shape=(15219, 1, 1)