In [1]:
import ast
import multiprocessing

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from kaggle.competitions import twosigmanews
from sklearn.model_selection import GridSearchCV, train_test_split


In [2]:
# Derived worker count: twice the reported CPU count, minus one.
# NOTE(review): the printed label says "CPUs", but the value is this derived
# figure, not the raw count returned by multiprocessing.cpu_count().
cpu_count = multiprocessing.cpu_count() * 2 - 1
print('Number of CPUs: {}'.format(cpu_count))

Number of CPUs: 7


In [3]:
# Create the competition environment object used for both the training data
# and the prediction loop further down. The call prints its own loading message.
env = twosigmanews.make_env()
print('Done!')

Loading the data... This could take a minute.
Done!


In [4]:
# Fetch the market and news training frames from the competition environment.
market_train_df, news_train_df = env.get_training_data()

In [5]:
# Keep references to the original `time` and `universe` columns before the
# market frame is modified by the feature-engineering cells below.
time, universe = market_train_df['time'], market_train_df['universe']

In [6]:
# Display the (rows, columns) shape of the market and news frames.
market_train_df.shape, news_train_df.shape

((4072956, 16), (9328750, 35))

In [7]:
# NOTE(review): these are the same two assignments as the In [5] cell above;
# re-running them is harmless but redundant.
universe, time = market_train_df['universe'], market_train_df['time']

In [8]:
def create_time_features(market_train_df, news_train_df):
    """Add calendar features to the market frame and reduce timestamps to dates.

    Both frames are modified in place and are also returned.

    Market frame: gains ``hour``, ``month`` and ``dayofweek`` columns derived
    from ``time``; ``time`` itself is then replaced by its calendar date.
    News frame: ``firstCreated`` is replaced by its calendar date, and the
    ``time`` and ``sourceTimestamp`` columns are removed.

    Returns:
        The ``(market_train_df, news_train_df)`` pair that was passed in.
    """
    # Grab the datetime accessor once, before `time` is overwritten below.
    market_ts = market_train_df['time'].dt
    for part in ('hour', 'month', 'dayofweek'):
        market_train_df[part] = getattr(market_ts, part)
    market_train_df['time'] = market_ts.date

    news_train_df['firstCreated'] = news_train_df['firstCreated'].dt.date
    for unused in ('time', 'sourceTimestamp'):
        del news_train_df[unused]

    return market_train_df, news_train_df

In [9]:
def join_news_market(market_train_df, news_train_df):
    """Average the news per (date, asset) and left-join it onto the market rows.

    Args:
        market_train_df: market frame with ``time`` and ``assetCode`` columns.
        news_train_df: news frame with ``firstCreated`` and ``assetCodes``
            columns; ``assetCodes`` must already hold a single code per row.

    Returns:
        ``(merged_market_df, aggregated_news_df)``. The merged frame keeps every
        market row; rows with no matching news get NaN in the news columns.
    """
    # numeric_only=True makes the dropping of text columns explicit. Older
    # pandas discarded them silently; pandas >= 2.0 raises a TypeError instead.
    news_train_df = (
        news_train_df
        .groupby(['firstCreated', 'assetCodes'], as_index=False)
        .mean(numeric_only=True)
    )
    market_train_df = pd.merge(
        market_train_df,
        news_train_df,
        how='left',
        left_on=['time', 'assetCode'],
        right_on=['firstCreated', 'assetCodes'],
    )

    return market_train_df, news_train_df

In [10]:
market_train_df, news_train_df = create_time_features(market_train_df, news_train_df)

# Each `assetCodes` value is the string form of a collection of asset codes
# (presumably a set literal -- confirm against the raw data). Parse it with
# ast.literal_eval rather than eval so no arbitrary code can run, and take the
# smallest code so the choice is deterministic. `list(a_set)[0]` depends on
# string-hash randomisation and can change between kernel restarts.
# NOTE(review): only one asset per news item is kept; items mentioning several
# assets are not joined to the others.
news_train_df['assetCodes'] = news_train_df['assetCodes'].map(
    lambda codes: min(ast.literal_eval(codes))
)

market_train_df, news_train_df = join_news_market(market_train_df, news_train_df)

In [11]:
# Display the shapes again, after the news aggregation and the merge.
market_train_df.shape, news_train_df.shape

((4072956, 45), (2222939, 26))

In [12]:
# Print the column list and dtypes of the merged market frame.
market_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4072956 entries, 0 to 4072955
Data columns (total 45 columns):
time                        object
assetCode                   object
assetName                   category
volume                      float64
close                       float64
open                        float64
returnsClosePrevRaw1        float64
returnsOpenPrevRaw1         float64
returnsClosePrevMktres1     float64
returnsOpenPrevMktres1      float64
returnsClosePrevRaw10       float64
returnsOpenPrevRaw10        float64
returnsClosePrevMktres10    float64
returnsOpenPrevMktres10     float64
returnsOpenNextMktres10     float64
universe                    float64
hour                        int64
month                       int64
dayofweek                   int64
firstCreated                object
assetCodes                  object
urgency                     float64
takeSequence                float64
bodySize                    float64
companyCount                floa

In [13]:
def get_x(market_train_df):
    """Return the model feature columns of a merged market/news frame.

    Identifier, timestamp and bookkeeping columns are dropped, and so is the
    target ``returnsOpenNextMktres10``. Leaving the target in would train the
    model on its own label.

    Args:
        market_train_df: merged frame; excluded columns may be absent.

    Returns:
        A DataFrame with the remaining columns, in their original order.
    """
    non_feature_cols = {
        'assetName', 'time_y', 'time_x', 'firstCreated',
        'universe', 'time', 'assetCode', 'assetCodes',
        'returnsOpenNextMktres10',  # the prediction target, never a feature
    }

    feature_cols = [col for col in market_train_df.columns
                    if col not in non_feature_cols]
    return market_train_df[feature_cols]

In [16]:
def get_data(market_train_df):
    """Split a merged training frame into features, binary labels and raw returns.

    Returns:
        ``(X, y, r)``: ``X`` is whatever ``get_x`` selects, ``r`` is the
        ``returnsOpenNextMktres10`` column, and ``y`` is the boolean mask
        ``r >= 0``.
    """
    target_returns = market_train_df['returnsOpenNextMktres10']
    is_non_negative = market_train_df['returnsOpenNextMktres10'] >= 0
    features = get_x(market_train_df)
    return features, is_non_negative, target_returns

In [17]:
# Build the feature matrix X, the boolean labels y and the raw target returns r.
X, y, r = get_data(market_train_df)

In [18]:
# Random 80/20 hold-out split with a fixed seed.
# NOTE(review): rows are shuffled across dates, so validation rows can be
# older than training rows.
X_train, X_Val, y_train, y_Val, r_train, r_valid = train_test_split(
    X, y, r,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Hyper-parameters passed to xgb.XGBClassifier.
# `learning_rate` is the scikit-learn wrapper's documented name for the step
# size. `eta` is the native-API alias and, when passed through **kwargs, may
# conflict with the wrapper's own `learning_rate` default.
xgb_params = {
    'n_jobs': 4,
    'n_estimators': 300,
    'max_depth': 8,
    'learning_rate': 0.2,
}

In [20]:
# Instantiate the (unfitted) classifier from the parameter dict above.
model_xgb = xgb.XGBClassifier(**xgb_params)

In [None]:
# Fit on the training split, reporting log-loss on both the training and the
# validation split each round. early_stopping_rounds=3 is a short patience
# window, so training may stop well before n_estimators.
model_xgb.fit(
    X_train,
    y_train,
    eval_set=[
        (X_train, y_train),
        (X_Val, y_Val),
    ],
    eval_metric='logloss',
    early_stopping_rounds=3,
    verbose=True,
)

In [None]:
days = env.get_prediction_days()

for (market_obs_df, news_obs_df, predictions_template_df) in days:

    # Apply the same preprocessing as for the training frames.
    market_obs_df, news_obs_df = create_time_features(market_obs_df, news_obs_df)
    # Parse with ast.literal_eval (not eval) and pick the smallest code so the
    # choice is deterministic; `list(a_set)[0]` is not stable across runs.
    news_obs_df['assetCodes'] = news_obs_df['assetCodes'].map(
        lambda codes: min(ast.literal_eval(codes))
    )

    market_obs_df, news_obs_df = join_news_market(market_obs_df, news_obs_df)

    # Align to the training feature columns (same set, same order). Any column
    # missing from this day's frame is filled with NaN. A separate name is used
    # so the training matrix `X` is not overwritten.
    X_obs = get_x(market_obs_df).reindex(columns=X_train.columns)

    # `predict` returns class labels, which can never yield a negative
    # confidence. Map the probability of the positive class (non-negative
    # return) from [0, 1] onto [-1, 1] instead.
    up_probability = model_xgb.predict_proba(X_obs)[:, 1]
    predictions_template_df.confidenceValue = np.clip(2 * up_probability - 1, -1, 1)

    env.predict(predictions_template_df)

In [None]:
# Disabled LightGBM experiment: the whole cell is one bare string literal, so
# none of the code inside it executes.
# NOTE(review) before re-enabling:
#   * the objective is 'regression_l1', but y_train / y_Val are the boolean
#     labels from get_data -- confirm whether the raw returns were intended;
#   * `evals_result` is created but never passed to lgb.train.
'''
dtrain = lgb.Dataset(X_train.values, y_train, feature_name=X_train.columns, categorical_feature=[], free_raw_data=False)
dvalid = lgb.Dataset(X_Val.values, y_Val, feature_name=X_Val.columns, categorical_feature=[], free_raw_data=False)

lgb_params =dict(
objective = 'regression_l1',
    learning_rate = 0.1,
    num_leaves = 127,
    max_depth = -1,
#     min_data_in_leaf = 1000,
#     min_sum_hessian_in_leaf = 10,
    bagging_fraction = 0.75,
    bagging_freq = 2,
    feature_fraction = 0.5,
    lambda_l1 = 0.0,
    lambda_l2 = 1.0,
    seed = 42 # Change for better luck! 
)

# Fit the best algorithm to the data. 
evals_result = {}

lgb_model = lgb.train(lgb_params, dtrain, num_boost_round=1000, valid_sets=(dvalid,), valid_names=('valid',), verbose_eval=25)


# prediction
lgb_predictions = lgb_model.predict(X_Val)
'''

In [None]:
# Disabled XGBoost grid-search experiment: the whole cell is one bare string
# literal, so none of the code inside it executes.
# NOTE(review) before re-enabling:
#   * `xgb.plot_importance(model)` uses `model`, which is not defined in the
#     visible notebook (probably meant `xgb_model`);
#   * `X_val` does not match the `X_Val` name produced by train_test_split;
#   * the search is fitted on the full X / y, so the hold-out split is not
#     held out from it.
'''
params = {
        'min_child_weight': [1, 5],
        'gamma': [0.5, 1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.6, 1.0],
        'max_depth': [4, 5]
        }

# Initialize XGB and GridSearch
xgb_model = xgb.XGBRegressor()

grid = GridSearchCV(xgb_model, params, n_jobs=multiprocessing.cpu_count(), verbose=10, cv=3)
grid.fit(X, y)
# Set the clf to the best combination of parameters
xgb_model = grid.best_estimator_
print(xgb_model)
xgb.plot_importance(model)
# Fit the best algorithm to the data. 
xgb_model.fit(X, y)
# prediction
xgb_predictions = xgb_model.predict(X_val)
'''

In [None]:
'''
plt.figure(figsize=(20,5))

plt.subplot(1,2,1)
plt.plot(lgb_predictions[:100], 'b', label='predicted')
plt.plot(y_Val[:100], 'r', label='True')
plt.legend()
plt.title('Prediction using lightgbm', fontsize=20)

plt.subplot(1,2,2)
plt.plot(lgb_predictions[:100], 'b', label='predicted')
plt.plot(y_Val[:100], 'r', label='True')
plt.legend()
plt.title('Prediction using xgboost', fontsize=20)
'''

In [None]:
# TODO: the ensembling step is still missing (not implemented yet).