In [None]:
from kaggle.competitions import twosigmanews
# make_env() can only be called once per kernel session, so keep this handle:
# it is reused below for the training data, the prediction days and the submission file.
env = twosigmanews.make_env()

In [None]:
# Pull the market and news training frames out of the competition environment.
market_train_df, news_train_df = env.get_training_data()

# Start of the first stacking period. This is a fixed constant required by the
# stacking scheme (not a hyperparameter): earlier rows are discarded.
on_or_after_start = market_train_df['time'] >= '2010-01-01 22:00:00+0000'
market_train_df = market_train_df.loc[on_or_after_start]

<h1>Basic data cleaning</h1>

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from time import time
from matplotlib import pyplot as plt

In [None]:
def prepare_data(market_obs_df):
    """
    Baseline data-cleaning procedure. Modifies ``market_obs_df`` in place.

    Steps:
        1. Reset the index to 0..n-1 (the previous index is dropped).
        2. Append three helper columns, in this order:
           ``close_to_open`` (``|close / open|``), ``assetName_mean_open`` and
           ``assetName_mean_close`` (per-``assetName`` means of the raw prices).
        3. For rows whose close/open ratio is implausible (>= 2 or <= 0.5),
           replace whichever of ``open`` / ``close`` lies further from its
           per-asset mean with that mean.

    Columns are addressed by name rather than by position, because callers pass
    frames with different layouts (for example the stacking cells insert a
    ``period`` column at position 1, which shifts every positional index).

    Args:
        market_obs_df (pandas.DataFrame): market data, e.g. from
            env.get_training_data(); must contain the columns ``assetName``,
            ``open`` and ``close``.

    Returns:
        None. The frame is updated in place; pass a copy to keep the original.

    Missing values are not handled here.
    """
    start_time = time()

    market_obs_df.reset_index(drop=True, inplace=True)

    market_obs_df['close_to_open'] = np.abs(market_obs_df['close'] / market_obs_df['open'])

    # Per-asset means are taken from the raw prices, before any repair below.
    prices_by_asset = market_obs_df.groupby('assetName')
    mean_open = prices_by_asset['open'].transform('mean')
    mean_close = prices_by_asset['close'].transform('mean')
    market_obs_df['assetName_mean_open'] = mean_open
    market_obs_df['assetName_mean_close'] = mean_close

    # A close more than twice (or less than half) the open is treated as a data error.
    ratio = market_obs_df['close_to_open']
    suspicious = (ratio >= 2) | (ratio <= 0.5)

    # If the open price is further from its per-asset mean than the close price,
    # the open is replaced; otherwise the close is. A NaN on either side makes
    # the comparison False, so such rows fall into the "replace close" branch.
    open_gap = (mean_open - market_obs_df['open']).abs()
    close_gap = (mean_close - market_obs_df['close']).abs()
    open_is_worse = open_gap > close_gap

    fix_open = suspicious & open_is_worse
    fix_close = suspicious & ~open_is_worse
    market_obs_df.loc[fix_open, 'open'] = mean_open[fix_open]
    market_obs_df.loc[fix_close, 'close'] = mean_close[fix_close]

    print("TIME: %.2f for cleaning data" % (time()-start_time))
    

In [None]:
# Sanity check: prepare_data must add its helper columns to the copy it is given
# while the original training frame keeps its column count.
cleaned_copy = market_train_df.copy()
prepare_data(cleaned_copy)
assert len(cleaned_copy.columns) != len(market_train_df.columns)
del cleaned_copy

TODO: running the linear regressor on the uncleaned features fails with the message "You might need a way of handling missing values, such as pandas.DataFrame.fillna or sklearn.preprocessing.Imputer. See our Missing Values tutorial for more details." — the missing values have to be filled before fitting.

-----

<h1>Models</h1>

<h1>Linear regressor</h1>

In [None]:
def model_lr(X, Y):
    """
    Fit the level-1 linear regression (the core model used for final predictions).

    ``X`` is cleaned in place by prepare_data, the target is clipped to its
    0.1% / 99.9% quantiles, identifier/target columns are removed from the
    features and remaining NaNs in features and target are replaced by 0.

    Args:
        X: pandas.df with the market features (pass a copy to keep the original)
        Y: pandas Series with the target values
    Return:
        model, results (None)
    """
    prepare_data(X)

    lower_bound, upper_bound = Y.quantile(0.001), Y.quantile(0.999)
    Y = Y.clip(lower_bound, upper_bound)
    Y.reset_index(drop=True, inplace=True)

    # Everything except identifiers, timestamps, the target and the period label is a feature.
    non_features = ['assetCode', 'assetName', 'time', 'returnsOpenNextMktres10', 'period']
    features = X.loc[:, ~X.columns.isin(non_features)].fillna(0)
    target = Y.fillna(0)

    from sklearn.linear_model import LinearRegression
    lr = LinearRegression()
    lr.fit(features.values, target)

    return lr, None

def linear_regressor(lr, X_test):
    """Predict with a fitted linear regression (see model_lr).
    Args:
        lr: fitted model exposing ``predict`` (as returned by model_lr)
        X_test: pandas.df, not values; it is cleaned in place by prepare_data,
            so pass a copy if the original is still needed
    Return:
        predictions, clipped to [-1, 1]
    """
    prepare_data(X_test)

    # Remove the non-feature columns that are present; test frames may lack some of them.
    excluded = ('assetCode', 'assetName', 'time', 'returnsOpenNextMktres10', 'period')
    present = [name for name in excluded if name in X_test.columns]
    features = X_test.drop(columns=present).fillna(0)

    raw_predictions = lr.predict(features.values)
    return raw_predictions.clip(-1, 1)

<h2>Example</h2>
<p>This is how to use the model for stacking, with linear_regressor.</p>

In [None]:
# Hold-out check for the level-1 linear regression: fit on the first 80% of the
# rows and score the remaining 20%.
from sklearn.metrics import r2_score

X = market_train_df.drop('returnsOpenNextMktres10', axis=1)
Y = market_train_df['returnsOpenNextMktres10']
split = int(len(X) * 0.8)
X_train, Y_train = X[:split], Y[:split]
X_test, Y_test = X[split:], Y[split:]

# model_lr and linear_regressor clean their input in place, so hand them copies.
lr, _ = model_lr(X_train.copy(), Y_train.copy())
pred_lr = linear_regressor(lr, X_test.copy())

# The value printed is R^2 (r2_score), not the competition's sigma score, so it
# is labelled accordingly (same wording as the stacking loop further down).
print('Test r2 score for linreg in block %d is %f' % (0, r2_score(Y_test.values, pred_lr)))

<h1>Full model predictions</h1>
<p>This is the model used to make predictions after stacking.</p>

In [None]:
# Fit the linear model on the whole training frame. The target column stays in X
# here because model_lr removes it from the features itself; copies are passed
# because model_lr cleans its input in place.
X = market_train_df
Y = market_train_df['returnsOpenNextMktres10']
model00, training_results00 = model_lr(X.copy(), Y.copy())

In [None]:
# Quick inspection: show the type of the fitted level-1 linear model.
type(model00)

<h2>LGB_0629</h2>

In [None]:
def model_lgb_0629(X, Y):
        """
        Train the LightGBM "lgb_0629" core model (used for real predictions).

        Steps performed here:
            * X is cleaned in place by prepare_data and Y is clipped to its
              0.1% / 99.9% quantiles.
            * The last 20% of the rows are held out for validation, with a gap
              of 2000 rows left unused between the training and validation parts.
            * Validation rows are restricted to universe == 1.
            * Training uses the custom sigma_score metric with early stopping.

        Args:
            X: features (pandas DataFrame); must contain the 'universe' and
               'time' columns. It is modified in place, so pass a copy.
            Y: label (pandas Series)
        Returns:
            model: lightgbm instance
            training_results: dict with training results (filled through
                the evals_result argument of lgb.train)
        """
        import lightgbm as lgb
        start_time = time()
        
        prepare_data(X) 
        Y = Y.clip(Y.quantile(0.001), Y.quantile(0.999))
        Y.reset_index(drop=True, inplace=True)
        
        def sigma_score(preds, valid_data):
            """Custom eval metric: mean / std of the per-time sums of preds * labels.

            Returns the (name, value, is_higher_better) tuple passed back to lgb.train.
            """
            df_time = valid_data.params['extra_time'] # will be injected afterwards
            labels = valid_data.get_label()

        #    assert len(labels) == len(df_time)

            x_t = preds * labels #  * df_valid['universe'] -> Here we take out the 'universe' term because we already keep only those equals to 1.

            # Here we take advantage of the fact that `labels` (used to calculate `x_t`)
            # is a pd.Series and call `group_by`
            # NOTE(review): this only works if get_label() hands back the pandas
            # Series given to lgb.Dataset; confirm for the installed LightGBM version.
            x_t_sum = x_t.groupby(df_time).sum()
            score = x_t_sum.mean() / x_t_sum.std()

            return 'sigma_score', score, True
        
        #split train validation
        # The 2000 rows just before the split point are used by neither part.
        split = int(len(X) * 0.8)
        test_train_distsance = 2000
        X_train, X_val = X[:split - test_train_distsance], X[split:]
        Y_train, Y_val = Y[:split - test_train_distsance], Y[split:]

        #take out universe = 0 from validation
        universe_filter = X['universe'][split:] == 1.0
        X_val = X_val[universe_filter]
        Y_val = Y_val[universe_filter]

        # this is a time_val series used to calc the sigma_score later, applied split and universe filter
        time_val = X['time'][split:][universe_filter]
        assert len(time_val) == len(X_val)
        time_train = X['time'][:split - test_train_distsance]
        assert len(time_train) == len(X_train)

        # Drop identifier / target / period columns. The mask mixes X_train.columns
        # and X.columns; both are the same column set because X_train and X_val
        # are row slices of X.
        X_train = X_train.iloc[:, (X_train.columns != 'assetCode') 
                   & (X.columns != 'assetName') 
                   & (X.columns != 'time') 
                   & (X.columns != 'returnsOpenNextMktres10')
                   & (X.columns != 'period')]

        X_val = X_val.iloc[:, (X_val.columns != 'assetCode') 
                   & (X.columns != 'assetName') 
                   & (X.columns != 'time') 
                   & (X.columns != 'returnsOpenNextMktres10')
                   & (X.columns != 'period')]

        assert len(X_train.columns) == len(X_val.columns)

        train_cols = X_train.columns.tolist()

        lgb_train = lgb.Dataset(X_train.values, Y_train, feature_name=train_cols, free_raw_data=False)
        lgb_val = lgb.Dataset(X_val.values, Y_val, feature_name=train_cols, free_raw_data=False)

        # Attach the integer-coded timestamps to each Dataset so that sigma_score
        # can read them back through valid_data.params['extra_time'].
        lgb_train.params = {
            'extra_time' : time_train.factorize()[0]
        }
        lgb_val.params = {
            'extra_time' : time_val.factorize()[0]
        }

        # Order: learning_rate, num_leaves, min_data_in_leaf, num_iteration, max_bin.
        # x_1[3] is currently unused because 'num_iteration' is hard-coded below.
        x_1 = [0.19000424246380565, 2452, 212, 328, 202]
        #this is from eda script 67
        lgb_params = {
                'task': 'train',
                'boosting_type': 'gbdt',
                'objective': 'regression_l1',
        #         'objective': 'regression',
                'learning_rate': x_1[0],
                'num_leaves': x_1[1],
                'min_data_in_leaf': x_1[2],
        #         'num_iteration': x_1[3],
                'num_iteration': 239,
                'max_bin': x_1[4],
                'verbose': 1,
                'lambda_l1': 0.0,
                'lambda_l2' : 1.0,
                'metric':'None'
        }

        # NOTE(review): 'num_iteration' is set in lgb_params while num_boost_round=1000
        # is also passed below; confirm which one LightGBM actually applies.
        training_results = {}
        model = lgb.train(lgb_params, lgb_train, num_boost_round=1000, valid_sets=(lgb_val,lgb_train), valid_names=('valid','train'), verbose_eval=25,
                      early_stopping_rounds=20, feval=sigma_score, evals_result=training_results)

        print("\n\nTIME - lgb_0629: {}".format(time()-start_time))
        
        del X, Y, X_train, Y_train, X_val, Y_val
        return model, training_results

    
def lgb_0629(model, X_test):
    """
    Predict with the LightGBM model that scored 0.629 on the public LB
    (trained with the parameters from script 67, see model_lgb_0629).
    Args:
        model: trained lgb model
        X_test: features (not values); cleaned in place by prepare_data, so
            pass a copy if the original is still needed
    Returns:
        prediction: Y_test, clipped to [-1, 1]
    """
    prepare_data(X_test)

    # Keep only the feature columns: identifiers, timestamp, target and period are dropped.
    non_features = ['assetCode', 'assetName', 'time', 'returnsOpenNextMktres10', 'period']
    feature_frame = X_test.loc[:, ~X_test.columns.isin(non_features)]

    return model.predict(feature_frame.values).clip(-1, 1)

<h2>Example</h2>
<p>This is how to use the model for stacking, with the lgb_0629 function.</p>

In [None]:
# Hold-out check for the level-1 LightGBM model: fit on the first 80% of the
# rows and score the remaining 20%.
from sklearn.metrics import r2_score

X, Y = market_train_df, market_train_df['returnsOpenNextMktres10']

split = int(len(X) * 0.8)
X_train, Y_train = X[:split], Y[:split]
X_test, Y_test = X[split:], Y[split:]

# Both functions clean their input in place, so hand them copies.
model, results = model_lgb_0629(X_train.copy(), Y_train.copy())
predictions = lgb_0629(model, X_test.copy())

# The value printed is R^2 (r2_score), not the sigma score, so it is labelled
# accordingly; the sigma score per boosting round is what gets plotted below.
print('Test r2 score for lgb in block %d is %f' % (0, r2_score(Y_test.values, predictions)))

plt.figure(figsize=(8,4))
plt.plot(results['train']['sigma_score'])
plt.plot(results['valid']['sigma_score'])

<h2>Full-model predictions</h2>
<p>This is the model used to make predictions after stacking.</p>

In [None]:
# Train LightGBM on the whole training frame. Copies are passed because
# model_lgb_0629 cleans its input in place.
X = market_train_df
Y = market_train_df['returnsOpenNextMktres10']
model01, training_results01 = model_lgb_0629(X.copy(), Y.copy())

# Sigma score per boosting round on the training and validation parts.
plt.figure(figsize=(8, 4))
plt.plot(training_results01['train']['sigma_score'])
plt.plot(training_results01['valid']['sigma_score'])

<h1>First level models stacking</h1>

Validation in presence of time component

**f) KFold scheme in time series**

In time-series task we usually have a fixed period of time we are asked to predict. Like day, week, month or arbitrary period with duration of T.

Split the train data into chunks of duration T. Select first M chunks.
Fit N diverse models on those M chunks and predict for the chunk M+1. Then fit those models on first M+1 chunks and predict for chunk M+2 and so on, until you hit the end. After that use all train data to fit models and get predictions for test. Now we will have meta-features for the chunks starting from number M+1 as well as meta-features for the test.
Now we can use meta-features from first K chunks [M+1,M+2,..,M+K] to fit level 2 models and validate them on chunk M+K+1. Essentially we are back to step 1. with the lesser amount of chunks and meta-features instead of features.

T = 6 months
* 14 periods in Train
* 3 periods in LB
* 1 period in pLB

Build meta features on train periods: [8, 9, 10, 11, 12, 13]

<h2>Building training meta-features</h2>

Let's first add periods so that data manipulation will be easier

In [None]:
# From here on the label column is called 'target'.
# NOTE: cells above that read 'returnsOpenNextMktres10' from market_train_df
# will fail if they are re-run after this rename.
market_train_df = market_train_df.rename(columns={'returnsOpenNextMktres10':'target'})

In [None]:
# Boundaries of the stacking periods: every 1 January and 15 June at 22:00 UTC
# from 2010 through 2016, closed off by 1 January 2017 (15 boundaries, 14 periods).
periods = [
    '{}-{} 22:00:00+0000'.format(year, month_day)
    for year in range(2010, 2017)
    for month_day in ('01-01', '06-15')
]
periods.append('2017-01-01 22:00:00+0000')

In [None]:
# -1 marks rows that have not been assigned to a period yet.
market_train_df['period'] = -1

In [None]:
# Label every row with the index of the window [start, end) its timestamp falls in.
# tqdm wraps the list itself (not the enumerate object) so that the progress
# bar knows how many periods there are.
period_bounds = list(zip(periods[:-1], periods[1:]))
for i, (period_start, period_end) in enumerate(tqdm(period_bounds)):
    in_period = (market_train_df['time'] >= period_start) & (market_train_df['time'] < period_end)
    market_train_df.loc[in_period, 'period'] = i

In [None]:
# Reorder the column list so that 'period' (currently last) becomes the second column.
cols = market_train_df.columns.tolist()
cols.insert(1, cols.pop())

In [None]:
# Level-1 training frame: the training data with the columns in the order given by `cols`.
X_train_level1 = market_train_df[cols]

In [None]:
# Split the label off the level-1 frame.
y_train_level1 = X_train_level1['target']
X_train_level1 = X_train_level1.drop(columns='target')

In [None]:
# Sanity check: the 15 boundaries in `periods` define 14 periods, and every row
# must fall in one of them (a leftover -1 would make this 15 distinct values).
assert len(X_train_level1['period'].unique()) == 14

Now that the data set is cleaned, let's build `X_train_level2`.

In [None]:
# Period label of every row that belongs to one of the meta-feature periods 8..13.
in_level2_period = X_train_level1['period'].isin([8, 9, 10, 11, 12, 13])
periods_level2 = X_train_level1.loc[in_level2_period, 'period']

In [None]:
# Targets of the rows in the meta-feature periods 8..13.
level2_rows = X_train_level1['period'].isin([8, 9, 10, 11, 12, 13])
y_train_level2 = y_train_level1.loc[level2_rows]

In [None]:
# Number of level-1 models: one meta-feature column per model.
level1_models = 2
# Placeholder for the meta-features, one row per level-2 target; filled in by the stacking loop.
X_train_level2 = np.zeros((len(y_train_level2), level1_models))

In [None]:
# Fill `X_train_level2` with meta-features, one block (period) at a time.
#
# For every block in 8..13:
#   1. train on all rows from earlier periods, predict the rows of this block;
#   2. fit the linear regression and predict;
#   3. fit LightGBM and predict;
#   4. store both predictions in the rows of `X_train_level2` that belong to the
#      block (column 0: linear regression, column 1: LightGBM). The column order
#      must match the one used for the level-2 features at prediction time.
from sklearn.metrics import r2_score
for cur_block_num in tqdm([8, 9, 10, 11, 12, 13]):

    print(cur_block_num)

    earlier_rows = X_train_level1['period'] < cur_block_num
    block_rows = X_train_level1['period'] == cur_block_num

    cur_block_X, cur_block_Y = X_train_level1[earlier_rows], y_train_level1[earlier_rows]
    cur_block_X_test, cur_block_Y_test = X_train_level1[block_rows], y_train_level1[block_rows]

    #TODO there is NA in values

    # MODEL 1: linear regression
    block_model00, results = model_lr(cur_block_X.copy(), cur_block_Y.copy())
    pred_lr = linear_regressor(block_model00, cur_block_X_test.copy())
    print('Test r2 score for linreg in block %d is %f' % (cur_block_num, r2_score(cur_block_Y_test, pred_lr)))

    # MODEL 2: LightGBM (lgb_0629), using the 'script 67' params
    block_model01, results = model_lgb_0629(cur_block_X.copy(), cur_block_Y.copy())
    pred_lgb = lgb_0629(block_model01, cur_block_X_test.copy())
    print('Test r2 score for lgb_0629 in block %d is %f' % (cur_block_num, r2_score(cur_block_Y_test, pred_lgb)))

    # Two columns, one per level-1 model, written into this block's rows.
    cur_block_X_train_level2 = np.column_stack([pred_lr, pred_lgb])
    X_train_level2[periods_level2 == cur_block_num] = cur_block_X_train_level2

In [None]:
# Scatter of the two level-1 predictions against each other, one colour per
# period; values are clipped to [-0.04, 0.04] so outliers do not stretch the axes.
plt.figure(figsize=(15,10))
for block in (9, 8, 10, 11, 12, 13):
    block_preds = X_train_level2[periods_level2 == block].clip(-0.04, 0.04)
    plt.scatter(block_preds[:, 0], block_preds[:, 1], alpha=0.3)
plt.title("first level predictions")
plt.xlabel("Predictions of model 0")
plt.ylabel("Predictions of model 1")
# Diagonal: points on this line are where both models predict the same value.
plt.plot([-0.04, 0.04], [-0.04, 0.04])

In [None]:
# `results` is whatever the stacking loop assigned last, i.e. the LightGBM
# training history of the final block (13): sigma score per boosting round.
plt.figure(figsize=(15,10))
plt.plot(results['train']['sigma_score'])
plt.plot(results['valid']['sigma_score'])
# Open question from the author: are these scores suspiciously good?

<h1>Stacking</h1>
Now let's move to level 2 and build a model that stacks the predictions of the level-1 models.

In [None]:
# Sanity check: the level-2 features and targets must have the same number of rows.
assert X_train_level2.shape[0] == len(y_train_level2)

In [None]:
# Level-2 (meta) model: a plain linear regression over the level-1 predictions.
from sklearn.linear_model import LinearRegression
level2_model = LinearRegression()

In [None]:
# Fit the meta-model on the meta-features (column 0: linear regression,
# column 1: LightGBM) built by the stacking loop.
level2_model.fit(X_train_level2, y_train_level2)

## Main Loop
Let's loop through all the days and make our predictions.  The `days` generator (returned from `get_prediction_days`) will simply stop returning values once you've reached the end.

In [None]:
days = env.get_prediction_days()
for market_obs_df, news_obs_df, predictions_template_df in days:

    # The models were trained with a 'universe' column, so add one here as well.
    market_obs_df['universe'] = 1

    # MODEL 00 PREDICTIONS (linear regression); a copy is passed because the
    # prediction helpers clean their input in place.
    pred_model00 = linear_regressor(model00, market_obs_df.copy())

    # MODEL 01 PREDICTIONS (LightGBM)
    pred_model01 = lgb_0629(model01, market_obs_df.copy())

    # META-MODEL PREDICTIONS: level-2 features, one column per level-1 model.
    features_level2 = np.column_stack([pred_model00, pred_model01])

    #with stacking
    #predictions_template_df.confidenceValue = level2_model.predict(features_level2).clip(-1 , 1)

    #without stacking: only the LightGBM prediction is submitted
    predictions_template_df.confidenceValue = pred_model01

    env.predict(predictions_template_df)

print('Done!')

## **`write_submission_file`** function

Writes your predictions to a CSV file (`submission.csv`) in the current working directory.

In [None]:
# Write the predictions collected by env.predict() to submission.csv in the working directory.
env.write_submission_file()

In [None]:
# We've got a submission file! List every entry in the working directory whose name contains '.csv'.
import os
csv_names = [entry for entry in os.listdir('.') if '.csv' in entry]
print(csv_names)

As indicated by the helper message, calling `write_submission_file` on its own does **not** make a submission to the competition.  It merely tells the module to write the `submission.csv` file as part of the Kernel's output.  To make a submission to the competition, you'll have to **Commit** your Kernel and find the generated `submission.csv` file in that Kernel Version's Output tab (note this is _outside_ of the Kernel Editor), then click "Submit to Competition".  When we re-run your Kernel during Stage Two, we will run the Kernel Version (generated when you hit "Commit") linked to your chosen Submission.

## Restart the Kernel to run your code again
In order to combat cheating, you are only allowed to call `make_env` or iterate through `get_prediction_days` once per Kernel run.  However, while you're iterating on your model it's reasonable to try something out, change the model a bit, and try it again.  Unfortunately, if you try to simply re-run the code, or even refresh the browser page, you'll still be running on the same Kernel execution session you had been running before, and the `twosigmanews` module will still throw errors.  To get around this, you need to explicitly restart your Kernel execution session, which you can do by pressing the Restart button in the Kernel Editor's bottom Console tab:
![Restart button](https://i.imgur.com/hudu8jF.png)