In [None]:
# Changes compared to version V10 - 0.70951
# 1. this version fixes the bug in creating lag features in prediction part
# 2. adds lag terms of a few more market features
# 3. restructures the code
# 4. 2-lags is the best
from kaggle.competitions import twosigmanews
import numpy as np
import pandas as pd
import time
import lightgbm as lgb
import math
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.isotonic import IsotonicRegression

from itertools import product

from datetime import datetime

In [None]:
import logging
# Root logger at DEBUG so that the logging.debug() progress messages emitted while merging are shown
logging.getLogger().setLevel(logging.DEBUG)

In [None]:
def generate_split_dates(start_year, end_year):
    """ Build one (first day, last day) pair of 'YYYY-MM-DD' strings per calendar year,
    from start_year through end_year inclusive (e.g. [('2016-01-01','2016-12-31')])
    """
    return [(str(year) + '-01-01', str(year) + '-12-31')
            for year in range(start_year, end_year + 1)]

In [None]:
def split_data_by_date(data, split_dates):
    """ Slice the data into one DataFrame per (start, end) date range
    data: the input DataFrame; its 'time' column is compared against datetime.date objects
    split_dates: a list of start & end dates for splitting the data (e.g. [('2016-01-01','2016-12-31')])
    Returns a dict keyed by the start-date string; each value holds the rows whose
    'time' lies inside that range, both ends included.
    """
    def rows_in_range(first_day, last_day):
        lower = pd.to_datetime(first_day).date()
        upper = pd.to_datetime(last_day).date()
        in_range = (data['time'] >= lower) & (data['time'] <= upper)
        return data[in_range]

    return {first_day: rows_in_range(first_day, last_day)
            for first_day, last_day in split_dates}

In [None]:
def find_asset_name_map(market_train, news_train):
    """ Map news-side asset names onto the market-side name of the same asset code.

    Returns a dict {assetName used in news_train: assetName used in market_train}
    holding only the pairs whose two names differ. Market assets named 'Unknown'
    and market codes with no matching news code are left out.
    """
    # One (assetCode, assetName) pair per market asset code
    market_names = market_train[['assetCode', 'assetName']].drop_duplicates(subset=['assetCode'])
    # Distinct (assetCodes, assetName) pairs on the news side
    news_names = news_train[['assetCodes', 'assetName']].drop_duplicates(subset=['assetCodes', 'assetName'])
    # Turn the "{...}" string into a list of (still quoted) codes
    news_names['assetCodes'] = news_names['assetCodes'].str.strip('{}').str.split(',')
    # One row per individual code, indexed like the news row it came from
    per_code = news_names.apply(lambda row: pd.Series(row['assetCodes']), axis=1).stack().reset_index(level=1, drop=True)
    per_code.name = 'assetCode'
    news_names = news_names.drop('assetCodes', axis=1)
    news_by_code = news_names.join(per_code).reset_index(drop=True)
    # Strip the quotes and spaces left over from the string form of the code set
    news_by_code['assetCode'] = news_by_code['assetCode'].apply(lambda code: code.replace("'", "").replace(" ", ""))
    news_by_code = news_by_code.rename(columns={'assetName': 'assetName_news'})
    # Attach the news name(s) to every market code, then drop rows with any missing value
    merged = pd.merge(left=market_names, right=news_by_code, how='left', on=['assetCode'])
    merged = merged.dropna()
    named = merged[~(merged['assetName'] == 'Unknown')]
    # Later rows overwrite earlier ones when a news name occurs more than once
    return {news_name: market_name
            for market_name, news_name in zip(named['assetName'], named['assetName_news'])
            if market_name != news_name}

In [None]:
def normalize_news_sentiment(news_train):
    """ Normalize news sentiment by subtracting the mean sentiment of each day

    For every distinct value of news_train['time'], the mean of 'sentimentPositive',
    'sentimentNegative' and 'sentimentNeutral' over that day's rows is subtracted
    from those rows. NaN scores are skipped when averaging. The frame is modified
    in place and nothing is returned.

    The daily means come from a single groupby pass instead of re-filtering the
    whole frame several times for every date.
    """
    sentiment_cols = ['sentimentPositive', 'sentimentNegative', 'sentimentNeutral']
    daily_means = news_train.groupby('time', sort=False)[sentiment_cols].mean()
    for col in sentiment_cols:
        # Rows whose 'time' has no group (missing date) get offset 0, i.e. stay unchanged
        offsets = news_train['time'].map(daily_means[col]).fillna(0)
        news_train[col] = news_train[col] - offsets

In [None]:
def merge_by_asset_name(market_train, news_train, assetName_map):
    """ Left-join per-day, per-asset averaged news features onto the market rows.

    news_train is modified in place: its 'assetName' values are translated through
    assetName_map and its 'assetCodes' column is dropped.
    Returns the merged DataFrame (one row per market row).
    """
    # Translate news asset names to the market spelling; names without a mapping are kept
    news_train['assetName'] = news_train['assetName'].apply(lambda name: assetName_map.get(name, name))
    news_train.drop(['assetCodes'], axis=1, inplace=True)
    # Average all news rows that share the same "time" and "assetName"
    groups = news_train.groupby(['time', 'assetName'], sort=False)
    news_daily = groups.aggregate(np.mean).reset_index()
    # Centre the sentiment columns on their daily mean (done in place)
    normalize_news_sentiment(news_daily)
    # Market rows without news keep NaN in the news columns
    return pd.merge(left=market_train, right=news_daily, how='left', on=['time', 'assetName'], copy=False)

In [None]:
def merge_data(market_train, news_train, split_by_year=True):
    """ Return market rows joined with per-day, per-asset averaged news features.

    The join key is ("time", "assetName"). "time" is reduced to a calendar date and
    every news date is replaced by the first market date on or after it.

    Both inputs are modified in place:
      - market_train: 'time' is rewritten as datetime.date values
      - news_train: 'time' is rewritten as datetime.date values, 'position' and
        'coverage' columns are added and the columns in drop_list are removed

    market_train: market DataFrame
    news_train: news DataFrame
    split_by_year: if True, join one calendar year at a time and concatenate the
                   yearly results; if False, join everything in a single pass
    Raises ValueError if the yearly market and news splits differ in length.
    """
    # --- Pre-process market_train ---
    # Convert "time" to datetime format (Note: Currently, we only keep the time to date)
    market_train['time'] = pd.to_datetime(market_train['time']).apply(lambda x: x.date())

    # --- Pre-process news_train ---
    # Convert "time" to datetime format (Note: Currently, we only keep the time to date)
    news_train['time'] = pd.to_datetime(news_train['time']).apply(lambda x: x.date())

    # Feature engineering; must happen before the source columns are dropped
    news_train['position'] = news_train['firstMentionSentence'] / news_train['sentenceCount']
    news_train['coverage'] = news_train['sentimentWordCount'] / news_train['wordCount']

    # Get rid of some columns in news data (the list of dropped columns can be modified)
    drop_list = ['sourceTimestamp','firstCreated','sourceId','headline',
                 'takeSequence','provider','firstMentionSentence',
                 'sentenceCount','headlineTag','marketCommentary',
                 'subjects','audiences','wordCount','sentimentWordCount']
    news_train.drop(drop_list, axis=1, inplace=True)

    # Find assetName map from market_train to news_train
    assetName_map = find_asset_name_map(market_train, news_train)

    # Adjust 'time' for news_train.
    # Column labels are passed as a list: a set is not an ordered, list-like argument.
    time_market = pd.DataFrame(market_train['time'].unique(), columns=['time'])
    time_news = pd.DataFrame(news_train['time'].unique(), columns=['time'])
    # Keep a copy of market date before merging
    time_market['time_market'] = time_market['time']
    # One row per news date, sorted by date; news dates without a market row get
    # 'time_market' back-filled from the next later date that has one
    time_adjusted = pd.merge(left=time_market, right=time_news, how='right', on=['time'], sort=True).bfill()
    # Merge adjusted time to news data
    news_train_adjusted = pd.merge(left=news_train, right=time_adjusted, how='left', on=['time'], copy=False)
    # Use 'time_market' as the new 'time' column
    news_train_adjusted.drop(['time'], axis=1, inplace=True)
    news_train_adjusted.rename(columns={'time_market': 'time'}, inplace=True)

    if not split_by_year:
        # If we do not split data by year, we should directly merge market and news train data.
        return merge_by_asset_name(market_train, news_train_adjusted, assetName_map)

    # Split market_train and news_train by year
    market_train_years = pd.to_datetime(market_train['time']).dt.year.unique()
    news_train_years = pd.to_datetime(news_train_adjusted['time']).dt.year.unique()
    start_year = min(np.amin(market_train_years), np.amin(news_train_years))
    end_year = max(np.amax(market_train_years), np.amax(news_train_years))
    split_dates = generate_split_dates(start_year, end_year)
    market_train_split = split_data_by_date(market_train, split_dates)
    news_train_split = split_data_by_date(news_train_adjusted, split_dates)
    if len(market_train_split) != len(news_train_split):
        raise ValueError('The split train and news data must have the same length!')

    # Collect the yearly results and concatenate once, instead of re-copying the
    # accumulated frame on every iteration
    yearly_frames = []
    for start_date, end_date in split_dates:
        logging.debug('Merge data from %s to %s ...' % (start_date, end_date))
        yearly_frames.append(merge_by_asset_name(market_train_split[start_date],
                                                 news_train_split[start_date],
                                                 assetName_map))
    if not yearly_frames:
        return pd.DataFrame([])
    return pd.concat(yearly_frames, ignore_index=True)

In [None]:
# Handling missing values
def handle_missing_values(merged_data):
    """ Fill every NaN in merged_data (in place) and return the same frame.

    Market data: for each return horizon a market-return column is added, holding the
    per-'time' mean of (raw return - market-residualised return); missing residualised
    returns are then replaced by (raw return - that market return).
    News data: a 'noNews' flag (1 where 'sentimentPositive' is NaN) is added, after
    which every remaining NaN in the frame becomes -999.
    """
    # (raw return, market-residualised return, market return to create)
    column_triples = [
        ('returnsClosePrevRaw1', 'returnsClosePrevMktres1', 'returnClosePrevMkt1'),
        ('returnsOpenPrevRaw1', 'returnsOpenPrevMktres1', 'returnOpenPrevMkt1'),
        ('returnsClosePrevRaw10', 'returnsClosePrevMktres10', 'returnClosePrevMkt10'),
        ('returnsOpenPrevRaw10', 'returnsOpenPrevMktres10', 'returnOpenPrevMkt10'),
    ]
    for raw_col, mktres_col, mkt_col in column_triples:
        implied_market = merged_data[raw_col] - merged_data[mktres_col]
        # Same value for every row of a day: the mean over the rows where it is known
        merged_data[mkt_col] = implied_market.groupby(merged_data['time']).transform(lambda day: day.mean())
        merged_data[mktres_col] = merged_data[mktres_col].fillna(merged_data[raw_col] - merged_data[mkt_col])

    # News data: adding a 'nonews' feature & then, replacing NaN by a fixed constant
    merged_data['noNews'] = merged_data['sentimentPositive'].isnull().astype(int)
    merged_data.fillna(-999, inplace=True)
    remaining_nan = merged_data.isnull().sum().sum()
    assert remaining_nan == 0
    return merged_data

In [None]:
# Prepare merged data for training & prediction
def prepare_data(merged_data, goal, lag_features_window_10, lag_features_window_1=[], n_lag=2):
    """ Fill missing values and add the engineered market features.

    merged_data: merged market/news DataFrame (modified in place and returned)
    goal: 'pred' creates the lag columns as NaN placeholders (to be filled in the
          prediction part); any other value is treated as training and computes them
    lag_features_window_10: features lagged in steps of 10 rows within each assetCode
    lag_features_window_1: features lagged in steps of 1 row within each assetCode
    n_lag: number of lags; lag k is stored in '<feature>_lag_<k>'
    """
    ## Handling missing values
    merged_data = handle_missing_values(merged_data)

    ## Feature engineering
    # market features:
    merged_data['daytrend'] = merged_data['close'] / merged_data['open']

    lag_numbers = range(1, n_lag + 1)
    if goal == 'pred':
        placeholders = lag_features_window_10 + lag_features_window_1
        for lag in lag_numbers:
            for feature in placeholders:
                merged_data['%s_lag_%s' % (feature, lag)] = np.nan # will input values later in prediction part
        return merged_data

    # Training: shift each feature within its asset; rows without enough history get 0
    windows = ((lag_features_window_10, 10), (lag_features_window_1, 1))
    for lag in lag_numbers:
        for features, window in windows:
            for feature in features:
                merged_data['%s_lag_%s' % (feature, lag)] = merged_data.groupby('assetCode')['%s' % feature].transform(lambda x: x.shift(window * lag).fillna(0))
    return merged_data

In [None]:
# Removing outliers (defined as data m times std. away from its mean)
def remove_outliers(data, columns, m=3):
    """
    Keep only the rows that lie within m standard deviations of the column mean
    in every one of the given columns.

    type data: DataFrame
    type columns: list[str]
    type m: int
    rtype: DataFrame
    """
    keep = np.asarray([True] * len(data))
    for name in columns:
        values = data[name]
        distance = abs(values - np.mean(values))
        keep = keep & (distance <= m * np.std(values))
    return data[keep]

In [None]:
# Create lag features for prediction data
def _attach_lag(cdf, rolling, day, features, lag):
    """ Copy `features` as saved in `rolling` for `day` into cdf's '<feature>_lag_<lag>' columns. """
    suffix = '_rolling_%s' % lag
    history = rolling.loc[rolling['time'] == day, ['assetCode'] + features]
    # cdf already has the feature columns, so the historical copies receive the suffix
    cdf = pd.merge(left=cdf, right=history, how='left', on='assetCode', suffixes=('', suffix))
    for feature in features:
        cdf['%s_lag_%s' % (feature, lag)] = cdf[feature + suffix]
    return cdf.drop(columns=[feature + suffix for feature in features])


def rolling_lag_features(cdf, rolling, lag_features_window_10, lag_features_window_1=None, n_lag=2):
    """
    Fill the lag columns of one prediction day from the saved history.

    type cdf: DataFrame (merged data on a prediction day)
    type rolling: DataFrame (saved historical market data); assumed to hold the
                  previous 10 * n_lag days in chronological order - this is what
                  makes the positional look-ups below land on the intended days
    type lag_features_window_10: list[str] (features lagged in steps of 10 days)
    type lag_features_window_1: list[str] or None (features lagged in steps of 1 day)
    type n_lag: int
    rtype: (DataFrame, DataFrame) - cdf with '<feature>_lag_<k>' filled, and the
           history with the current day appended and the oldest day removed
    """
    lag_features_window_1 = list(lag_features_window_1 or [])
    # step 1: concatenate data from current day to the DataFrame 'rolling'
    tracked = ['time', 'assetCode'] + lag_features_window_10 + lag_features_window_1
    rolling = pd.concat([rolling, cdf[tracked]]).reset_index(drop=True)
    # step 2: fill in the lag features in cdf by merging from 'rolling'
    rollingtime = rolling['time'].unique()

    for i in range(n_lag):
        lag = i + 1
        # Counted from the oldest saved day: the deepest lag reads rollingtime[0]
        timelag10 = rollingtime[10 * (n_lag - lag)]
        cdf = _attach_lag(cdf, rolling, timelag10, lag_features_window_10, lag)
        if lag_features_window_1:
            # rollingtime[-1] is the prediction day itself (appended in step 1), so
            # lag k is the k-th day before it; this matches shift(k) used in training
            timelag1 = rollingtime[-(lag + 1)]
            cdf = _attach_lag(cdf, rolling, timelag1, lag_features_window_1, lag)

    # step 3: drop rows corresponding to the oldest day from 'rolling'
    rolling = rolling.drop(rolling[rolling['time'] == rollingtime[0]].index)
    return cdf, rolling

In [None]:
# You can only call make_env() once, so don't lose it!
# `env` is used below for the training data, the prediction days and the submission file.
env = twosigmanews.make_env()

In [None]:
# Training data comes as a pair: the market frame and the news frame
(market_train_df, news_train_df) = env.get_training_data()

In [None]:
# for debugging purpose
#rownum = 1500000
#market_train_df, news_train_df = market_train_df[0:rownum], news_train_df[0:rownum]

In [None]:
# Merging data
# merge_data runs with its default split_by_year=True and rewrites the 'time'
# column of both input frames in place.
#merge_start = time.time()
merged_data = merge_data(market_train_df, news_train_df)
#print("Merging data took %.0f minutes." % math.ceil((time.time() - merge_start)/60))
# train post-crisis samples: keep only rows dated 2009-07-01 or later
merged_data = merged_data[merged_data['time'] >= datetime(2009, 7, 1).date()]
# Unbind the raw frames; only merged_data is used from here on
del market_train_df, news_train_df

In [None]:
# Handling outliers
# Rows more than 3 standard deviations from the mean in any of these columns are dropped.
# Raw_cols / Mktres_cols repeat the lists used inside handle_missing_values.
Raw_cols = ['returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 'returnsClosePrevRaw10', 'returnsOpenPrevRaw10']
Mktres_cols = ['returnsClosePrevMktres1', 'returnsOpenPrevMktres1', 'returnsClosePrevMktres10', 'returnsOpenPrevMktres10']
outliercols = ['volume', 'close', 'open'] + Raw_cols + Mktres_cols
merged_data = remove_outliers(merged_data, outliercols, m=3)

In [None]:
# Preparing data for training
# The three settings below are module-level on purpose: the prediction cell reuses them
# so that prediction-time lag features are built with the same configuration.
#prepare_start = time.time()
lag_features_window_10 = ['returnsOpenPrevMktres10', 'returnsOpenPrevRaw10', 'returnClosePrevMkt10', 'returnOpenPrevMkt10']
lag_features_window_1 = ['returnClosePrevMkt1', 'returnOpenPrevMkt1'] # creating lag features for the top six important features
n_lag = 2
cdf = prepare_data(merged_data, 'train', lag_features_window_10, lag_features_window_1, n_lag)
#print("Preparing data for training took %.0f minutes." % math.ceil((time.time() - prepare_start)/60))
del merged_data

In [None]:
# Building training set
targetcol = 'returnsOpenNextMktres10'
# Binary target: 1 when the target return is positive, else 0
cdf[targetcol] = (cdf[targetcol] > 0).astype(int) # we be classifying
dropfeatures = ['noNews', 'noveltyCount5D', 'urgency', 'noveltyCount24H', 'noveltyCount12H'] # drop last five features
# Every remaining column except identifiers, the target and the dropped features is a model input
traincols = [col for col in cdf.columns if col not in ['time', 'assetCode', 'assetName', 'universe'] + [targetcol] + dropfeatures]

## Train 1: with valid set
# Split by date, not by row: the first 95% of the distinct dates train, the rest validate.
# NOTE(review): this assumes cdf['time'].unique() is in chronological order - confirm.
dates = cdf['time'].unique()
train = range(len(dates))[:int(0.95*len(dates))]
val = range(len(dates))[int(0.95*len(dates)):]

# train data
Xt = cdf[traincols].loc[cdf['time'].isin(dates[train])].values
Yt = cdf[targetcol].loc[cdf['time'].isin(dates[train])].values

# validation data ('time' is kept alongside the features because score() groups by it)
Xv_with_time = cdf[traincols + ['time']].fillna(0).loc[cdf['time'].isin(dates[val])]
Yv = cdf[targetcol].fillna(0).loc[cdf['time'].isin(dates[val])].values

#print(Xt.shape, Xv.shape)
##---

## Train 2: no valid set
#allTrainData = cdf[traincols].fillna(0).values
#allTrainLabels = cdf[targetcol].fillna(0).values
##---

# History of the last 10 * n_lag distinct dates, kept for rolling_lag_features
rolling = cdf.loc[cdf['time'] >= cdf.time.unique()[-10 * n_lag], ['time', 'assetCode'] + lag_features_window_10 + lag_features_window_1]  # to be used in prediction part
del cdf

In [None]:
# Feature scaling
# The scaler is fitted on the training rows only; validation and prediction data
# are later passed through sc.transform with these fitted statistics.
sc = StandardScaler()

## Train 1: with valid set
Xt = sc.fit_transform(Xt)
#Xv = sc.transform(Xv)

## Train 2: no valid set
#allTrainData = sc.fit_transform(allTrainData)

In [None]:
def find_combinations(grid_params):
    """ Return every combination of the candidate values in grid_params.

    grid_params: dict mapping a parameter name to a list of candidate values
    Returns a list of tuples; each tuple is ordered like grid_params' keys.
    """
    return list(product(*grid_params.values()))

In [None]:
def post_scaling(pred_Y):
    """
    Standardise the predictions (subtract their mean, divide by their standard
    deviation), shrink the result by a factor of 8 and clip it to [-1, 1].

    type pred_Y: array
    rtype: array
    """
    centre = np.mean(pred_Y)
    spread = np.std(pred_Y)
    scaled = (pred_Y - centre) / spread / 8
    return np.clip(scaled, -1, 1)

In [None]:
def score(x, y, pred_y):
    """ Mean of the per-day sums of y * pred_y, divided by their standard deviation.

    x: DataFrame with a 'time' column; rows sharing a 'time' value form one day
    y: array-like aligned row by row with x
    pred_y: array-like aligned row by row with x
    Returns a float (nan or inf when the daily sums have zero spread).

    The daily sums are accumulated in one vectorised pass instead of scanning
    every row once per distinct day.
    """
    # codes[j] is the position of row j's day among the distinct days; -1 marks a missing 'time'
    codes, days = pd.factorize(x['time'].values)
    contributions = np.asarray(y, dtype=float) * np.asarray(pred_y, dtype=float)
    dated = codes >= 0
    daily_score = np.bincount(codes[dated], weights=contributions[dated], minlength=len(days))
    return np.mean(daily_score) / np.std(daily_score)

In [None]:
def grid_search(X_train, Y_train, X_val, Y_val, grid_params):
    """ Train one LightGBM model per hyper-parameter combination and score it on the validation set.

    X_train, Y_train: scaled training features and labels
    X_val: validation DataFrame holding the traincols columns plus 'time'
    Y_val: validation labels
    grid_params: dict mapping a parameter name to a list of candidate values
    Returns {'hyper_params': [...], 'score': [...]} with one entry per candidate, in
    the order tried; 'score' is the score() of the isotonic-calibrated predictions.

    Relies on the module-level `sc` (fitted scaler) and `traincols` (feature names).
    """
    combinations = find_combinations(grid_params)
    print("Total number of candidates = %d" % len(combinations))
    # Default parameters
    hyper_params = {"objective" : "binary",
          "metric" : "binary_logloss",
          "num_leaves" : 32,
          "max_depth": -1,
          "max_bin": 512,
          'subsample_for_bin': 200,
          'subsample': 0.7,
          'subsample_freq': 1,
          'colsample_bytree': 0.65,
          'reg_alpha': 1.2,
          'reg_lambda': 1,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          "learning_rate" : 0.01}
    cv_results = {'hyper_params':[], 'score':[]}
    params_keys = list(grid_params.keys())
    # Scaled validation features without the 'time' column; the same for every candidate
    X_val_numbers = sc.transform(X_val[traincols].values)
    # Iterate over all candidates
    for i, combination in enumerate(combinations):
        candidate = dict(zip(params_keys, combination))
        hyper_params.update(candidate)
        out_info = "".join(key + ": " + str(value) + " " for key, value in candidate.items())
        # Train and Calibrate
        print("Candidate %d / %d ... %s" % (i+1, len(combinations), out_info))
        lgbmodel = lgb.train(hyper_params, train_set=lgb.Dataset(X_train, Y_train), valid_sets=lgb.Dataset(X_val_numbers, Y_val), num_boost_round=1000, early_stopping_rounds=50, verbose_eval=500)
        pred = post_scaling(lgbmodel.predict(X_val_numbers))
        ir = IsotonicRegression(out_of_bounds = 'clip')
        ir.fit(pred, Y_val)
        pred_calibrated = post_scaling(ir.transform(pred))
        # Score before calibration on the validation data
        before_calibration_score = score(X_val, Y_val, pred)
        print(before_calibration_score)
        after_calibration_score = score(X_val, Y_val, pred_calibrated)
        print(after_calibration_score)
        # Store a snapshot: hyper_params itself is mutated again by the next candidate,
        # so appending the dict object would make every entry show the last combination
        cv_results['hyper_params'].append(dict(hyper_params))
        cv_results['score'].append(after_calibration_score)
    return cv_results

In [None]:
print('Grid Search with Cross-Validation')
# Candidate values per hyper-parameter; every combination is tried by grid_search
gridParams = {
    'learning_rate': [0.005],
    'num_leaves': [32,64,128],
    'colsample_bytree' : [0.6,0.65,0.7],
    'subsample' : [0.7,0.75,0.8],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2]}
cv_results = grid_search(Xt,Yt,Xv_with_time,Yv,gridParams)
# Pick the candidate with the highest calibrated validation score (first one on ties)
best_score = max(cv_results['score'])
best_index = cv_results['score'].index(best_score)
best_params = cv_results['hyper_params'][best_index]
# The value selected on is the validation score, so that is what gets reported
print("Best score: %f" % best_score)
print("best learning_rate = %f" % best_params['learning_rate'])
print("best num_leaves = %f" % best_params['num_leaves'])
print("best colsample_bytree = %f" % best_params['colsample_bytree'])
print("best subsample = %f" % best_params['subsample'])
print("best reg_alpha = %f" % best_params['reg_alpha'])
print("best reg_lambda = %f" % best_params['reg_lambda'])

In [None]:
# Retrain model using the best_params
# More boosting rounds and a longer early-stopping patience than during the grid search
Xv = sc.transform(Xv_with_time[traincols].values)
lgbmodel = lgb.train(best_params, train_set=lgb.Dataset(Xt, Yt), valid_sets=lgb.Dataset(Xv, Yv), num_boost_round=2000, early_stopping_rounds=200, verbose_eval=200)

In [None]:
# Calibrate prediction
# Isotonic regression is fitted on the validation predictions against the validation labels;
# the fitted `ir` is applied again to the model output in the prediction loop.
pred = lgbmodel.predict(Xv)
ir = IsotonicRegression(out_of_bounds = 'clip')
ir.fit(pred,Yv)
pred_calibrated = ir.transform(pred)

In [None]:
def cdf(data, calibrated_data):
    """ Plot the empirical CDFs of the raw values (blue) and the calibrated values (red).

    Each curve shows the sorted values against evenly spaced proportions from 0 to 1.
    """
    curves = []
    for values, colour in ((data, 'b'), (calibrated_data, 'r')):
        ordered = np.sort(values)
        # proportion of samples at or below each sorted value
        proportions = 1. * np.arange(len(values)) / (len(values) - 1)
        curves.append((ordered, proportions, colour))
    for ordered, proportions, colour in curves:
        plt.plot(ordered, proportions, colour)
    plt.xlabel('$x$')
    plt.ylabel('$p$')
    plt.show()

In [None]:
# Compare the distribution of the raw and the calibrated validation predictions
cdf(pred, pred_calibrated)

In [None]:
# Disabled experiment: everything in this cell is one bare triple-quoted string, so none of it runs.
"""
#######################################################
##
## LightGBM
##
#######################################################
#import lightgbm as lgb

# sklearn tools for model training and assesment
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import (roc_curve, auc, accuracy_score)
#from sklearn.model_selection import GridSearchCV

print ('Training lightgbm')
# Create parameters to search
gridParams = {
    'learning_rate': [0.005,0.01,0.05],
    'num_leaves': [16,32,64,128],
    'colsample_bytree' : list(np.linspace(0.6, 0.7, 3)),
    'subsample' : list(np.linspace(0.7, 0.9, 5)),
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4]}

# Grid Search

# GridSearch and Calibration of classifier
params = {"objective" : "binary",
          "metric" : "binary_logloss",
          "num_leaves" : 32,
          "max_depth": -1,
          "max_bin": 512,
          'subsample_for_bin': 200,
          'subsample': 0.7,
          'subsample_freq': 1,
          'colsample_bytree': 0.65,
          'reg_alpha': 1.2,
          'reg_lambda': 1,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          "learning_rate" : 0.01}


model = lgb.LGBMClassifier(boosting_type= 'gbdt',
                           objective = 'binary',
                           metric = 'binary_logloss',
                           silent = True,
                           max_depth = params['max_depth'],
                           max_bin = params['max_bin'],
                           subsample_for_bin = params['subsample_for_bin'],
                           subsample = params['subsample'],
                           subsample_freq = params['subsample_freq'],
                           min_split_gain = params['min_split_gain'],
                           min_child_weight = params['min_child_weight'],
                           min_child_samples = params['min_child_samples'],
                           learning_rate=params['learning_rate'])

# Create the grid
#grid = GridSearchCV(model, gridParams,
#                    verbose=10,
#                    cv=5,
#                    n_jobs=1)
# Print the best parameters found
#print(grid.best_params_)
#print(grid.best_score_)

lgtrain = lgb.Dataset(allTrainData, allTrainLabels)
#lgbmodel = None
lgbmodel = lgb.train(params, train_set=lgtrain, num_boost_round=10000, verbose_eval=False)
pred = lgbmodel.predict(allTrainData)
ir = IsotonicRegression(out_of_bounds = 'clip')
ir.fit(pred,allTrainLabels)
pred_calibrated = ir.transform(pred)
"""

In [None]:
##### scaling
# post_scaling() from the earlier cell is reused here; defining it a second time
# would silently shadow that definition.
############################################################
print("generating predictions...")
preddays = env.get_prediction_days()
# Silence the per-merge debug logging while iterating over the prediction days
logging.getLogger().disabled = True
for marketdf, newsdf, predtemplatedf in preddays:
    merged_data = merge_data(marketdf, newsdf, split_by_year=False) # merge data
    cdf = prepare_data(merged_data, 'pred', lag_features_window_10, lag_features_window_1, n_lag) # prepare merged data for prediction
    # Fill the lag placeholders from the saved history and roll the history forward by one day
    cdf, rolling = rolling_lag_features(cdf, rolling, lag_features_window_10, lag_features_window_1, n_lag)
    Xp = sc.transform(cdf[traincols].fillna(0).values) # extract features columns and scale
    preds = ir.transform(lgbmodel.predict(Xp, num_iteration=lgbmodel.best_iteration)) # calibrate
    # Re-centre around zero (presumably from a [0, 1] calibrated value to [-1, 1])
    preds = preds*2 - 1
    # Write the confidences by asset code rather than by row position, so the result
    # does not depend on cdf and the template having the same row order and count
    conf_by_asset = pd.Series(post_scaling(preds), index=cdf['assetCode'].astype(str).values)
    conf_by_asset = conf_by_asset[~conf_by_asset.index.duplicated(keep='first')]
    mapped = predtemplatedf['assetCode'].astype(str).map(conf_by_asset)
    # Assets without a usable prediction keep the template's existing confidenceValue
    predtemplatedf['confidenceValue'] = mapped.fillna(predtemplatedf['confidenceValue'])
    env.predict(predtemplatedf)
#print("Prediction took %.0f minutes." % math.ceil((time.time() - pred_start)/60))
env.write_submission_file()

In [None]:
# Feature importance
# Gain-based importance of the final lgbmodel, paired with the feature names in traincols
# and drawn as a horizontal bar chart, largest first.
# NOTE(review): the title mentions "avg over folds", but the values come from the single
# lgbmodel in scope - confirm whether the title should change.
feature_imp = pd.DataFrame(sorted(zip(lgbmodel.feature_importance(importance_type='gain'), traincols)), columns=['Value','Feature'])
plt.figure(figsize=(20, 15))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()

In [None]:
# records
# based on V10: turn off time tracker, replace Fang's training method and parameters with mine, same features: 0.65015
# found a bug, drop the lag features: 0.67390
# use Fang's training methods (the only difference from V10 is dropping lag features; the diff from V7 is adding five news feature 
# ... adding daytrend & PrevMkt, and removing outliers, handling missing values, training post-crisis data, pre-scaling): 0.70359
# fix the bug (based on one-lag): 0.68992
# add two lags, fillna with 0 in creating lags, fix the reset_index() bug in last version: 
# based on 0.75235 version, drop post-scaling: 