In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import chain

%matplotlib inline

In [None]:
# Development switch: when True, the training frames are truncated to their last rows further down.
REDUCED = True # Reduce the data size for development and testing; set to False to use the full data

In [None]:
# Competition module from the kaggle package; make_env() creates the environment
# object that is used below to fetch the training data.
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()

In [None]:
# Fetch the training data from the environment as a (market, news) pair.
(market_train_df, news_train_df) = env.get_training_data()

In [None]:
# Row caps applied when REDUCED is enabled; named so they can be tuned in one place.
REDUCED_MARKET_ROWS = 10000
REDUCED_NEWS_ROWS = 50000

if REDUCED:
    # Keep only the last rows of each frame to speed up development and testing.
    market_train_df = market_train_df.tail(REDUCED_MARKET_ROWS)
    news_train_df = news_train_df.tail(REDUCED_NEWS_ROWS)

In [None]:
def clean_market_data(market_df, train=True):
    '''Clean and preprocess the market data for training or testing.
    
    Selects the feature columns, drops every row that has a missing value in
    any selected column, and normalizes 'time' to midnight. The input
    dataframe is not modified; a new dataframe is returned.
    
    Parameters
    ----------
    market_df : dataframe
        See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.
    train : bool
        When true, the target column ('returnsOpenNextMktres10') is kept in the
        result (and is also subject to the NA filter).
    
    Returns
    -------
    dataframe 
        Cleaned market data.
    
    '''
    # Select columns; the target column is only expected in training data
    cols = ['assetCode', 'time', 'volume', 'open', 'returnsOpenPrevMktres1',
            'returnsOpenPrevMktres10']
    if train:
        cols.append('returnsOpenNextMktres10')
    
    # Non-mutating calls: avoids in-place edits on a frame derived from the caller's data
    cleaned = market_df.loc[:, cols].dropna()
    
    # Normalize time so it can be used as a join key together with assetCode
    return cleaned.assign(time=cleaned['time'].dt.normalize())

In [None]:
def clean_news_data(news_df, extra_features=False):
    '''Clean and preprocess the news data for training or testing.
    
    Selects the relevant columns, drops rows with missing values, normalizes
    'time' to midnight, expands every news row into one row per asset code
    found in 'assetCodes', and finally averages the numeric columns per
    ('time', 'assetCode') pair. The input dataframe is not modified.
    
    Parameters
    ----------
    news_df : dataframe
        See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.
    extra_features : bool
        When true, adds extra columns that SE added ('urgency', 'provider', 'bodySize', 'relevance').
        'provider' is one-hot encoded, so the resulting columns depend on the
        provider values present in the data.
    
    Returns
    -------
    dataframe 
        Cleaned news data, one row per ('time', 'assetCode').
    
    Raises
    ------
    pandas.errors.MergeError
        If the index of the selected news rows is not unique (checked by the
        '1:m' merge validation).
    
    '''
    sentiment_cols = ['sentimentNegative', 'sentimentNeutral', 'sentimentPositive']
    extra_mean_cols = ['urgency', 'bodySize', 'relevance']
    
    # Select columns and drop NA
    cols = ['time', 'assetCodes'] + sentiment_cols
    if extra_features:
        cols += ['urgency', 'provider', 'bodySize', 'relevance']
    news_df = news_df.loc[:, cols].dropna()
    
    # Normalize time and turn assetCodes from String to List.
    # Raw string: the pattern contains regex escapes (\w, \.) that are not valid Python escapes.
    news_df = news_df.assign(
        time=news_df['time'].dt.normalize(),
        assetCodes=news_df['assetCodes'].str.findall(r"'([\w\./]+)'"),
    )
    
    # Explode news on assetCodes: repeat each row's index label once per code it mentions
    assetCodes_expanded = list(chain.from_iterable(news_df['assetCodes']))
    assetCodes_index = news_df.index.repeat(news_df['assetCodes'].apply(len))

    assert len(assetCodes_expanded) == len(assetCodes_index)
    
    assetCodes_df = pd.DataFrame({'index': assetCodes_index, 'assetCode': assetCodes_expanded})
    news_df_exploded = news_df.merge(assetCodes_df, how='right', right_on='index',
                                     left_index=True, validate='1:m')
    # Keyword form: the positional axis argument is not accepted by pandas >= 2.0
    news_df_exploded = news_df_exploded.drop(columns=['assetCodes', 'index'])

    # Compute means for same date and assetCode
    mean_cols = sentiment_cols + (extra_mean_cols if extra_features else [])
    news_agg_dict = {col: 'mean' for col in mean_cols}
    group_keys = ['time', 'assetCode']
    news_df_agg = news_df_exploded.groupby(group_keys, as_index=False).agg(news_agg_dict)

    if extra_features:
        # Add provider information: for each group keep the provider of the first
        # row whose urgency equals the group's maximum urgency value
        group_max_urgency = news_df_exploded.groupby(group_keys)['urgency'].transform('max')
        is_max_urgency = group_max_urgency == news_df_exploded['urgency']
        providers = (news_df_exploded.loc[is_max_urgency, group_keys + ['provider']]
                     .drop_duplicates(group_keys))
        news_df_agg = news_df_agg.merge(providers, how='left', on=group_keys)
        
        # One-hot encoding provider
        # NOTE(review): columns follow the providers present in this call's data, so
        # train and test frames may end up with different columns - confirm downstream.
        ohe_provider = pd.get_dummies(news_df_agg['provider'])
        news_df_agg = pd.concat([news_df_agg, ohe_provider], axis=1).drop(['provider'], axis=1)
    
    return news_df_agg

In [None]:
def clean_data(market_df, news_df, train=True, extra_features=False):
    '''Build a model-ready dataset by cleaning the market and news data and joining them.
    
    Parameters
    ----------
    market_df : dataframe
        See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.
    news_df : dataframe
        See https://www.kaggle.com/c/two-sigma-financial-news/data for full description of the dataframe.
    train : bool
        When true, the target column is split off and returned separately from the input features.
    extra_features : bool
        When true, adds extra columns that SE added ('urgency', 'provider', 'bodySize', 'relevance').
        
    Returns
    -------
    (dataframe, series) or dataframe
        With train=True, a pair (X, y) where y is the 'returnsOpenNextMktres10'
        column and X is every other column of the merged data.
        With train=False, the merged dataframe itself.
    
    '''
    target_col = 'returnsOpenNextMktres10'
    
    market_part = clean_market_data(market_df, train)
    news_part = clean_news_data(news_df, extra_features)
    
    # Inner join: only (time, assetCode) pairs present in both sources are kept
    merged = market_part.merge(news_part, how='inner', on=['time', 'assetCode'])
    
    if not train:
        return merged
    
    target = merged[target_col]
    features = merged.drop([target_col], axis=1)
    return features, target

In [None]:
# Final dataframes for training: input features (X_train) and target (y_train),
# built with the extra news features enabled.
X_train, y_train = clean_data(market_train_df, news_train_df, extra_features=True)