Building of model based on this paper:

https://www.sciencedirect.com/science/article/pii/S1303070121000329

In [3]:
import hexuity as hx
import pandas as pd
import numpy as np

# Tokenisation

In [203]:
# quick clean

speeches, _ = hx.data_expander(price_data=False)

import string
def strip(dirty):
    try:
        return ''.join([x for x in dirty if x in string.ascii_letters + '\'- '])
    except:
        return ''

speeches = speeches.applymap(strip)

# put speeches in long column for training

all_speeches = pd.Series()
for col in speeches.columns:
    all_speeches = all_speeches.append(speeches.loc[:,col])

all_speeches = all_speeches.drop_duplicates()

  all_speeches = pd.Series()


In [230]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

vectorizer1 = TfidfVectorizer(ngram_range=(1,1), max_features=100, stop_words='english')
vectorizer2 = TfidfVectorizer(ngram_range=(2,2), max_features=100, stop_words='english')
vectorizer3 = TfidfVectorizer(ngram_range=(3,3), max_features=250, stop_words='english')

In [231]:
#vectorizer1.fit(all_speeches)
#vectorizer2.fit(all_speeches)
vectorizer3.fit(all_speeches)

vectorizers = [vectorizer3] #[vectorizer1, vectorizer2, vectorizer3]

In [232]:
tokenised_speeches = pd.DataFrame(index=np.arange(len(speeches)))
i = 0
limit = 120
for vectorizer in vectorizers:
    for col in speeches.columns:
        temp_tokens = pd.DataFrame(vectorizer.transform(speeches.loc[:,col]).toarray())
        tokenised_speeches = pd.concat([tokenised_speeches,temp_tokens], axis=1, ignore_index=True)
        i+=1
        print(i/limit)

0.008333333333333333
0.016666666666666666
0.025
0.03333333333333333
0.041666666666666664
0.05
0.058333333333333334
0.06666666666666667
0.075
0.08333333333333333
0.09166666666666666
0.1
0.10833333333333334
0.11666666666666667
0.125
0.13333333333333333
0.14166666666666666
0.15
0.15833333333333333
0.16666666666666666
0.175
0.18333333333333332
0.19166666666666668
0.2
0.20833333333333334
0.21666666666666667
0.225
0.23333333333333334
0.24166666666666667
0.25
0.25833333333333336
0.26666666666666666
0.275
0.2833333333333333
0.2916666666666667
0.3
0.30833333333333335
0.31666666666666665
0.325
0.3333333333333333


# Quick model test 

In [77]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

In [237]:
X = tokenised_speeches

X_prices, y = hx.data_expander(price_data=True, speech_data=False, task='regression')
X_prices.index=np.arange(len(X_prices))

X = pd.concat([X, X_prices], axis=1, ignore_index=True)

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [224]:
X, y = hx.data_expander(speech_data=False, task='regression')
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [238]:
classifer = XGBRegressor()
classifer.fit(X_train, y_train)
preds = classifer.predict(X_test)
mean_squared_error(preds, y_test, squared=False)

0.3606053997459688

In [222]:
classifer = XGBRegressor()
classifer.fit(X_train, y_train)
preds = classifer.predict(X_test)
mean_squared_error(preds, y_test, squared=False)

0.4166716391893491

In [225]:
classifer = XGBRegressor()
classifer.fit(X_train, y_train)
preds = classifer.predict(X_test)
mean_squared_error(preds, y_test, squared=False)

0.446849024551102

In [236]:
classifer = XGBClassifier()
classifer.fit(X_train, y_train)
preds = classifer.predict(X_test)
mean_squared_error(preds, y_test, squared=False)





0.605420403143

# Verification / Pipeline

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()


def df_tokenizer(DF):


    orig_index = DF.index

    # put speeches in long column for training

    all_speeches = pd.Series()
    for col in DF.columns:
        all_speeches = all_speeches.append(DF.loc[:,col])

    all_speeches = all_speeches.drop_duplicates()


    #vectorizer1 = TfidfVectorizer(ngram_range=(1,1), max_features=100, stop_words='english')
    #vectorizer2 = TfidfVectorizer(ngram_range=(2,2), max_features=100, stop_words='english')
    vectorizer3 = TfidfVectorizer(ngram_range=(3,3), max_features=250, stop_words='english')

    #vectorizer1.fit(all_speeches)
    #vectorizer2.fit(all_speeches)
    vectorizer3.fit(all_speeches)

    vectorizers = [vectorizer3]

    tokenised_speeches = pd.DataFrame(index=np.arange(len(DF)))

    for vectorizer in vectorizers:
        for col in DF.columns:
            temp_tokens = pd.DataFrame(vectorizer.transform(DF.loc[:,col]).toarray())
            tokenised_speeches = pd.concat([tokenised_speeches,temp_tokens], axis=1, ignore_index=True)

    tokenised_speeches.index = orig_index

    return tokenised_speeches

In [85]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

tokenized = FunctionTransformer(df_tokenizer)

In [94]:
def pre(DF):
    price_columns = DF.select_dtypes(include=np.number).columns.tolist()
    speech_columns = DF.select_dtypes(include=object).columns.tolist()

    X_price = DF[price_columns]
    X_speech = DF[speech_columns]

    import string
    
    def strip(dirty):
        try:
            return ''.join([x for x in dirty if x in string.ascii_letters + '\'- '])
        except:
            return 'the'

    X_speech = X_speech.applymap(strip)

    X_speech = df_tokenizer(X_speech)

    out_df = pd.concat([X_price, X_speech], axis=1)
    return out_df

In [95]:
sk_pre = FunctionTransformer(pre)

In [96]:
test_pipe = Pipeline([
    ('pre', sk_pre),
    ('classifier', XGBRegressor())
    ])

In [97]:
X_train, X_test, y_train, y_test = hx.data_expander(task='regression', test_size=0.2)

In [98]:
y_train

534    -0.812528
269     0.015305
1160    0.383347
197     1.368310
97     -0.745036
          ...   
417    -0.671216
1154   -0.673325
865    -0.822019
455    -1.067732
1042    0.778809
Name: target_reg, Length: 802, dtype: float64

In [99]:
test_pipe.fit(X_train, y_train)

  all_speeches = pd.Series()


Pipeline(steps=[('pre',
                 FunctionTransformer(func=<function pre at 0x2e42fce50>)),
                ('classifier',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, enable_categorical=False,
                              gamma=0, gpu_id=-1, importance_type=None,
                              interaction_constraints='',
                              learning_rate=0.300000012, max_delta_step=0,
                              max_depth=6, min_child_weight=1, missing=nan,
                              monotone_constraints='()', n_estimators=100,
                              n_jobs=8, num_parallel_tree=1, predictor='auto',
                              random_state=0, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=1, subsample=1,
                              tree_method='exact', validate_parameters=1,
              

In [101]:
preds = test_pipe.predict(X_test)
mean_squared_error(preds, y_test, squared=False)

  all_speeches = pd.Series()


0.4250874931485275