# Regression Model for price movement

In [10]:
import pandas as pd
import numpy as np
from datetime import datetime
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import keras.utils
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import Adam

from tensorflow.keras.optimizers import RMSprop
from sklearn.metrics import classification_report

In [11]:
def rename_datetime(data):
    data.rename(columns={'Unnamed: 0': 'datetime'},inplace=True)
    data['datetime']=pd.to_datetime(data['datetime'])
    data.set_index('datetime', inplace=True)
    return data
# feature selection
# create features
def create_features(data, g_lag, tv_lag, tw_lag):
    data['Change'] =data['Close'].diff().dropna()
    data['Label'] = np.where(data['Change']>0, 1 ,0)
    data['google_trends_lag']=data['google_trends'].shift(g_lag)
    data['tweet_volume_lag']=data['tweet_volume'].shift(tv_lag)
    data['tw_polarity_lag'] = data['tw_polarity'].shift(tw_lag)

    data.drop(columns=['Open','High','Low','Change'],inplace=True)
    # Add features like RSI? Moving average?

    data.dropna(inplace=True)
    return data
# keep the wanted features
def keep_features(feature_conditions):
    features=['Label','Close']
    for feature, condition in feature_conditions.items():
        if condition:
            features.append(feature)
    return features

# create the lagged features based on the timesteps
def reshape_features(data, to_keep=1, to_remove=1):
    variables = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    columns, names = list(), list()

    for i in range(to_keep, 0, -1):
        columns.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(variables)]

    for i in range(0, to_remove):
        columns.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(variables)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(variables)]

    #put it all together
    final = pd.concat(columns, axis=1)
    final.columns = names

    #drop rows with NaN values
    final.dropna(inplace=True)

    new_data = final.reset_index()

    new_data = new_data.drop(columns=['datetime'])

    return new_data

# shuffle the data
def shuffle_data(times, data):
    np.random.seed(1)
    for i in range(times+1):
        data=shuffle(data)
    return data

# split labels from data
def split_label(train, test):
    train_y = train['var1(t)'].values
    test_y = test['var1(t)'].values
    train_y = train_y.reshape(len(train_y), 1)
    test_y = test_y.reshape(len(test_y), 1)
    return train_y, test_y

# normalize data using Minmaxscaler
def normalize_reshape_data(train, test, train_y, test_y, all_features, n_features, timestep):
    feature_scaler=MinMaxScaler()
    scale_train_data=feature_scaler.fit_transform(train)
    scale_test_data= feature_scaler.transform(test)
    train = scale_train_data[:, :all_features]
    test = scale_test_data[:, :all_features]
    #keep only prices array
    train_X, train_y = train[:, :all_features], train_y[:, -1]
    test_X, test_y = test[:, :all_features], test_y[:, -1]

    return train_X, test_X, train_y, test_y

In [12]:
filepath='./../data/processed_data.csv'

In [13]:
def get_data(filepath, g_lag, tv_lag, tw_lag, timestep, shuffle_times, split_ratio, feature_conditions):
    # import data
    # import the original data. processed_data
    # processed_data: weighted reddit score+ fill the nan data
    data=pd.read_csv(filepath)
    # show the data
    data=rename_datetime(data)

    # create features
    data_created = create_features(data,g_lag,tv_lag,tw_lag)

    # keep the wanted features
    features = keep_features(feature_conditions)
    data=data_created[features]

    # reshape the data
    # create the lagged features based on the timesteps
    df_copy = data.copy()
    new_data=reshape_features(df_copy, timestep, 1)

    # shuffle the data
    shuffled_data=shuffle_data(shuffle_times, new_data)

    # split the data
    train, test= train_test_split(shuffled_data, test_size=split_ratio)
    train_y, test_y=split_label(train, test)

    # normalized the data using MinMaxscaler
    n_features=len(features)
    all_features = timestep * n_features
    if (all_features==0):
        all_features=n_features
    train_X, test_X, train_y, test_y =normalize_reshape_data(train, test, train_y, test_y, all_features, n_features,timestep)

    print("train_X Shape:", train_X.shape)
    print("train_y Shape:", train_y.shape)
    print("test_X Shape:", test_X.shape)
    print("test_y Shape:", test_y.shape)

    return data, train_X, test_X, train_y, test_y, n_features

In [14]:
# # get correlation matrix
# sns.heatmap(data.corr(), annot=True)
# plt.show()

## Model Building

In [15]:
def create_model(train_X, train_y, test_X, test_y, n_features, timestep ):
    #set seed to reproduce results
    np.random.seed(1)
    tf.random.set_seed(1)

    # set the range of the hyperparameters
    param_grid = {
        'n_estimators': np.arange(100,1000,100),
         'max_depth': [10,20,30],
    }
    # Create a Random Forest Classifier
    forest = RandomForestClassifier(oob_score = True, criterion = "gini", random_state = 18)

    # hyperparameter selection with GridSearchCV
    grid_search = GridSearchCV(forest, param_grid=param_grid, cv=5, scoring='accuracy', refit=True,n_jobs=-1, return_train_score=True)
    grid_search.fit(train_X, train_y)

    forest_best = grid_search.best_estimator_
    print(forest_best)
    # Make predictions
    y_pred = forest_best.predict(test_X)
    results=grid_search.cv_results_
    # calculate the metrics
    report=classification_report(
          test_y,
          y_pred,target_names = ["Down", "Up"],
          digits = 5, output_dict=True)

    # precision = report['Down']['precision']
    down_f1_score = report['Down']['f1-score']
    up_f1_score = report['Up']['f1-score']
    accuracy=report['accuracy']

    return down_f1_score, up_f1_score, accuracy, grid_search.best_params_

In [16]:
# model parameters

feature_conditions = {
        'google_trends': 0, 'google_trends_lag': 0,
        'tweet_volume_lag': 0, 'tw_polarity_lag': 0, 'tw_compound': 0,
        'tw_polarity': 0, 'tweet_volume': 0,'re_compound': 0,'re_polarity': 0,
        're_subjectivity': 0
    }
def test_model(filepath_out, feature_conditions):
    columns = ["timestep","features","google_trends_lag","tweet_volume_lag","tweet_polarity_score_lag","best_estimators", "down_f1_score","up_f1_score","acc"]

    try:
        results = pd.read_csv(filepath_out)
    except:
        results = pd.DataFrame(columns=columns)

    #lagged_features
    timestep = [10, 15]
    #train_ratio
    split_ratio =0.2
    shuffle_times = 3

     #for each lag feature
    for step in timestep:
        # Lags
        g_lag = [10]
        tv_lag = [28] # tweets volume
        tw_lag = 15 # tweets score



        for g in g_lag:
            for tv in tv_lag:

                data, train_X, test_X, train_y, test_y, n_features = get_data(filepath, g, tv, tw_lag, step, shuffle_times, split_ratio, feature_conditions)
                down_score, up_score, accuracy,best_estimators  = create_model(train_X, train_y, test_X, test_y, n_features, step)



                results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv,"tweet_polarity_score_lag": tw_lag, "best_estimators":best_estimators,"down_f1_score":down_score,"up_f1_score":up_score, "acc": accuracy}, ignore_index=True)
    return pd.DataFrame(results)

In [17]:
filepath_out='./../data/randomforest_results_1.csv'
results=test_model(filepath_out, feature_conditions)

train_X Shape: (3444, 20)
train_y Shape: (3444,)
test_X Shape: (862, 20)
test_y Shape: (862,)
RandomForestClassifier(max_depth=10, n_estimators=300, oob_score=True,
                       random_state=18)
train_X Shape: (3440, 30)
train_y Shape: (3440,)
test_X Shape: (861, 30)
test_y Shape: (861,)


  results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv,"tweet_polarity_score_lag": tw_lag, "best_estimators":best_estimators,"down_f1_score":down_score,"up_f1_score":up_score, "acc": accuracy}, ignore_index=True)


RandomForestClassifier(max_depth=10, n_estimators=300, oob_score=True,
                       random_state=18)


  results = results.append({"timestep": step,"features": data.columns.values,"google_trends_lag":g,"tweet_volume_lag": tv,"tweet_polarity_score_lag": tw_lag, "best_estimators":best_estimators,"down_f1_score":down_score,"up_f1_score":up_score, "acc": accuracy}, ignore_index=True)


In [18]:
results.to_csv(filepath_out, index=False)