In [1]:
import numpy as np
import pandas as pd
import nltk, re, itertools
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.metrics import *
from collections import Counter
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import SnowballStemmer
from datetime import datetime, timedelta, date
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import KFold
from collections import defaultdict
from data_science_toolkit.dataset_ops import classifier_train_val_test_dfs
from data_science_toolkit.data_visualization import get_fig_ax, visualize_class_distribution, top_n_tokens_plot_from_counter

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import f1_score

# Include a paragraph in your final report about why you changed your topic
- Predict more or less than avg number of retweets for that year for each tweet
- LDA for topic modelling (investigate)
- Apriori for combos of words
- Binary classifier
- In general we use clustering or topic modelling to help understand the data, but we don't usually use it for prediction

In [2]:
file_name = 'since_election_with_cluster_elonmusk_twint_preprocessed.csv'
# file_name = "since_election_with_cluster_trump_tweets_sp500.csv"

In [3]:
# nltk.download('stopwords')
# nltk.download('punkt')

In [4]:
output_col = 'above_monthly_avg'
rand_state = 1

In [5]:
%matplotlib inline
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)

In [6]:
stock_tweet = pd.read_csv("./{}".format(file_name))
stock_tweet['created_at']= pd.to_datetime(stock_tweet['created_at']) 
stock_tweet['dow'] = stock_tweet.dow.astype('category')
stock_tweet['num_links'] = stock_tweet.num_links.astype('category')
stock_tweet['created_hour'] = stock_tweet.created_hour.astype('category')
# These would be better as categorical variables, but there are not enough of them for k fold to work properly?
# stock_tweet['num_mentions'] = stock_tweet.num_mentions.astype('category')
# stock_tweet['num_hashtags'] = stock_tweet.num_hashtags.astype('category')
# stock_tweet['percent_caps'] = stock_tweet.percent_caps.astype('category')

In [7]:
# for col in [x for x in stock_tweet.columns.tolist() if '_apr_' in x]:
#     stock_tweet[col] = stock_tweet[col].astype('category')

In [8]:
stop_words = set(stopwords.words('english')).union({''})

# https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0

In [9]:
def filter_tokens_without_letters(text):
    tokens = text.split(" ")
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

def split_text_only(text):
    return text.split(" ")

In [10]:
class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.field]
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.field]]

In [15]:
def create_models(numeric_features, categorical_features):
    # We create the preprocessing pipelines for both numeric and categorical data.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='error', categories='auto'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    preprocessing_pipeline = Pipeline([
        ('features', FeatureUnion([
            ('preprocessor', preprocessor),
            ('text', Pipeline([
                ('colext', TextSelector('preprocessed_text')),
                ('tfidf', TfidfVectorizer(max_df=0.9,min_df=3,use_idf=True, tokenizer=filter_tokens_without_letters, ngram_range=(1,4)))
            ])),
            ('bow', Pipeline([
                ('colext', TextSelector('preprocessed_text')),
                ('bow', CountVectorizer(max_df=0.5, min_df=5, tokenizer=split_text_only, ngram_range=(1, 5)))
            ])),
        ]))
    ])
#     svm_linear = Pipeline(steps=[('yeet', preprocessing_pipeline),
#                           ('classifier', SVC(probability=True, gamma='scale', kernel='linear'))])
    svm_rbf = Pipeline(steps=[('yeet', preprocessing_pipeline),
                          ('classifier', SVC(probability=True, gamma='scale', kernel='rbf'))])
    random_forest = Pipeline(steps=[('yeet', preprocessing_pipeline),
                            ('classifier', RandomForestClassifier(n_estimators=100, random_state=rand_state))])
    log_reg = Pipeline(steps=[('yeet', preprocessing_pipeline),
                            ('classifier', LogisticRegression(solver='lbfgs'))])
    return svm_rbf, random_forest, log_reg

In [None]:
svm_rbf_results = defaultdict(list)
random_forest_results = defaultdict(list)
log_reg_results = defaultdict(list)
svm_rbf_with_extra_features_results = defaultdict(list)
random_forest_with_extra_features_results = defaultdict(list)
log_reg_results = defaultdict(list)
log_reg_with_extra_features_results = defaultdict(list)
# model_avg = defaultdict(list)
kf = KFold(n_splits=2, random_state=rand_state)
for train_index, test_index in kf.split(stock_tweet):
    print("New Fold...")
    curr_train_df = stock_tweet.iloc[train_index]
    curr_train_out = curr_train_df[output_col].tolist()
    curr_test_df = stock_tweet.iloc[test_index]
    curr_test_out = curr_test_df[output_col].tolist()
    numeric_features = ['num_words', 'percent_caps', 'num_mentions', 'num_hashtags', "num_photos", "num_videos"]
    categorical_features = ['created_hour', 'dow', 'num_links']# + [x for x in stock_tweet.columns.tolist() if '_apr_' in x]
    svm_rbf, random_forest, log_reg = create_models([], [])
    svm_rbf_with_extra_features, random_forest_with_extra_features, log_reg_with_extra_features = create_models(numeric_features, categorical_features)
    # Random Forest
    random_forest.fit(curr_train_df, curr_train_out)
    rf_out = random_forest.predict_proba(curr_test_df)
    random_forest_results['roc'].append(roc_auc_score(curr_test_out, rf_out[:,1]))
    random_forest_results['acc'].append(accuracy_score(curr_test_out, np.round(rf_out[:,1])))
    random_forest_results['f1'].append(f1_score(curr_test_out, np.round(rf_out[:,1])))
    # SVM RBF
    svm_rbf.fit(curr_train_df, curr_train_out)
    svm_rbf_out = svm_rbf.predict_proba(curr_test_df)
    svm_rbf_results['roc'].append(roc_auc_score(curr_test_out, svm_rbf_out[:,1]))
    svm_rbf_results['acc'].append(accuracy_score(curr_test_out, np.round(svm_rbf_out[:,1])))
    svm_rbf_results['f1'].append(f1_score(curr_test_out, np.round(svm_rbf_out[:,1])))
    # Log Reg
    log_reg.fit(curr_train_df, curr_train_out)
    log_reg_out = log_reg.predict_proba(curr_test_df)
    log_reg_results['roc'].append(roc_auc_score(curr_test_out, log_reg_out[:,1]))
    log_reg_results['acc'].append(accuracy_score(curr_test_out, np.round(log_reg_out[:,1])))
    log_reg_results['f1'].append(f1_score(curr_test_out, np.round(log_reg_out[:,1])))
    # Random Forest with Extra Features:
    random_forest_with_extra_features.fit(curr_train_df, curr_train_out)
    rf_out = random_forest_with_extra_features.predict_proba(curr_test_df)
    random_forest_with_extra_features_results['roc'].append(roc_auc_score(curr_test_out, rf_out[:,1]))
    random_forest_with_extra_features_results['acc'].append(accuracy_score(curr_test_out, np.round(rf_out[:,1])))
    random_forest_with_extra_features_results['f1'].append(f1_score(curr_test_out, np.round(rf_out[:,1])))
    # SVM RBF with Extra Features
    svm_rbf_with_extra_features.fit(curr_train_df, curr_train_out)
    svm_rbf_with_extra_features_out = svm_rbf_with_extra_features.predict_proba(curr_test_df)
    svm_rbf_with_extra_features_results['roc'].append(roc_auc_score(curr_test_out, svm_rbf_with_extra_features_out[:,1]))
    svm_rbf_with_extra_features_results['acc'].append(accuracy_score(curr_test_out, np.round(svm_rbf_with_extra_features_out[:,1])))
    svm_rbf_with_extra_features_results['f1'].append(f1_score(curr_test_out, np.round(svm_rbf_with_extra_features_out[:,1])))
    # Log Reg
    log_reg_with_extra_features.fit(curr_train_df, curr_train_out)
    log_reg_with_extra_features_results_out = log_reg_with_extra_features.predict_proba(curr_test_df)
    log_reg_with_extra_features_results['roc'].append(roc_auc_score(curr_test_out, log_reg_with_extra_features_results_out[:,1]))
    log_reg_with_extra_features_results['acc'].append(accuracy_score(curr_test_out, np.round(log_reg_with_extra_features_results_out[:,1])))
    log_reg_with_extra_features_results['f1'].append(f1_score(curr_test_out, np.round(log_reg_with_extra_features_results_out[:,1])))
    
#     averaged_preds = np.mean(np.array([rf_out[:,1], svm_rbf_out[:,1]]), axis=0)
#     model_avg['roc'].append(roc_auc_score(curr_test_out, averaged_preds))
#     model_avg['acc'].append(accuracy_score(curr_test_out, np.round(averaged_preds)))
    print("SVM RBF")
    print(svm_rbf_results)
    print("SVM RBF with extra features")
    print(svm_rbf_with_extra_features_results)
    print("Random Forest")
    print(random_forest_results)
    print("Random Forest with extra features")
    print(random_forest_with_extra_features_results)
    print("Logistic Regression")
    print(log_reg_results)
    print("Logistic Regression with extra features")
    print(log_reg_with_extra_features_results)

New Fold...




SVM RBF
defaultdict(<class 'list'>, {'acc': [0.8180936995153474], 'roc': [0.7399191220112207], 'f1': [0.18287373004354135]})
SVM RBF with extra features
defaultdict(<class 'list'>, {'acc': [0.8180936995153474], 'roc': [0.7378746800733715], 'f1': [0.19225251076040173]})
Random Forest
defaultdict(<class 'list'>, {'acc': [0.8193861066235865], 'roc': [0.7070455699058147], 'f1': [0.23319615912208505]})
Random Forest with extra features
defaultdict(<class 'list'>, {'acc': [0.8206785137318255], 'roc': [0.7266225282059132], 'f1': [0.24076607387140905]})
Logistic Regression
defaultdict(<class 'list'>, {'acc': [0.8164781906300485], 'roc': [0.6929285136873542], 'f1': [0.32057416267942584]})
Logistic Regression with extra features
defaultdict(<class 'list'>, {'acc': [0.8148626817447496], 'roc': [0.7068240047696931], 'f1': [0.32189349112426036]})
New Fold...


In [18]:
for model in [svm_rbf_results, random_forest_results, log_reg_results, svm_rbf_with_extra_features_results, random_forest_with_extra_features_results, log_reg_with_extra_features_results]:
    print("Acc: {}, ROC: {}".format(np.mean(model['acc']), 
                                    np.mean(model['roc']),
                                    np.mean(model['f1'])))

Acc: 0.851373182552504, ROC: 0.7597295776506516
Acc: 0.8466882067851373, ROC: 0.7482297929084507
Acc: 0.8389337641357028, ROC: 0.7495433437255496
Acc: 0.8502423263327948, ROC: 0.7600222903636887
Acc: 0.8491114701130856, ROC: 0.7574460987665825
Acc: 0.8350565428109855, ROC: 0.7555562392031534
