In [1]:
# Stealing stuff... http://brandonrose.org/clustering
import numpy as np
import pandas as pd
import nltk, re, itertools
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.metrics import *
from collections import Counter
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import SnowballStemmer
from datetime import datetime, timedelta, date
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import KFold
from collections import defaultdict
from data_science_toolkit.dataset_ops import classifier_train_val_test_dfs
from data_science_toolkit.data_visualization import get_fig_ax, visualize_class_distribution, top_n_tokens_plot_from_counter

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin

# Include a paragraph in your final report about why you changed your topic
- For each tweet, predict whether it received more or fewer retweets than the average for that year
- LDA for topic modelling (investigate)
- Apriori for combos of words
- Binary classifier
- In general we use clustering or topic modelling to help understand the data, but we don't usually use them for prediction

In [2]:
# One-time setup: uncomment and run once to fetch the NLTK data packages.
# nltk.download('stopwords')
# nltk.download('punkt')

In [3]:
# Seed handed to the estimators/splitters that accept a random state.
rand_state = 1
# Column of the tweets frame used as the classification target.
output_col = 'above_monthly_avg'

In [4]:
%matplotlib inline
# Never truncate cell contents when displaying frames. ``None`` is the
# "no limit" value; the old ``-1`` sentinel was deprecated in pandas 1.0 and
# is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)
# Allow up to 500 rows before pandas switches to a truncated view.
pd.set_option('display.max_rows', 500)

In [5]:
# Load the tweets table and fix up dtypes before modelling.
stock_tweet = pd.read_csv('./since_election_with_cluster_trump_tweets_sp500.csv')
stock_tweet['created_at'] = pd.to_datetime(stock_tweet['created_at'])

# Columns treated as categorical.
for categorical_col in ('dow', 'num_links', 'created_hour'):
    stock_tweet[categorical_col] = stock_tweet[categorical_col].astype('category')

# num_mentions, num_hashtags and percent_caps are deliberately left numeric:
# they would arguably be better as categorical variables, but there may not be
# enough rows per value for k-fold to work properly (unconfirmed).

In [6]:
# Every column whose name contains '_apr_' is converted to a categorical dtype.
apr_cols = [name for name in stock_tweet.columns.tolist() if '_apr_' in name]
for apr_col in apr_cols:
    stock_tweet[apr_col] = stock_tweet[apr_col].astype('category')

In [7]:
# English NLTK stop words plus the empty string (which shows up when text is
# split on single spaces).
stop_words = set(stopwords.words('english')) | {''}

# https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0

In [8]:
def filter_tokens_without_letters(text):
    """Tokenize ``text`` on single spaces, keeping only tokens with a letter.

    A token is kept when it contains at least one ASCII letter (a-z or A-Z);
    purely numeric, punctuation-only and empty tokens are dropped.

    Returns the surviving tokens as a list, in their original order.
    """
    has_letter = re.compile('[a-zA-Z]').search
    return [piece for piece in text.split(" ") if has_letter(piece)]

def split_text_only(text):
    """Tokenize ``text`` by splitting on single spaces, with no filtering.

    Consecutive spaces therefore yield empty-string tokens.
    """
    pieces = text.split(" ")
    return pieces

In [11]:
class TextSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one field from ``X``.

    ``transform`` performs a single-bracket lookup, ``X[field]``.
    """

    def __init__(self, field):
        # Kept under the same name as the constructor argument.
        self.field = field

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X[self.field]``."""
        selected = X[self.field]
        return selected
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one field from ``X`` via a list key.

    ``transform`` performs a double-bracket lookup, ``X[[field]]``.
    """

    def __init__(self, field):
        # Kept under the same name as the constructor argument.
        self.field = field

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X[[self.field]]``."""
        wanted = [self.field]
        return X[wanted]

In [12]:
def create_models(numeric_features, categorical_features):
    """Build the two unfitted classifier pipelines compared in this notebook.

    Each pipeline is ``preprocessing -> classifier``. The preprocessing step
    (named ``'yeet'``) is a FeatureUnion of three branches:

    * ``preprocessor``: median imputation + standard scaling of
      ``numeric_features`` and one-hot encoding of ``categorical_features``
      (``handle_unknown='error'``, so a category not seen during fit raises at
      transform time).
    * ``text``: TF-IDF over the ``'preprocessed_text'`` column, 1- to 4-grams,
      tokenized with ``filter_tokens_without_letters``.
    * ``bow``: raw counts over the same column, 1- to 5-grams, tokenized with
      ``split_text_only``.

    Every returned pipeline gets its OWN preprocessing objects. Previously both
    pipelines wrapped the same preprocessing instance, so fitting one pipeline
    silently refit (and overwrote) the transformers used by the other.

    Parameters
    ----------
    numeric_features : list of str
        Column names routed through the numeric transformer.
    categorical_features : list of str
        Column names routed through the one-hot encoder.

    Returns
    -------
    (svm_rbf, random_forest) : tuple of Pipeline
        An RBF-kernel SVC (with probability estimates enabled) and a
        100-tree random forest seeded with the module-level ``rand_state``.
    """
    def build_preprocessing():
        # Fresh transformer instances on every call, so nothing fitted is
        # shared between the models returned below.
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())])
        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='error', categories='auto'))])
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])
        return Pipeline([
            ('features', FeatureUnion([
                ('preprocessor', preprocessor),
                ('text', Pipeline([
                    ('colext', TextSelector('preprocessed_text')),
                    ('tfidf', TfidfVectorizer(max_df=0.9,min_df=3,use_idf=True, tokenizer=filter_tokens_without_letters, ngram_range=(1,4)))
                ])),
                ('bow', Pipeline([
                    ('colext', TextSelector('preprocessed_text')),
                    ('bow', CountVectorizer(max_df=0.5, min_df=5, tokenizer=split_text_only, ngram_range=(1, 5)))
                ])),
            ]))
        ])

#     svm_linear = Pipeline(steps=[('yeet', build_preprocessing()),
#                           ('classifier', SVC(probability=True, gamma='scale', kernel='linear'))])
    svm_rbf = Pipeline(steps=[('yeet', build_preprocessing()),
                          ('classifier', SVC(probability=True, gamma='scale', kernel='rbf'))])
    random_forest = Pipeline(steps=[('yeet', build_preprocessing()),
                            ('classifier', RandomForestClassifier(n_estimators=100, random_state=rand_state))])
    return svm_rbf, random_forest

In [13]:
# Per-fold metric stores; each maps 'roc' / 'acc' to a list with one entry per fold.
model_1 = defaultdict(list)    # RBF-kernel SVM
model_2 = defaultdict(list)    # random forest
model_avg = defaultdict(list)  # mean of the two models' predicted probabilities

# The feature lists are identical for every fold, so build them once.
numeric_features = ['num_words', 'percent_caps', 'num_mentions', 'num_hashtags']
categorical_features = ['created_hour', 'dow', 'num_links'] + [x for x in stock_tweet.columns.tolist() if '_apr_' in x]


def record_fold_scores(scores, y_true, positive_proba):
    """Append one fold's ROC AUC and accuracy to ``scores``.

    Accuracy is computed on ``np.round(positive_proba)``, i.e. the
    probabilities thresholded at 0.5.
    """
    scores['roc'].append(roc_auc_score(y_true, positive_proba))
    scores['acc'].append(accuracy_score(y_true, np.round(positive_proba)))


# Contiguous, unshuffled folds. ``random_state`` is only meaningful together
# with ``shuffle=True``; scikit-learn >= 0.24 raises a ValueError when it is
# passed with shuffle disabled, and older versions ignored it. Leaving it out
# keeps exactly the same folds. Use ``shuffle=True, random_state=rand_state``
# instead if randomized folds are wanted.
kf = KFold(n_splits=3, shuffle=False)
for train_index, test_index in kf.split(stock_tweet):
    print("New Fold...")
    curr_train_df = stock_tweet.iloc[train_index]
    curr_train_out = curr_train_df[output_col].tolist()
    curr_test_df = stock_tweet.iloc[test_index]
    curr_test_out = curr_test_df[output_col].tolist()

    # Fresh, unfitted models for every fold.
    svm_rbf, random_forest = create_models(numeric_features, categorical_features)

    random_forest.fit(curr_train_df, curr_train_out)
    rf_out = random_forest.predict_proba(curr_test_df)
    # Column 1 is the probability of the second entry of classes_
    # (presumably the positive label - verify against the target encoding).
    record_fold_scores(model_2, curr_test_out, rf_out[:,1])

    svm_rbf.fit(curr_train_df, curr_train_out)
    svm_rbf_out = svm_rbf.predict_proba(curr_test_df)
    record_fold_scores(model_1, curr_test_out, svm_rbf_out[:,1])

    # Simple ensemble: unweighted mean of the two models' probabilities.
    averaged_preds = np.mean(np.array([rf_out[:,1], svm_rbf_out[:,1]]), axis=0)
    record_fold_scores(model_avg, curr_test_out, averaged_preds)

    print(model_1)
    print(model_2)
    print(model_avg)

New Fold...
defaultdict(<class 'list'>, {'acc': [0.6987138263665594], 'roc': [0.7560482969043508]})
defaultdict(<class 'list'>, {'acc': [0.6787781350482315], 'roc': [0.7375334701456074]})
defaultdict(<class 'list'>, {'acc': [0.6938906752411576], 'roc': [0.758147605242299]})
New Fold...
defaultdict(<class 'list'>, {'acc': [0.6987138263665594, 0.7237942122186495], 'roc': [0.7560482969043508, 0.7833792266708868]})
defaultdict(<class 'list'>, {'acc': [0.6787781350482315, 0.684887459807074], 'roc': [0.7375334701456074, 0.7554391517645197]})
defaultdict(<class 'list'>, {'acc': [0.6938906752411576, 0.7160771704180064], 'roc': [0.758147605242299, 0.7819186580155087]})
New Fold...
defaultdict(<class 'list'>, {'acc': [0.6987138263665594, 0.7237942122186495, 0.6876809263428755], 'roc': [0.7560482969043508, 0.7833792266708868, 0.7310420222751447]})
defaultdict(<class 'list'>, {'acc': [0.6787781350482315, 0.684887459807074, 0.6516564811836604], 'roc': [0.7375334701456074, 0.7554391517645197, 0.6888

In [14]:
# Mean accuracy and ROC AUC across folds, printed in the order
# model_1, model_2, model_avg.
for scores in (model_1, model_2, model_avg):
    mean_acc = np.mean(scores['acc'])
    mean_roc = np.mean(scores['roc'])
    print("Acc: {}, ROC: {}".format(mean_acc, mean_roc))

Acc: 0.7033963216426948, ROC: 0.7568231819501275
Acc: 0.671774025346322, ROC: 0.7272819556217239
Acc: 0.6963214358480049, ROC: 0.753530476989576
