## Data Preprocessing

### read data

In [1]:
import pandas as pd
import csv

In [2]:
trump_tweets = pd.read_csv('../data/RAW_trump_tweets_20160101_20200930.csv', parse_dates=['created_at'], quoting=csv.QUOTE_NONE)
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,I won the debate big based on compilation of p...,2020-10-01 15:14:28,44961,337926,False,1311685923097260034
1,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-10-01 03:45:25,19616,65721,False,1311512518800470016
2,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-10-01 03:25:31,29393,0,True,1311507509958471680
3,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-10-01 03:00:33,15992,63294,False,1311501225423073281
4,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-10-01 02:59:02,51445,190750,False,1311500843309387781


In [3]:
trump_tweets.dtypes

source                    object
text                      object
created_at        datetime64[ns]
retweet_count              int64
favorite_count             int64
is_retweet                object
id_str                     int64
dtype: object

### convert GMT into US Eastern timezone

In [4]:
import datetime
import pytz

In [5]:
gmt = pytz.timezone('GMT')
us_eastern = pytz.timezone('US/Eastern')

In [6]:
def convert_to_us_eastern(row):
    date = row['created_at']
    date_gmt = gmt.localize(date)
    date_us_eastern = date_gmt.astimezone(us_eastern).tz_localize(None) # remove tz info to obtain naive local time
    return date_us_eastern

In [7]:
trump_tweets['created_at'] = trump_tweets.apply(convert_to_us_eastern, axis=1)
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,I won the debate big based on compilation of p...,2020-10-01 11:14:28,44961,337926,False,1311685923097260034
1,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-09-30 23:45:25,19616,65721,False,1311512518800470016
2,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-09-30 23:25:31,29393,0,True,1311507509958471680
3,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-09-30 23:00:33,15992,63294,False,1311501225423073281
4,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781


### filter tweets from 1 Jan 2016 to 30 Sep 2020

In [8]:
start_date = datetime.datetime.strptime('2016-01-01', '%Y-%m-%d')
end_date = datetime.datetime.strptime('2020-10-01', '%Y-%m-%d')

In [9]:
trump_tweets = trump_tweets[(trump_tweets.created_at >= start_date) & (trump_tweets.created_at <= end_date)]
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
1,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-09-30 23:45:25,19616,65721,False,1311512518800470016
2,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-09-30 23:25:31,29393,0,True,1311507509958471680
3,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-09-30 23:00:33,15992,63294,False,1311501225423073281
4,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781
5,Twitter for iPhone,In just 3 and a half years we have secured Ame...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355


### extract hashtags

In [10]:
import re

In [11]:
# create a new column to store the hashtags
trump_tweets['hashtag'] = trump_tweets['text'].apply(lambda x: re.findall(r'\B#\w*[a-zA-Z]+\w*', x))
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag
1,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-09-30 23:45:25,19616,65721,False,1311512518800470016,[]
2,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-09-30 23:25:31,29393,0,True,1311507509958471680,[]
3,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-09-30 23:00:33,15992,63294,False,1311501225423073281,[]
4,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781,[]
5,Twitter for iPhone,In just 3 and a half years we have secured Ame...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355,[#MAGA]


### extract mentions

In [12]:
# extract mentions
# for retweets, this regex expression only extracts mentions, but ignores the user handle of the OP
# e.g. the regex expression on "RT @realDonaldTrump: Biden for resident! @JoeBiden" will extract @JoeBiden but not @realDonaldTrump
# as @realDonaldTrump is the OP of the tweet that has been retweeted, rather than a direct mention in the retweet.
trump_tweets['mention'] = trump_tweets['text'].apply(lambda x: re.findall(r'(?<!RT\s)@[a-zA-Z0-9]{1,15}', x))
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention
1,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-09-30 23:45:25,19616,65721,False,1311512518800470016,[],[@HerschelWalker]
2,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-09-30 23:25:31,29393,0,True,1311507509958471680,[],[]
3,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-09-30 23:00:33,15992,63294,False,1311501225423073281,[],[]
4,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781,[],[]
5,Twitter for iPhone,In just 3 and a half years we have secured Ame...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355,[#MAGA],[]


### Extract hashtag & mention counts
In addition, hashtag count and mention count are added as additional features as well.

In [13]:
trump_tweets['hashtag_count'] = trump_tweets['hashtag'].apply(lambda x: len(x))
trump_tweets['mention_count'] = trump_tweets['mention'].apply(lambda x: len(x))

In [14]:
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,hashtag_count,mention_count
1,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-09-30 23:45:25,19616,65721,False,1311512518800470016,[],[@HerschelWalker],0,1
2,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-09-30 23:25:31,29393,0,True,1311507509958471680,[],[],0,0
3,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-09-30 23:00:33,15992,63294,False,1311501225423073281,[],[],0,0
4,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781,[],[],0,0
5,Twitter for iPhone,In just 3 and a half years we have secured Ame...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355,[#MAGA],[],1,0


### prepare text for modelling
perform further cleaning on the original tweets with the aim of performing various text modelling. The preprocessed tweets will be stored as a separate feature.

In [15]:
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [16]:
def text_processing(text, 
                    lower=True,  
                    remove_handles=True,
                    remove_hashtags=False,
                    remove_url=True,
                    remove_punctuations=True, 
                    remove_numbers=True,
                    remove_stopwords=True,
                    additional_stopwords=['RT'],
                    keep_stopwords = ["no", "not", "nor"],
                    lemmatize=True
                   ):
    '''
    Accepts a text and options to run the following processing functions.
    - remove_handles removes not just the mentions, but also the OP's Twitter handle in retweets
    - by default, remove_hashtags is set to False, as hashtags converys information useful for prediction. They also do convey sentiment and emotion
    '''
    
    # strip non-ascii characters
    text = text.encode('ascii', errors='ignore')
    text = str(text.decode("utf-8"))

    # covert to lowercase
    if lower:
        text = text.lower()
        
    # remove handles 
    if remove_handles:
        text = re.sub(r'@[a-zA-Z0-9]{1,15}', '', text)
        
    # remove hashtags
    if remove_hashtags:
        text = re.sub(r'\B#\w*[a-zA-Z]+\w*', '', text)
        
    # remove url 
    if remove_url:
        text = re.sub(r'http\S+', '', text)
    
    # remove punctuations
    if remove_punctuations:
        text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
    
    # remove numbers
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    
    # remove unnecessary new lines and whitespaces
    text = text.replace("\n", "") 
    text = ' '.join(text.split())
    
    # tokenize
    text_words = nltk.word_tokenize(text)

    # remove stop words
    if remove_stopwords:
        stop = set(stopwords.words('english'))
        # remove words from the predefined stopwords set
        if not keep_stopwords is None and len(keep_stopwords) != 0:
            for word in keep_stopwords:
                stop.discard(word)
        # add additional words to the stopwords set
        if not additional_stopwords is None and len(additional_stopwords) != 0:
            for word in additional_stopwords:
                # convert the additional stopwords to lowercase if lower is set to True
                if lower:
                    word = word.lower()
                stop.add(word)
        # finally, remove the stopwords from the tweets
        text_words = [x for x in text_words if not x in stop]
        
    # lemmatize
    if lemmatize:
        wordnet_lemmatizer = WordNetLemmatizer()
        text_words = [wordnet_lemmatizer.lemmatize(x, pos="v") for x in text_words]
    
    return ' '.join(text_words)

In [17]:
trump_tweets['cleaned_text'] = trump_tweets['text'].apply(lambda x: text_processing(x))
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,hashtag_count,mention_count,cleaned_text
1,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-09-30 23:45:25,19616,65721,False,1311512518800470016,[],[@HerschelWalker],0,1,thank
2,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-09-30 23:25:31,29393,0,True,1311507509958471680,[],[],0,0,big news maine court side rnc uphold ban ballo...
3,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-09-30 23:00:33,15992,63294,False,1311501225423073281,[],[],0,0,thank paul
4,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781,[],[],0,0,defective ballot new york want replace happen ...
5,Twitter for iPhone,In just 3 and a half years we have secured Ame...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355,[#MAGA],[],1,0,half years secure americas border rebuild awes...


### Extract meta information from text data

The purpose of feature engineering is to extract more information from the text data and use the extracted information as features. Such text/NLP based features could include:
<br>
* Word Count – total number of words in the documents
* Character Count – total number of characters in the documents
* Average Word Density – average length of the words used in the documents
* Punctuation Frequency – total number of punctuation marks / total number of words in the documents
* Upper Case Frequency – total number of upper case words / total number of words in the documents
<br>

Since length of a tweet varies, frequency is used instead of absolute count for Puncutation and Upper Case metrics.

Handles, hashtags, numbers, stopwords are retained from the original tweet, whereas urls are removed. No lemmatization is performed.

In [18]:
# define a function to generate the new features
def add_text_features(df):
    
    text_for_feature_extraction = trump_tweets['text'].apply(lambda x: text_processing(x, 
                                                                                       lower=False,  
                                                                                       remove_handles=False,
                                                                                       remove_hashtags=False,
                                                                                       remove_url=True,
                                                                                       remove_punctuations=False, 
                                                                                       remove_numbers=False,
                                                                                       remove_stopwords=False,
                                                                                       additional_stopwords=None,
                                                                                       keep_stopwords=None,
                                                                                       lemmatize=False)
                                                            )
    df['word_count'] = text_for_feature_extraction.apply(lambda x: len(x.split()))
    df['char_count'] = text_for_feature_extraction.apply(len)
    df['word_density'] = df['char_count'] / df['word_count']
    df['punctuation_freq'] = text_for_feature_extraction.apply(lambda x: 
                                                               len("".join(_ for _ in x if _ in string.punctuation))
                                                              ) / df['word_count']
    df['upper_case_freq'] = text_for_feature_extraction.apply(lambda x: 
                                                              len([wrd for wrd in x.split() if wrd.isupper()])
                                                             ) / df['word_count']
    return df

In [19]:
trump_tweets = add_text_features(trump_tweets)
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,hashtag_count,mention_count,cleaned_text,word_count,char_count,word_density,punctuation_freq,upper_case_freq
1,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-09-30 23:45:25,19616,65721,False,1311512518800470016,[],[@HerschelWalker],0,1,thank,5,28,5.6,0.4,0.0
2,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-09-30 23:25:31,29393,0,True,1311507509958471680,[],[],0,0,big news maine court side rnc uphold ban ballo...,24,138,5.75,0.291667,0.166667
3,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-09-30 23:00:33,15992,63294,False,1311501225423073281,[],[],0,0,thank paul,4,16,4.0,0.25,0.0
4,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781,[],[],0,0,defective ballot new york want replace happen ...,42,199,4.738095,0.119048,0.857143
5,Twitter for iPhone,In just 3 and a half years we have secured Ame...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355,[#MAGA],[],1,0,half years secure americas border rebuild awes...,43,239,5.55814,0.093023,0.116279


### reset index

In [20]:
trump_tweets = trump_tweets.reset_index().drop(columns=['index'])
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,hashtag_count,mention_count,cleaned_text,word_count,char_count,word_density,punctuation_freq,upper_case_freq
0,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-09-30 23:45:25,19616,65721,False,1311512518800470016,[],[@HerschelWalker],0,1,thank,5,28,5.6,0.4,0.0
1,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-09-30 23:25:31,29393,0,True,1311507509958471680,[],[],0,0,big news maine court side rnc uphold ban ballo...,24,138,5.75,0.291667,0.166667
2,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-09-30 23:00:33,15992,63294,False,1311501225423073281,[],[],0,0,thank paul,4,16,4.0,0.25,0.0
3,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781,[],[],0,0,defective ballot new york want replace happen ...,42,199,4.738095,0.119048,0.857143
4,Twitter for iPhone,In just 3 and a half years we have secured Ame...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355,[#MAGA],[],1,0,half years secure americas border rebuild awes...,43,239,5.55814,0.093023,0.116279


### Adjust column order
shift `cleaned_text` to just after the `text` column

In [21]:
col_names = trump_tweets.columns.tolist()
col_names = col_names[:2] + col_names[11:12] + col_names[2:11] + col_names[12:]
trump_tweets = trump_tweets[col_names]
trump_tweets.head()

Unnamed: 0,source,text,cleaned_text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,hashtag_count,mention_count,word_count,char_count,word_density,punctuation_freq,upper_case_freq
0,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,thank,2020-09-30 23:45:25,19616,65721,False,1311512518800470016,[],[@HerschelWalker],0,1,5,28,5.6,0.4,0.0
1,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,big news maine court side rnc uphold ban ballo...,2020-09-30 23:25:31,29393,0,True,1311507509958471680,[],[],0,0,24,138,5.75,0.291667,0.166667
2,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,thank paul,2020-09-30 23:00:33,15992,63294,False,1311501225423073281,[],[],0,0,4,16,4.0,0.25,0.0
3,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,defective ballot new york want replace happen ...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781,[],[],0,0,42,199,4.738095,0.119048,0.857143
4,Twitter for iPhone,In just 3 and a half years we have secured Ame...,half years secure americas border rebuild awes...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355,[#MAGA],[],1,0,43,239,5.55814,0.093023,0.116279


## Sentiment Analysis
Attach sentiment scores to each tweet

### VADER
* Advantage: Works well for social media text, including emojis and slangs
* Disadvantage: Out of Vocab words are classified as neutral
* https://www.nltk.org/_modules/nltk/sentiment/vader.html
* https://github.com/cjhutto/vaderSentiment#about-the-scoring

In [22]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [23]:
analyzer = SentimentIntensityAnalyzer()

In [24]:
# preprocess the original tweets for vader sentiment analysis
text_for_vader = trump_tweets['text'].apply(lambda x: text_processing(x, 
                                                                      lower=False,  
                                                                      remove_handles=True,
                                                                      remove_hashtags=False,
                                                                      remove_url=True,
                                                                      remove_punctuations=False, 
                                                                      remove_numbers=True,
                                                                      remove_stopwords=True,
                                                                      additional_stopwords=['RT'],
                                                                      keep_stopwords = ["no", "not", "nor"],
                                                                      lemmatize=False))

In [25]:
def get_vader_sentiment(text):
    """"
    polarity score represents the proportion of texts that falls under this category.
    positive sentiment: compound score >= 0.05
    neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
    negative sentiment: compound score <= -0.05
    """
    return list(analyzer.polarity_scores(text).values())

In [26]:
trump_tweets['neg_sentiment'], trump_tweets['neu_sentiment'], trump_tweets['pos_sentiment'], trump_tweets['compound_sentiment'] = zip(*text_for_vader.map(get_vader_sentiment))

In [27]:
trump_tweets.head()

Unnamed: 0,source,text,cleaned_text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,...,mention_count,word_count,char_count,word_density,punctuation_freq,upper_case_freq,neg_sentiment,neu_sentiment,pos_sentiment,compound_sentiment
0,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,thank,2020-09-30 23:45:25,19616,65721,False,1311512518800470016,[],[@HerschelWalker],...,1,5,28,5.6,0.4,0.0,0.0,0.0,1.0,0.4199
1,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,big news maine court side rnc uphold ban ballo...,2020-09-30 23:25:31,29393,0,True,1311507509958471680,[],[],...,0,24,138,5.75,0.291667,0.166667,0.0,1.0,0.0,0.0
2,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,thank paul,2020-09-30 23:00:33,15992,63294,False,1311501225423073281,[],[],...,0,4,16,4.0,0.25,0.0,0.0,0.264,0.736,0.4199
3,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,defective ballot new york want replace happen ...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781,[],[],...,0,42,199,4.738095,0.119048,0.857143,0.176,0.796,0.028,-0.7988
4,Twitter for iPhone,In just 3 and a half years we have secured Ame...,half years secure americas border rebuild awes...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355,[#MAGA],[],...,0,43,239,5.55814,0.093023,0.116279,0.202,0.61,0.188,-0.126


## Join tweets with stock prices
Preprocessing to generate the joined dataset with trump's tweets and log returns

### Map tweets to the correct market dates
If a tweet was posted during non-trading hours, it is mapped to the date of the next trading day.

In [28]:
from datetime import *
import numpy as np

In [29]:
#creating a new column indicate the date that the tweet was posted
trump_tweets['created_at'] = pd.to_datetime(trump_tweets['created_at'])
trump_tweets['created_date'] = trump_tweets['created_at'].dt.date
trump_tweets['created_time'] = trump_tweets['created_at'].dt.time

In [30]:
#create a column of indicating if the tweets time passes the closing time of the stock
closingTime = pd.to_datetime("2020-01-01 16:00:00").time()
trump_tweets['passed_closing'] = np.where(trump_tweets['created_time'] >= closingTime, 1, 0)
trump_tweets.head()

Unnamed: 0,source,text,cleaned_text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,...,word_density,punctuation_freq,upper_case_freq,neg_sentiment,neu_sentiment,pos_sentiment,compound_sentiment,created_date,created_time,passed_closing
0,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,thank,2020-09-30 23:45:25,19616,65721,False,1311512518800470016,[],[@HerschelWalker],...,5.6,0.4,0.0,0.0,0.0,1.0,0.4199,2020-09-30,23:45:25,1
1,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,big news maine court side rnc uphold ban ballo...,2020-09-30 23:25:31,29393,0,True,1311507509958471680,[],[],...,5.75,0.291667,0.166667,0.0,1.0,0.0,0.0,2020-09-30,23:25:31,1
2,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,thank paul,2020-09-30 23:00:33,15992,63294,False,1311501225423073281,[],[],...,4.0,0.25,0.0,0.0,0.264,0.736,0.4199,2020-09-30,23:00:33,1
3,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,defective ballot new york want replace happen ...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781,[],[],...,4.738095,0.119048,0.857143,0.176,0.796,0.028,-0.7988,2020-09-30,22:59:02,1
4,Twitter for iPhone,In just 3 and a half years we have secured Ame...,half years secure americas border rebuild awes...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355,[#MAGA],[],...,5.55814,0.093023,0.116279,0.202,0.61,0.188,-0.126,2020-09-30,22:51:05,1


In [31]:
# if it passes 4:00, then the date will plus 1
trump_tweets['Date'] = ''
for i in range(len(trump_tweets)):
    if (trump_tweets['passed_closing'].iloc[i] == 1):
        trump_tweets['Date'].iloc[i] = trump_tweets['created_date'].iloc[i] + timedelta(days=1)
    else:
        trump_tweets['Date'].iloc[i] = trump_tweets['created_date'].iloc[i]
trump_tweets['Date'] = pd.to_datetime(trump_tweets['Date'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [32]:
trump_tweets.head()

Unnamed: 0,source,text,cleaned_text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,...,punctuation_freq,upper_case_freq,neg_sentiment,neu_sentiment,pos_sentiment,compound_sentiment,created_date,created_time,passed_closing,Date
0,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,thank,2020-09-30 23:45:25,19616,65721,False,1311512518800470016,[],[@HerschelWalker],...,0.4,0.0,0.0,0.0,1.0,0.4199,2020-09-30,23:45:25,1,2020-10-01
1,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,big news maine court side rnc uphold ban ballo...,2020-09-30 23:25:31,29393,0,True,1311507509958471680,[],[],...,0.291667,0.166667,0.0,1.0,0.0,0.0,2020-09-30,23:25:31,1,2020-10-01
2,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,thank paul,2020-09-30 23:00:33,15992,63294,False,1311501225423073281,[],[],...,0.25,0.0,0.0,0.264,0.736,0.4199,2020-09-30,23:00:33,1,2020-10-01
3,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,defective ballot new york want replace happen ...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781,[],[],...,0.119048,0.857143,0.176,0.796,0.028,-0.7988,2020-09-30,22:59:02,1,2020-10-01
4,Twitter for iPhone,In just 3 and a half years we have secured Ame...,half years secure americas border rebuild awes...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355,[#MAGA],[],...,0.093023,0.116279,0.202,0.61,0.188,-0.126,2020-09-30,22:51:05,1,2020-10-01


### Obtain log return of selected market indices

In [33]:
from pandas_datareader.data import DataReader

In [34]:
def get_data_for_multiple_stocks(tickers, start_date, end_date):
    '''
    Obtain stocks information (Date, OHLC, Volume and Adjusted Close).
    Uses Pandas DataReader to make an API Call to Yahoo Finance and download the data directly.
    Computes other values - Log Return and Arithmetic Return.
    
    Input: List of Stock Tickers
    Output: A dictionary of dataframes for each stock
    '''
    # read in stock data
    s = DataReader(tickers[0], 'yahoo', start_date, end_date)[["Adj Close"]]
    # get log returns
    s[tickers[0]] = np.log(s['Adj Close']/s['Adj Close'].shift(1))
    
    stocks = s[[tickers[0]]]
    
    for ticker in tickers[1:]:
        s = DataReader(ticker, 'yahoo', start_date, end_date)
        s[ticker] = np.log(s['Adj Close']/s['Adj Close'].shift(1))
        stocks[ticker] = s[ticker]
        
    # skip first row that will be na, and fillna by 0 incase there are trading halts on specific days
    return stocks.iloc[1:].fillna(0)

In [35]:
log_returns = get_data_for_multiple_stocks(["^GSPC","^IXIC","VGT"], "2016-01-01", "2020-10-01")
log_returns.index.name = 'Date'
log_returns.reset_index(inplace=True)
log_returns['Date'] = pd.to_datetime(log_returns['Date'])

In [36]:
# join the two dataset to detect null values
joined = trump_tweets.merge(log_returns, on='Date', how='left')

In [37]:
# a function to find the next nearest date
def nearestDate(base, df):
    df = df[(df['Date'] > base)]
    nearness = { abs(base.timestamp() - date.timestamp()) : date for date in df['Date']}
    return nearness[min(nearness.keys())]
#nearestDate(datetime(2020,1,7),log_returns)

In [38]:
# change the null value date with the next available date in the log return
for i in range(len(joined)):
    if (np.isnan(joined['^GSPC'].iloc[i])):
        the_date = joined['Date'].iloc[i]
        #print(the_date)
        the_date = pd.to_datetime(the_date)
        n = nearestDate(the_date,log_returns)
        #print('changed to:')
        #print(n)
        joined['Date'].iloc[i] = n

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [39]:
# merge the dataset again
joined = joined.drop(['^GSPC', '^IXIC', 'VGT'], axis=1)
new_joined = joined.merge(log_returns, on='Date', how='left')

In [40]:
new_joined.head()
# We can drop the columns that we dont need. 

Unnamed: 0,source,text,cleaned_text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,...,neu_sentiment,pos_sentiment,compound_sentiment,created_date,created_time,passed_closing,Date,^GSPC,^IXIC,VGT
0,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,thank,2020-09-30 23:45:25,19616,65721,False,1311512518800470016,[],[@HerschelWalker],...,0.0,1.0,0.4199,2020-09-30,23:45:25,1,2020-10-01,0.005279,0.014137,0.012381
1,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,big news maine court side rnc uphold ban ballo...,2020-09-30 23:25:31,29393,0,True,1311507509958471680,[],[],...,1.0,0.0,0.0,2020-09-30,23:25:31,1,2020-10-01,0.005279,0.014137,0.012381
2,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,thank paul,2020-09-30 23:00:33,15992,63294,False,1311501225423073281,[],[],...,0.264,0.736,0.4199,2020-09-30,23:00:33,1,2020-10-01,0.005279,0.014137,0.012381
3,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,defective ballot new york want replace happen ...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781,[],[],...,0.796,0.028,-0.7988,2020-09-30,22:59:02,1,2020-10-01,0.005279,0.014137,0.012381
4,Twitter for iPhone,In just 3 and a half years we have secured Ame...,half years secure americas border rebuild awes...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355,[#MAGA],[],...,0.61,0.188,-0.126,2020-09-30,22:51:05,1,2020-10-01,0.005279,0.014137,0.012381


In [41]:
# rename Date to market_date
new_joined = new_joined.rename(columns={'Date':'market_date'})
new_joined.columns

Index(['source', 'text', 'cleaned_text', 'created_at', 'retweet_count',
       'favorite_count', 'is_retweet', 'id_str', 'hashtag', 'mention',
       'hashtag_count', 'mention_count', 'word_count', 'char_count',
       'word_density', 'punctuation_freq', 'upper_case_freq', 'neg_sentiment',
       'neu_sentiment', 'pos_sentiment', 'compound_sentiment', 'created_date',
       'created_time', 'passed_closing', 'market_date', '^GSPC', '^IXIC',
       'VGT'],
      dtype='object')

## Save cleaned data

In [42]:
new_joined.to_csv('../data/tweets&sentiment&logreturns.csv', index=False)