## Data Preprocessing

### read data

In [1]:
import pandas as pd
import csv

In [2]:
trump_tweets = pd.read_csv('../data/RAW_trump_tweets_20160101_20200930.csv', parse_dates=['created_at'], quoting=csv.QUOTE_NONE)
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,I won the debate big based on compilation of p...,2020-10-01 15:14:28,44961,337926,False,1311685923097260034
1,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-10-01 03:45:25,19616,65721,False,1311512518800470016
2,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-10-01 03:25:31,29393,0,True,1311507509958471680
3,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-10-01 03:00:33,15992,63294,False,1311501225423073281
4,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-10-01 02:59:02,51445,190750,False,1311500843309387781


In [3]:
trump_tweets.dtypes

source                    object
text                      object
created_at        datetime64[ns]
retweet_count              int64
favorite_count             int64
is_retweet                object
id_str                     int64
dtype: object

### convert GMT into US Eastern timezone

In [4]:
import datetime
import pytz

In [5]:
gmt = pytz.timezone('GMT')
us_eastern = pytz.timezone('US/Eastern')

In [6]:
def convert_to_us_eastern(row):
    date = row['created_at']
    date_gmt = gmt.localize(date)
    date_us_eastern = date_gmt.astimezone(us_eastern).tz_localize(None) # remove tz info to obtain naive local time
    return date_us_eastern

In [7]:
trump_tweets['created_at'] = trump_tweets.apply(convert_to_us_eastern, axis=1)
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,I won the debate big based on compilation of p...,2020-10-01 11:14:28,44961,337926,False,1311685923097260034
1,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,2020-09-30 23:45:25,19616,65721,False,1311512518800470016
2,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,2020-09-30 23:25:31,29393,0,True,1311507509958471680
3,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,2020-09-30 23:00:33,15992,63294,False,1311501225423073281
4,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781


### filter tweets from 1 Jan 2016 to 30 Sep 2020

In [8]:
start_date = datetime.datetime.strptime('2016-01-01', '%Y-%m-%d')
end_date = datetime.datetime.strptime('2020-09-30', '%Y-%m-%d')

In [9]:
trump_tweets = trump_tweets[(trump_tweets.created_at >= start_date) & (trump_tweets.created_at <= end_date)]
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
69,Twitter for iPhone,https://t.co/HUSFkHqsyC,2020-09-29 23:57:50,75761,321378,False,1311153253472636928
70,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640
71,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,2020-09-29 22:37:44,66647,0,True,1311133095651717123
72,Twitter for iPhone,https://t.co/58ssX7EfUj,2020-09-29 22:35:11,20658,65014,False,1311132452853706752
73,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,2020-09-29 22:30:39,14609,48540,False,1311131311965306885


### extract hashtags

In [10]:
import re

In [11]:
# create a new column to store the hashtags
trump_tweets['hashtag'] = trump_tweets['text'].apply(lambda x: re.findall(r'\B#\w*[a-zA-Z]+\w*', x))
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag
69,Twitter for iPhone,https://t.co/HUSFkHqsyC,2020-09-29 23:57:50,75761,321378,False,1311153253472636928,[]
70,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[]
71,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[]
72,Twitter for iPhone,https://t.co/58ssX7EfUj,2020-09-29 22:35:11,20658,65014,False,1311132452853706752,[]
73,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,[#MakeAmericaGreatAgain]


### extract mentions

In [12]:
# extract mentions
# for retweets, this regex expression only extracts mentions, but ignores the user handle of the OP
# e.g. the regex expression on "RT @realDonaldTrump: Biden for resident! @JoeBiden" will extract @JoeBiden but not @realDonaldTrump
# as @realDonaldTrump is the OP of the tweet that has been retweeted, rather than a direct mention in the retweet.
trump_tweets['mention'] = trump_tweets['text'].apply(lambda x: re.findall(r'(?<!RT\s)@[a-zA-Z0-9]{1,15}', x))
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention
69,Twitter for iPhone,https://t.co/HUSFkHqsyC,2020-09-29 23:57:50,75761,321378,False,1311153253472636928,[],[]
70,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[],[]
71,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[],[]
72,Twitter for iPhone,https://t.co/58ssX7EfUj,2020-09-29 22:35:11,20658,65014,False,1311132452853706752,[],[]
73,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,[#MakeAmericaGreatAgain],[]


### prepare text for modelling
perform further cleaning on the original tweets with the aim of performing various text modelling. The preprocessed tweets will be stored as a separate feature.

In [13]:
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [14]:
def text_processing(text, 
                    lower=True,  
                    remove_handles=True,
                    remove_hashtags=False,
                    remove_url=True,
                    remove_punctuations=True, 
                    remove_numbers=True,
                    remove_stopwords=True,
                    additional_stopwords=['RT'],
                    keep_stopwords = ["no", "not", "nor"],
                    lemmatize=True
                   ):
    '''
    Accepts a text and options to run the following processing functions.
    - remove_handles removes not just the mentions, but also the OP's Twitter handle in retweets
    - by default, remove_hashtags is set to False, as hashtags converys information useful for prediction. They also do convey sentiment and emotion
    '''
    
    # strip non-ascii characters
    text = text.encode('ascii', errors='ignore')
    text = str(text.decode("utf-8"))

    # covert to lowercase
    if lower:
        text = text.lower()
        
    # remove handles 
    if remove_handles:
        text = re.sub(r'@[a-zA-Z0-9]{1,15}', '', text)
        
    # remove hashtags
    if remove_hashtags:
        text = re.sub(r'\B#\w*[a-zA-Z]+\w*', '', text)
        
    # remove url 
    if remove_url:
        text = re.sub(r'http\S+', '', text)
    
    # remove punctuations
    if remove_punctuations:
        text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
    
    # remove numbers
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    
    # remove unnecessary new lines and whitespaces
    text = text.replace("\n", "") 
    text = ' '.join(text.split())
    
    # tokenize
    text_words = nltk.word_tokenize(text)

    # remove stop words
    if remove_stopwords:
        stop = set(stopwords.words('english'))
        # remove words from the predefined stopwords set
        if not keep_stopwords is None and len(keep_stopwords) != 0:
            for word in keep_stopwords:
                stop.discard(word)
        # add additional words to the stopwords set
        if not additional_stopwords is None and len(additional_stopwords) != 0:
            for word in additional_stopwords:
                # convert the additional stopwords to lowercase if lower is set to True
                if lower:
                    word = word.lower()
                stop.add(word)
        # finally, remove the stopwords from the tweets
        text_words = [x for x in text_words if not x.lower() in stop]
        
    # lemmatize
    if lemmatize:
        wordnet_lemmatizer = WordNetLemmatizer()
        text_words = [wordnet_lemmatizer.lemmatize(x, pos="v") for x in text_words]
    
    return ' '.join(text_words)

In [15]:
trump_tweets['cleaned_text'] = trump_tweets['text'].apply(lambda x: text_processing(x))
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,cleaned_text
69,Twitter for iPhone,https://t.co/HUSFkHqsyC,2020-09-29 23:57:50,75761,321378,False,1311153253472636928,[],[],
70,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[],[],regardless youre pull think agree joe rogan wo...
71,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[],[],rig election
72,Twitter for iPhone,https://t.co/58ssX7EfUj,2020-09-29 22:35:11,20658,65014,False,1311132452853706752,[],[],
73,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,[#MakeAmericaGreatAgain],[],volunteer trump election poll watcher sign tod...


### Extract meta information from text data

The purpose of feature engineering is to extract more information from the text data and use the extracted information as features. Such text/NLP based features could include:
<br>
* Word Count – total number of words in the documents
* Character Count – total number of characters in the documents
* Average Word Density – average length of the words used in the documents
* Punctuation Frequency – total number of punctuation marks / total number of words in the documents
* Upper Case Frequency – total number of upper case words / total number of words in the documents
<br>

Since length of a tweet varies, frequency is used instead of absolute count for Puncutation and Upper Case metrics.

Handles, hashtags, numbers, stopwords are retained from the original tweet, whereas urls are removed. No lemmatization is performed.

In [16]:
# define a function to generate the new features
def add_text_features(df):
    
    text_for_feature_extraction = trump_tweets['text'].apply(lambda x: text_processing(x, 
                                                                                       lower=False,  
                                                                                       remove_handles=False,
                                                                                       remove_hashtags=False,
                                                                                       remove_url=True,
                                                                                       remove_punctuations=False, 
                                                                                       remove_numbers=False,
                                                                                       remove_stopwords=False,
                                                                                       additional_stopwords=None,
                                                                                       keep_stopwords=None,
                                                                                       lemmatize=False)
                                                            )
    df['word_count'] = text_for_feature_extraction.apply(lambda x: len(x.split()))
    df['char_count'] = text_for_feature_extraction.apply(len)
    df['word_density'] = df['char_count'] / df['word_count']
    df['punctuation_freq'] = text_for_feature_extraction.apply(lambda x: 
                                                               len("".join(_ for _ in x if _ in string.punctuation))
                                                              ) / df['word_count']
    df['upper_case_freq'] = text_for_feature_extraction.apply(lambda x: 
                                                              len([wrd for wrd in x.split() if wrd.isupper()])
                                                             ) / df['word_count']
    return df

In [17]:
trump_tweets = add_text_features(trump_tweets)
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,cleaned_text,word_count,char_count,word_density,punctuation_freq,upper_case_freq
69,Twitter for iPhone,https://t.co/HUSFkHqsyC,2020-09-29 23:57:50,75761,321378,False,1311153253472636928,[],[],,0,0,,,
70,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[],[],regardless youre pull think agree joe rogan wo...,28,139,4.964286,0.071429,0.071429
71,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[],[],rig election,7,40,5.714286,0.428571,0.142857
72,Twitter for iPhone,https://t.co/58ssX7EfUj,2020-09-29 22:35:11,20658,65014,False,1311132452853706752,[],[],,0,0,,,
73,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,[#MakeAmericaGreatAgain],[],volunteer trump election poll watcher sign tod...,15,87,5.8,0.2,0.0


### reset index

In [18]:
trump_tweets = trump_tweets.reset_index().drop(columns=['index'])
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,cleaned_text,word_count,char_count,word_density,punctuation_freq,upper_case_freq
0,Twitter for iPhone,https://t.co/HUSFkHqsyC,2020-09-29 23:57:50,75761,321378,False,1311153253472636928,[],[],,0,0,,,
1,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[],[],regardless youre pull think agree joe rogan wo...,28,139,4.964286,0.071429,0.071429
2,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[],[],rig election,7,40,5.714286,0.428571,0.142857
3,Twitter for iPhone,https://t.co/58ssX7EfUj,2020-09-29 22:35:11,20658,65014,False,1311132452853706752,[],[],,0,0,,,
4,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,[#MakeAmericaGreatAgain],[],volunteer trump election poll watcher sign tod...,15,87,5.8,0.2,0.0


### Adjust column order
shift `cleaned_text` to just after the `text` column

In [19]:
col_names = trump_tweets.columns.tolist()
col_names = col_names[:2] + col_names[9:10] + col_names[2:9] + col_names[10:]
trump_tweets = trump_tweets[col_names]
trump_tweets.head()

Unnamed: 0,source,text,cleaned_text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,word_count,char_count,word_density,punctuation_freq,upper_case_freq
0,Twitter for iPhone,https://t.co/HUSFkHqsyC,,2020-09-29 23:57:50,75761,321378,False,1311153253472636928,[],[],0,0,,,
1,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,regardless youre pull think agree joe rogan wo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[],[],28,139,4.964286,0.071429,0.071429
2,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,rig election,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[],[],7,40,5.714286,0.428571,0.142857
3,Twitter for iPhone,https://t.co/58ssX7EfUj,,2020-09-29 22:35:11,20658,65014,False,1311132452853706752,[],[],0,0,,,
4,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,volunteer trump election poll watcher sign tod...,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,[#MakeAmericaGreatAgain],[],15,87,5.8,0.2,0.0


## Join tweets with stock prices
Preprocessing to generate the joined dataset with trump's tweets and log returns

### Map tweets to the correct market dates
If a tweet was posted during non-trading hours, it is mapped to the date of the next trading day.

In [20]:
from datetime import *
import numpy as np

In [21]:
#creating a new column indicate the date that the tweet was posted
trump_tweets['created_at'] = pd.to_datetime(trump_tweets['created_at'])
trump_tweets['created_date'] = trump_tweets['created_at'].dt.date
trump_tweets['created_time'] = trump_tweets['created_at'].dt.time

In [22]:
#create a column of indicating if the tweets time passes the closing time of the stock
closingTime = pd.to_datetime("2020-01-01 16:00:00").time()
trump_tweets['passed_closing'] = np.where(trump_tweets['created_time'] >= closingTime, 1, 0)
trump_tweets.head()

Unnamed: 0,source,text,cleaned_text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,word_count,char_count,word_density,punctuation_freq,upper_case_freq,created_date,created_time,passed_closing
0,Twitter for iPhone,https://t.co/HUSFkHqsyC,,2020-09-29 23:57:50,75761,321378,False,1311153253472636928,[],[],0,0,,,,2020-09-29,23:57:50,1
1,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,regardless youre pull think agree joe rogan wo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[],[],28,139,4.964286,0.071429,0.071429,2020-09-29,22:51:43,1
2,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,rig election,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[],[],7,40,5.714286,0.428571,0.142857,2020-09-29,22:37:44,1
3,Twitter for iPhone,https://t.co/58ssX7EfUj,,2020-09-29 22:35:11,20658,65014,False,1311132452853706752,[],[],0,0,,,,2020-09-29,22:35:11,1
4,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,volunteer trump election poll watcher sign tod...,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,[#MakeAmericaGreatAgain],[],15,87,5.8,0.2,0.0,2020-09-29,22:30:39,1


In [23]:
# if it passes 4:30, then the date will plus 1
trump_tweets['Date'] = ''
for i in range(len(trump_tweets)):
    if (trump_tweets['passed_closing'].iloc[i] == 1):
        trump_tweets['Date'].iloc[i] = trump_tweets['created_date'].iloc[i] + timedelta(days=1)
    else:
        trump_tweets['Date'].iloc[i] = trump_tweets['created_date'].iloc[i]
trump_tweets['Date'] = pd.to_datetime(trump_tweets['Date'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [24]:
trump_tweets.head()

Unnamed: 0,source,text,cleaned_text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,word_count,char_count,word_density,punctuation_freq,upper_case_freq,created_date,created_time,passed_closing,Date
0,Twitter for iPhone,https://t.co/HUSFkHqsyC,,2020-09-29 23:57:50,75761,321378,False,1311153253472636928,[],[],0,0,,,,2020-09-29,23:57:50,1,2020-09-30
1,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,regardless youre pull think agree joe rogan wo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[],[],28,139,4.964286,0.071429,0.071429,2020-09-29,22:51:43,1,2020-09-30
2,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,rig election,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[],[],7,40,5.714286,0.428571,0.142857,2020-09-29,22:37:44,1,2020-09-30
3,Twitter for iPhone,https://t.co/58ssX7EfUj,,2020-09-29 22:35:11,20658,65014,False,1311132452853706752,[],[],0,0,,,,2020-09-29,22:35:11,1,2020-09-30
4,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,volunteer trump election poll watcher sign tod...,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,[#MakeAmericaGreatAgain],[],15,87,5.8,0.2,0.0,2020-09-29,22:30:39,1,2020-09-30


### Obtain log return of selected market indices

In [25]:
from pandas_datareader.data import DataReader

In [26]:
def get_data_for_multiple_stocks(tickers, start_date, end_date):
    '''
    Obtain stocks information (Date, OHLC, Volume and Adjusted Close).
    Uses Pandas DataReader to make an API Call to Yahoo Finance and download the data directly.
    Computes other values - Log Return and Arithmetic Return.
    
    Input: List of Stock Tickers
    Output: A dictionary of dataframes for each stock
    '''
    # read in stock data
    s = DataReader(tickers[0], 'yahoo', start_date, end_date)[["Adj Close"]]
    # get log returns
    s[tickers[0]] = np.log(s['Adj Close']/s['Adj Close'].shift(1))
    
    stocks = s[[tickers[0]]]
    
    for ticker in tickers[1:]:
        s = DataReader(ticker, 'yahoo', start_date, end_date)
        s[ticker] = np.log(s['Adj Close']/s['Adj Close'].shift(1))
        stocks[ticker] = s[ticker]
        
    # skip first row that will be na, and fillna by 0 incase there are trading halts on specific days
    return stocks.iloc[1:].fillna(0)

In [27]:
log_returns = get_data_for_multiple_stocks(["^GSPC","VGT"], "2016-01-01", "2020-09-30")
log_returns.index.name = 'Date'
log_returns.reset_index(inplace=True)
log_returns['Date'] = pd.to_datetime(log_returns['Date'])

In [28]:
# join the two dataset to detect null values
joined = trump_tweets.merge(log_returns, on='Date', how='left')

In [29]:
# a function to find the next nearest date
def nearestDate(base, df):
    df = df[(df['Date'] > base)]
    nearness = { abs(base.timestamp() - date.timestamp()) : date for date in df['Date']}
    return nearness[min(nearness.keys())]
#nearestDate(datetime(2020,1,7),log_returns)

In [30]:
# change the null value date with the next available date in the log return
for i in range(len(joined)):
    if (np.isnan(joined['^GSPC'].iloc[i])):
        the_date = joined['Date'].iloc[i]
        #print(the_date)
        the_date = pd.to_datetime(the_date)
        n = nearestDate(the_date,log_returns)
        #print('changed to:')
        #print(n)
        joined['Date'].iloc[i] = n

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [31]:
# merge the dataset again
joined = joined.drop(['^GSPC', 'VGT'], axis=1)
new_joined = joined.merge(log_returns, on='Date', how='left')

In [32]:
new_joined.head()
# We can drop the columns that we dont need. 

Unnamed: 0,source,text,cleaned_text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,...,char_count,word_density,punctuation_freq,upper_case_freq,created_date,created_time,passed_closing,Date,^GSPC,VGT
0,Twitter for iPhone,https://t.co/HUSFkHqsyC,,2020-09-29 23:57:50,75761,321378,False,1311153253472636928,[],[],...,0,,,,2020-09-29,23:57:50,1,2020-09-30,0.00822,0.007412
1,Twitter for iPhone,RT @TyCardon: Regardless who you’re pulling fo...,regardless youre pull think agree joe rogan wo...,2020-09-29 22:51:43,30926,0,True,1311136617327984640,[],[],...,139,4.964286,0.071429,0.071429,2020-09-29,22:51:43,1,2020-09-30,0.00822,0.007412
2,Twitter for iPhone,RT @realDonaldTrump: Rigged Election!,rig election,2020-09-29 22:37:44,66647,0,True,1311133095651717123,[],[],...,40,5.714286,0.428571,0.142857,2020-09-29,22:37:44,1,2020-09-30,0.00822,0.007412
3,Twitter for iPhone,https://t.co/58ssX7EfUj,,2020-09-29 22:35:11,20658,65014,False,1311132452853706752,[],[],...,0,,,,2020-09-29,22:35:11,1,2020-09-30,0.00822,0.007412
4,Twitter for iPhone,Volunteer to be a Trump Election Poll Watcher....,volunteer trump election poll watcher sign tod...,2020-09-29 22:30:39,14609,48540,False,1311131311965306885,[#MakeAmericaGreatAgain],[],...,87,5.8,0.2,0.0,2020-09-29,22:30:39,1,2020-09-30,0.00822,0.007412


In [33]:
# rename Date to market_date
new_joined = new_joined.rename(columns={'Date':'market_date'})
new_joined.columns

Index(['source', 'text', 'cleaned_text', 'created_at', 'retweet_count',
       'favorite_count', 'is_retweet', 'id_str', 'hashtag', 'mention',
       'word_count', 'char_count', 'word_density', 'punctuation_freq',
       'upper_case_freq', 'created_date', 'created_time', 'passed_closing',
       'market_date', '^GSPC', 'VGT'],
      dtype='object')

## save cleaned data

In [34]:
new_joined.to_csv('../data/tweets&logreturns.csv', index=False)