
# Data Processing and Indexing 
---

In [2]:
import pandas as pd
import re
import string
import math

---
### Importing the Dataset
---

In [3]:
# Dataset source: https://www.kaggle.com/davidwallach/financial-tweets

# A few rows of the export contain more than the 8 expected fields; skip them
# (with a warning) instead of aborting the load.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is the equivalent (skip the row and report it).
try:
    df = pd.read_csv('stockerbot-export.csv', on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`; fall back to the legacy flag.
    df = pd.read_csv('stockerbot-export.csv', error_bad_lines=False)
df.head()

b'Skipping line 731: expected 8 fields, saw 13\nSkipping line 2836: expected 8 fields, saw 15\nSkipping line 3058: expected 8 fields, saw 12\nSkipping line 3113: expected 8 fields, saw 12\nSkipping line 3194: expected 8 fields, saw 17\nSkipping line 3205: expected 8 fields, saw 17\nSkipping line 3255: expected 8 fields, saw 17\nSkipping line 3520: expected 8 fields, saw 17\nSkipping line 4078: expected 8 fields, saw 17\nSkipping line 4087: expected 8 fields, saw 17\nSkipping line 4088: expected 8 fields, saw 17\nSkipping line 4499: expected 8 fields, saw 12\n'


Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified
0,1019696670777503700,VIDEO: “I was in my office. I was minding my o...,Wed Jul 18 21:33:26 +0000 2018,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True
1,1019709091038548000,The price of lumber $LB_F is down 22% since hi...,Wed Jul 18 22:22:47 +0000 2018,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True
2,1019711413798035500,Who says the American Dream is dead? https://t...,Wed Jul 18 22:32:01 +0000 2018,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True
3,1019716662587740200,Barry Silbert is extremely optimistic on bitco...,Wed Jul 18 22:52:52 +0000 2018,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587...,True
4,1019718460287389700,How satellites avoid attacks and space junk wh...,Wed Jul 18 23:00:01 +0000 2018,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True


---
### Text Processing
---

In [4]:
pip install emoji

Note: you may need to restart the kernel to use updated packages.


In [5]:
import emoji

def remove_emojis(text: str) -> str:
    """Return ``text`` with emoji removed.

    Works across the differing layouts of the ``emoji`` package:

    * releases that provide ``emoji.replace_emoji`` (``UNICODE_EMOJI`` was
      removed in emoji 2.0): delegate to it, which also strips
      multi-code-point emoji sequences;
    * emoji 1.x, where ``UNICODE_EMOJI`` is keyed by language code
      (``'en'``, ``'es'``, ...): look characters up in the English table;
    * older releases, where ``UNICODE_EMOJI`` maps emoji directly to names.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without emoji characters.
    """
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')

    emoji_table = emoji.UNICODE_EMOJI
    # In the per-language layout the top-level keys are language codes, so a
    # membership test against it would never match an emoji character.
    if 'en' in emoji_table:
        emoji_table = emoji_table['en']
    return ''.join(c for c in text if c not in emoji_table)
#Removes the emojis, but I noticed that the ranked scores decreased without the emojis

# emoji_pattern = re.compile("["
#                        u"\U0001F600-\U0001F64F"  # emoticons
#                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                        u"\U00002702-\U000027B0"
#                        u"\U000024C2-\U0001F251"
#                        "]+", flags=re.UNICODE)  

# def remove_emoji(string):
#     return emoji_pattern.sub(r'', string)

In [6]:
# Regex clean-up steps applied, in this order, to each lower-cased tweet.
# Each entry is (compiled pattern, replacement).
_CLEANUP_STEPS = [
    (re.compile(r'https?:\/\/\S+'), ''),          # http(s) URLs
    # Bare ".com" domains, with or without a leading "www.". The optional
    # prefix is part of the match so that "www.example.com" does not leave a
    # dangling "www." token behind.
    (re.compile(r'(?:www\.)?[a-z]+\.com'), ''),
    (re.compile(r'\{link\}'), ''),                # literal "{link}" placeholders
    (re.compile(r'\[video\]'), ''),               # literal "[video]" markers
    (re.compile(r'&[a-z]+;'), ''),                # HTML entities such as &amp;
    (re.compile(r'[@$#]+'), ''),                  # mention / cashtag / hashtag sigils
    (re.compile(r'[^\x00-\x7F]+'), ' '),          # any non-ASCII run (e.g. Chinese characters)
]


def clean_tweet(text: str) -> str:
    """Lower-case a tweet and strip URLs, markup, sigils and non-ASCII text.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text, ready for vectorization.
    """
    cleaned = text.lower()
    for pattern, replacement in _CLEANUP_STEPS:
        cleaned = pattern.sub(replacement, cleaned)
    # Kept for parity with the original pipeline; after the non-ASCII filter
    # above there should be little, if anything, left for it to remove.
    return remove_emojis(cleaned)


# Work on a copy so the raw frame `df` stays untouched for later lookups.
df2 = df.copy(deep=True)
# Missing tweet text is treated as an empty document instead of raising a
# TypeError inside the regex substitutions.
df2['tokens'] = df2['text'].fillna('').apply(clean_tweet)


In [7]:
# Preview the first rows to compare the raw `text` column with the cleaned `tokens` column.
df2.head()

Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified,tokens
0,1019696670777503700,VIDEO: “I was in my office. I was minding my o...,Wed Jul 18 21:33:26 +0000 2018,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True,video: i was in my office. i was minding my o...
1,1019709091038548000,The price of lumber $LB_F is down 22% since hi...,Wed Jul 18 22:22:47 +0000 2018,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True,the price of lumber lb_f is down 22% since hit...
2,1019711413798035500,Who says the American Dream is dead? https://t...,Wed Jul 18 22:32:01 +0000 2018,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True,who says the american dream is dead?
3,1019716662587740200,Barry Silbert is extremely optimistic on bitco...,Wed Jul 18 22:52:52 +0000 2018,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587...,True,barry silbert is extremely optimistic on bitco...
4,1019718460287389700,How satellites avoid attacks and space junk wh...,Wed Jul 18 23:00:01 +0000 2018,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True,how satellites avoid attacks and space junk wh...


---
### Tweet Indexing
---

In [16]:
# Cleaned tweets as a plain Python list, one string per document.
documents = df2['tokens'].tolist()


In [9]:
# Tokenize documents (tweets) on whitespace.
# `split()` with no argument collapses runs of whitespace, so the gaps left by
# the clean-up step do not produce empty-string tokens and tabs/newlines do not
# stay glued to the neighbouring word (both happen with `split(' ')`).
document_terms = [doc.split() for doc in documents]


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Term-count vectorizer configured with the built-in 'english' stop-word list.
vectorizer = CountVectorizer(stop_words='english')
# Fit on the cleaned tweets and build the document-term count matrix in one step.
documents_vectorized = vectorizer.fit_transform(documents)
# The result is a sparse matrix representation of the data. It is easy to turn into a
# NumPy array with .toarray(); however, avoid doing so unless you have to, as it is costly.

In [11]:
# Terms learned by the vectorizer, in column order of `documents_vectorized`.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()

In [12]:
# Document-term counts as a DataFrame with one column per vocabulary term, so that
# terms can be selected by name in the queries below.
# NOTE: .toarray() materialises every zero entry, so memory use grows with
# (number of tweets x vocabulary size).
vocab = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
vocab

Unnamed: 0,00,000,0000,0000011,0000073124,000011,0000111,00001940,0000215457,00002251,...,zuckerman,zuo,zur,zusch_,zyb,zyme,zyne,zynga,zytiga,zzc
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28259,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28260,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28262,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Free-text query and its individual terms.
q = 'investment banking'
# Derive the terms from the query so the two cannot drift apart; lower-cased
# because the tweets were lower-cased during cleaning.
q_terms = q.lower().split()

In [14]:
# AND query (disabled): first tweet that contains every query term
# documents[df[(vocab[q_terms] > 0).sum(axis=1) == len(q_terms)].index[0]]

In [15]:
# OR query: keep every tweet in which at least one query term occurs.
df[vocab[q_terms].gt(0).any(axis=1)]

Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified
31,1016090469292564500,RT @dailypoliticaln: Fluor Co. $NEW $FLR Stock...,Sun Jul 08 22:43:40 +0000 2018,Riskographer2,FLR,Fluor Corporation,http://www.dailypolitical.com/?p=2188895,False
106,1016206633050009600,Apartment Investment and Management’s $AIV “Ho...,Mon Jul 09 06:25:16 +0000 2018,dailypoliticaln,AIV,Apartment Investment and Management Company,http://www.dailypolitical.com/?p=2190015,False
110,1016209116161806300,The Western Union $WU Stock Rating Upgraded by...,Mon Jul 09 06:35:08 +0000 2018,dailypoliticaln,WU,The Western Union Company,http://www.dailypolitical.com/?p=2190028,False
111,1016209268800938000,The Western Union $WU Stock Rating Upgraded by...,Mon Jul 09 06:35:44 +0000 2018,ZolmaxNews,WU,The Western Union Company,http://zolmax.com/?p=2370057,False
133,1016222817749266400,Zacks Investment Research Upgrades Torchmark $...,Mon Jul 09 07:29:34 +0000 2018,TickerReport,TMK,Torchmark Corporation,http://tickerreport.com/?p=3623782,False
...,...,...,...,...,...,...,...,...
27174,1019708742521229300,Quarterly Earnings @Microsoft $MSFT LIVE Webc...,Wed Jul 18 22:21:24 +0000 2018,DrivingTheDay,MSFT,Microsoft Corporation,https://www.domainmondo.com/2018/07/microsoft-...,False
27503,1019715498773057500,$KMI #Kinder Morgan Kinder Keeps Dividend Grow...,Wed Jul 18 22:48:14 +0000 2018,ResearchPool,KMI,Kinder Morgan,https://twitter.com/i/web/status/1019715498773...,False
27796,1019722075479527400,$KMI #Kinder Morgan Morningstar | Kinder Keeps...,Wed Jul 18 23:14:22 +0000 2018,ResearchPool,KMI,Kinder Morgan,https://twitter.com/i/web/status/1019722075479...,False
28047,1019726863294292000,$OKE #ONEOK Inc. ONEOK Inc.: Update to credit ...,Wed Jul 18 23:33:24 +0000 2018,ResearchPool,OKE,ONEOK,https://twitter.com/i/web/status/1019726863294...,False
