# Keys

In [1]:
# Read API secrets from twitter_secrets.cfg

from configparser import ConfigParser
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list and fail early with a clear
# message instead of a later, less obvious NoSectionError.
CONFIG_FILE = "twitter_secrets.cfg"

parser = ConfigParser()
if not parser.read(CONFIG_FILE):
  raise FileNotFoundError(
      "Could not read %r; it must provide a [twitter] section with "
      "access_token, access_token_secret, consumer_key and consumer_secret."
      % CONFIG_FILE)

# Credentials consumed by TwitterAuthenticator further down.
ACCESS_TOKEN = parser.get("twitter", "access_token")
ACCESS_TOKEN_SECRET = parser.get("twitter", "access_token_secret")
CONSUMER_KEY = parser.get("twitter", "consumer_key")
CONSUMER_SECRET = parser.get("twitter", "consumer_secret")

# Imports

In [2]:
!pip install pytorch_pretrained_bert
!pip install bertModel 

Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 926 kB/s 
Collecting torch>=0.4.1
  Downloading torch-1.10.0-cp38-none-macosx_10_9_x86_64.whl (147.1 MB)
[K     |████████████████████████████████| 147.1 MB 26 kB/s 
Collecting boto3
  Downloading boto3-1.19.12-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 12.5 MB/s 
[?25hCollecting botocore<1.23.0,>=1.22.12
  Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)
[K     |████████████████████████████████| 8.1 MB 20.7 MB/s 
Collecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 10.2 MB/s 
[?25hCollecting regex
  Downloading regex-2021.11.2-cp38-cp38-macosx_10_9_x86_64.whl (288 kB)
[K     |████████████████████████████████| 288

In [3]:
from tweepy.streaming import StreamListener
from tweepy import API, Cursor, Stream, OAuthHandler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from textblob import TextBlob
import re
import datetime
from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup
# Module-level tokenizer instance; TweetAnalyzer.clean_tweet uses it to
# re-tokenize cleaned text before joining the words back together.
tok = WordPunctTokenizer()
# Pattern for @mentions.
pat1 = r'@[A-Za-z0-9]+'
# Pattern for http/https links.
pat2 = r'https?://[A-Za-z0-9./]+'
# Alternation of both patterns; TweetAnalyzer.clean_tweet strips every match.
combined_pat = f"{pat1}|{pat2}"

ModuleNotFoundError: No module named 'tweepy'

In [None]:
# Colab only: mount Google Drive at /content/drive. The vocabulary and model
# weight paths configured later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')
import torch
import torch.nn.functional as F
from pytorch_pretrained_bert import BertTokenizer
from bertModel import BertClassification

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


ModuleNotFoundError: ignored

# Pre-trained FinBERT

In [None]:
# Maps the model's output index to a sentiment label (see sa()).
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}
num_labels = len(labels)
vocab = "finance-uncased"
vocab_path = '/content/drive/MyDrive/analyst_tone/vocab'
pretrained_weights_path = "/content/drive/MyDrive/analyst_tone/pretrained_weights" # this is pre-trained FinBERT weights
fine_tuned_weight_path = "/content/drive/MyDrive/analyst_tone/fine_tuned.pth"      # this is fine-tuned FinBERT weights
# Every input is truncated/padded to exactly this many tokens.
max_seq_length = 512
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on machines (or Colab runtimes) without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# Build the classifier on top of the pre-trained weights, then load the
# fine-tuned parameters on top of it.
model = BertClassification(weight_path=pretrained_weights_path, num_labels=num_labels, vocab=vocab)
# Map the checkpoint onto the configured device instead of a hard-coded
# 'cuda:0', so loading follows whatever `device` is set to.
fine_tuned_state = torch.load(fine_tuned_weight_path, map_location=device)
model.load_state_dict(fine_tuned_state)
model.to(device)
# Switch to inference mode; the notebook only predicts, it never trains.
model.eval()

In [None]:
# Tokenizer built from the vocabulary file configured above.
tokenizer = BertTokenizer(
    vocab_file=vocab_path,
    do_lower_case=True,
    do_basic_tokenize=True,
)

# A few hand-written example sentences.
sentences = [
    "There is a shortage of capital, and we need extra financing",
    "Growth is strong and we have plenty of liquidity.",
    "There are doubts about our finances.",
    "Facebook is going down bad.",
]


In [None]:
def sa(sent):
  """Return the sentiment label the module-level `model` predicts for `sent`.

  The text is tokenized with the module-level `tokenizer`, truncated to
  `max_seq_length` tokens and right-padded with zeros to exactly that length.
  The index of the largest softmax output is looked up in `labels`.

  NOTE(review): no [CLS]/[SEP] tokens are added here - confirm this matches
  how the model was fine-tuned.
  """
  tokens = tokenizer.tokenize(sent)[:max_seq_length]
  token_ids = tokenizer.convert_tokens_to_ids(tokens)

  pad = [0] * (max_seq_length - len(token_ids))
  padded_ids = token_ids + pad
  padded_mask = [1] * len(token_ids) + pad   # 1 for real tokens, 0 for padding
  segment_ids = [0] * max_seq_length

  def as_batch(values):
    # One row of max_seq_length entries, placed on the configured device.
    return torch.tensor(values).to(device).reshape(-1, max_seq_length)

  input_ids = as_batch(padded_ids)
  attention_mask = as_batch(padded_mask)
  token_type_ids = as_batch(segment_ids)

  # Inference only: no gradients needed.
  with torch.no_grad():
    scores = F.softmax(model(input_ids, token_type_ids, attention_mask), dim=1)
    # Index order follows `labels`: neutral, positive, negative.
    return labels[torch.argmax(scores).item()]

# Classes

## TwitterAuthenticator Class

In [None]:
class TwitterAuthenticator():
  """Builds Tweepy OAuth handlers from the module-level API credentials."""

  def authenticate_twitter_app(self):
    """Return an OAuthHandler carrying the consumer key pair and the access token pair."""
    handler = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    handler.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    return handler

## TwitterListener Class

In [None]:
class TwitterListener(StreamListener): # Inherits from StreamListener Class
  '''
  Listener that prints each received tweet to standard output and appends it
  to the file named by `fetched_tweets_filename`.
  '''

  def __init__(self, fetched_tweets_filename):
    # Run the base-class initialiser so any state it sets up exists.
    super().__init__()
    self.fetched_tweets_filename = fetched_tweets_filename

  def on_data(self, data):
    '''
    Echo the raw payload and append it to the output file.

    Always returns True; a failure to print or write is reported and skipped
    rather than aborting.
    '''
    try:
      print(data)
      with open(self.fetched_tweets_filename, 'a') as tf:
        tf.write(data)
    except Exception as e:
      # Exception rather than BaseException, so Ctrl-C (KeyboardInterrupt)
      # is not swallowed here.
      print("Error on_data %s" % str(e))
    # Was `return true`, which raised NameError on the error path.
    return True

  def on_error(self, status):
    '''
    Return False on status 420 (rate limiting); otherwise print the status
    and return True.
    '''
    if status == 420:
      # Case rate limit occurs
      # NOTE(review): presumably False tells Tweepy to disconnect - confirm
      # against the Tweepy streaming documentation.
      return False
    print(status)
    return True

## TweetAnalyzer Class

In [None]:
class TweetAnalyzer():
  """Helpers for tabulating tweets, cleaning their text and scoring sentiment."""

  def tweets_to_df(self, tweets):
    """Build a DataFrame with one row per tweet object.

    Each tweet must expose full_text, id, created_at, source, favorite_count
    and retweet_count. Columns: tweets, id, len, date, source, likes, retweets.
    """
    df = pd.DataFrame(data=[tweet.full_text for tweet in tweets], columns=["tweets"])
    df['id'] = np.array([tweet.id for tweet in tweets])
    df['len'] = np.array([len(tweet.full_text) for tweet in tweets])
    df['date'] = np.array([tweet.created_at for tweet in tweets])
    df['source'] = np.array([tweet.source for tweet in tweets])
    df['likes'] = np.array([tweet.favorite_count for tweet in tweets])
    df['retweets'] = np.array([tweet.retweet_count for tweet in tweets])
    return df

  def relating(self, t):
    """Project a table of raw tweet records onto this notebook's column names.

    `t` must be indexable by 'id', 'text', 'created_at', 'source',
    'favorite_count' and 'retweet_count'. Returns a DataFrame with columns
    id, tweets, date, source, likes, retweets.
    """
    # Start from a real DataFrame. The original started from the Series
    # t['id'], so the assignments below set Series elements rather than
    # adding columns.
    df2 = pd.DataFrame({'id': t['id']})
    df2['tweets'] = np.array(t['text'])
    df2['date'] = np.array(t['created_at'])
    df2['source'] = np.array(t['source'])
    df2['likes'] = np.array(t['favorite_count'])
    df2['retweets'] = np.array(t['retweet_count'])
    return df2

  def clean_tweet(self, text):
    """Return `text` lower-cased and reduced to space-separated letter-only words.

    Links, @mentions and markup are removed, every non-letter character
    becomes a separator, and runs of whitespace are collapsed.
    """
    # Replace line breaks and non-breaking spaces with a space rather than
    # deleting them. Deleting them glued neighbouring words together and let
    # the URL pattern below swallow the word that followed a link.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub('\xa0', ' ', text)
    souped = BeautifulSoup(text, 'lxml').get_text()
    stripped = re.sub(combined_pat, '', souped)
    # The old `.decode("utf-8-sig")` step was a Python 2 leftover that always
    # failed on str and was hidden by a bare `except`. Replacement characters
    # it targeted are non-letters, so the next substitution removes them.
    letters_only = re.sub("[^a-zA-Z]", " ", stripped)
    lower_case = letters_only.lower()
    # The substitution above leaves unnecessary whitespace behind; tokenizing
    # and re-joining collapses it.
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

  def analyze_sentiment(self, tweet):
    """Clean `tweet` and return the label produced by the module-level sa()."""
    return sa(self.clean_tweet(tweet))
    

## TwitterClient Class

In [None]:
class TwitterClient():
  """Wrapper around an authenticated Tweepy API object."""

  # Defaults used by get_user_timeline_tweets when the caller passes no query.
  DEFAULT_SEARCH_TERM = '$INTC AND (buy OR sell) AND Intel -filter:retweets'
  DEFAULT_SINCE = '2021-07-28'

  def __init__(self, twitter_user=None):
    """Authenticate and remember `twitter_user` (None when no user is specified)."""
    self.auth = TwitterAuthenticator().authenticate_twitter_app()
    self.twitter_client = API(self.auth)

    self.twitter_user = twitter_user

  def get_user_timeline_tweets(self, num_tweets, search_term=DEFAULT_SEARCH_TERM, since=DEFAULT_SINCE):
    """Return up to `num_tweets` English tweets matching `search_term`.

    Despite its name, this method runs a search query; it does not read a
    user's timeline. `since` is forwarded to the search call as the earliest date.
    """
    # Use this instance's own API handle. The original referenced a global
    # `api` that only exists after the main cell has run.
    cursor = Cursor(self.twitter_client.search, q=search_term, lang='en',
                    since=since, tweet_mode='extended')
    return list(cursor.items(num_tweets))

  def get_home_timeline_tweets(self, num_tweets):
    """Return up to `num_tweets` tweets from the home timeline.

    NOTE(review): `id=self.twitter_user` is forwarded as-is; confirm that the
    home_timeline endpoint honours it.
    """
    cursor = Cursor(self.twitter_client.home_timeline, id=self.twitter_user)
    return list(cursor.items(num_tweets))

  def get_twitter_client_api(self):
    """Return the underlying Tweepy API object."""
    return self.twitter_client

## TwitterStreamer Class

In [None]:
class TwitterStreamer():
  '''
  Connects to the Twitter Streaming API and hands incoming tweets to a
  TwitterListener.
  '''

  def __init__(self):
    self.twitter_authenticator = TwitterAuthenticator()

  def stream_tweets(self, fetched_tweets_filename, hash_tag_list):
    '''
    Stream tweets matching any entry of `hash_tag_list`; the listener writes
    them to `fetched_tweets_filename`.
    '''
    tweet_sink = TwitterListener(fetched_tweets_filename)
    credentials = self.twitter_authenticator.authenticate_twitter_app()

    # Example filter: track=['donald trump', 'hillary clinton', 'bernie sanders', 'barack obama']
    Stream(credentials, tweet_sink).filter(track=hash_tag_list)

# Main

In [None]:
if __name__ == "__main__":
  # Authenticated client plus its raw Tweepy API handle.
  # Alternative: TwitterClient('TheRealPatD123') targets a specific account.
  twitter_client = TwitterClient()
  api = twitter_client.get_twitter_client_api()

  # Fetch up to 200 tweets and tabulate them.
  # Alternative: tweets = api.user_timeline(screen_name='HillaryClinton', count=2)
  tweets = twitter_client.get_user_timeline_tweets(200)
  tweet_analyzer = TweetAnalyzer()
  df = tweet_analyzer.tweets_to_df(tweets)

  # Optional time-series view of engagement:
  #   pd.Series(df['likes'].values, index=df['date']).plot(figsize=(16, 4), label='likes', legend=True)
  #   pd.Series(df['retweets'].values, index=df['date']).plot(figsize=(16, 4), label='retweets', legend=True)
  #   plt.show()

  # Sentiment analysis: one label per tweet, in the same order as the rows of df.
  df['sentiment'] = list(
      map(tweet_analyzer.analyze_sentiment, (status.full_text for status in tweets))
  )
  print(df)

  # Optional live streaming of tweets matching a keyword list:
  #   TwitterStreamer().stream_tweets("tweets.json", ['donald trump'])
  

In [None]:
from statistics import median

# Overwrite the 'date' column with Matplotlib date numbers (via date2num)
# so the plotting cell below can put it on the x axis.
df['date'] = matplotlib.dates.date2num(df['date'])

In [None]:
# Print the sentiment of every row, one per line.
for sentiment in df['sentiment']:
  print(sentiment)

In [None]:
# Numeric score for each label that sa() can return; neutral is 0.
SENTIMENT_SCORES = {'negative': -1, 'neutral': 0, 'positive': 1}

# Convert labels to scores. Values that are already numeric pass through
# unchanged, so re-running this cell does not wipe the column.
sentiment_scores = pd.to_numeric(
    df['sentiment'].map(lambda value: SENTIMENT_SCORES.get(value, value)),
    errors='coerce',
)
scored = df.assign(sentiment=sentiment_scores)

# Drop neutral tweets (score 0) and anything that could not be scored. The
# original compared the string labels with 0, so nothing was ever dropped.
scored = scored[scored['sentiment'].notna() & (scored['sentiment'] != 0)]

# Collapse rows sharing the same date value into one row (the last one in
# date order) carrying the group's median score. This replaces a loop that
# dropped rows by label while indexing by position and assigned through
# chained indexing, which had no effect.
scored = scored.sort_values(by=['date'], kind='stable')
median_by_date = scored.groupby('date')['sentiment'].transform('median')
df = scored.assign(sentiment=median_by_date).drop_duplicates(subset='date', keep='last')

fig, ax = plt.subplots()
# Interpret x values as dates (the 'date' column holds Matplotlib date
# numbers after the date2num cell); replaces the deprecated plt.plot_date.
ax.xaxis_date()
ax.plot(df['date'], df['sentiment'], 'o')
fig.autofmt_xdate()
# plt.xticks(ticks=np.arange(datetime(2021, 7,27), datetime(2021, 8,3), timedelta(days = 2)))
ax.minorticks_on()
ax.set(xlabel='date', ylabel='median sentiment score',
       title='Median tweet sentiment by date')
plt.show()


In [None]:
# Display the current contents of `df`.
df

In [None]:
# Print each row's position followed by its tweet text.
for position, tweet_text in enumerate(df['tweets']):
  print(position)
  print(tweet_text)

In [None]:
#g = pd.read_json('tweets.json', lines=True)
#h = relating(g)
#print(h)
#h['sentiment'] = [tweet_analyzer.analyze_sentiment(g.iloc[i]['text']) for i in range(0, g.shape[0])]




In [None]:
# List the attribute names available on a tweet object (user, text, place, retweet count, etc.).
print(dir(tweets[0]))