In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the model
# files read later in this notebook live under that mount point.
# force_remount=True asks Colab to mount again even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import datetime
import os
import re

import pandas as pd
import tweepy

# UPDATE: company to analyse. The value is used verbatim as the query string
# of the tweet search.
COMPANY = "apple"

def _clean_tweet_text(text_series):
  '''Strip Twitter-specific noise from a Series of raw tweet texts.

  Removes URLs, @mentions, the retweet marker "RT" and '#' characters
  (the hashtag word itself is kept).
  INPUT: pandas Series of strings
  OUTPUT: pandas Series of cleaned strings (same index)'''
  # URLs first: they may themselves contain '@' or '#'.
  text_series = text_series.str.replace(r'https?://\S+', '', regex=True)
  # The range must use a plain ASCII hyphen; with an en dash inside the
  # character class the digits 1-8 are not matched. '_' is allowed in handles.
  text_series = text_series.str.replace(r'@[A-Za-z0-9_]+', '', regex=True)
  # \b keeps words that merely end in "RT" (e.g. "SMART phone") intact.
  text_series = text_series.str.replace(r'\bRT\s+', '', regex=True)
  text_series = text_series.str.replace('#', '', regex=False)
  return text_series


def scrape(company=None, days=3, max_tweets=100):
  '''This function scrapes twitter data which mentions the selected company.
  It gets the data for the past `days` days and returns a dataframe.
  INPUT: company    - search query; defaults to the module-level COMPANY
         days       - size of the look-back window in days (default 3)
         max_tweets - maximum number of tweets requested from the search (default 100)
  OUTPUT: dataframe with text from twitter mentioning the selected company. One row per tweet.
  RAISES: RuntimeError when one of the Twitter credentials is not set.

  Credentials are read from the environment variables TWITTER_CONSUMER_KEY,
  TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_TOKEN_SECRET.'''

  # Secrets must never live in the notebook itself: they end up in version
  # control and in every shared copy. Keys that were ever committed should be revoked.
  credential_vars = ('TWITTER_CONSUMER_KEY', 'TWITTER_CONSUMER_SECRET',
                     'TWITTER_ACCESS_TOKEN', 'TWITTER_ACCESS_TOKEN_SECRET')
  missing = [name for name in credential_vars if not os.environ.get(name)]
  if missing:
    raise RuntimeError('Missing Twitter API credentials; set the environment variable(s): '
                       + ', '.join(missing))
  consumer_key, consumer_secret, access_token, access_token_secret = (
      os.environ[name] for name in credential_vars)

  if company is None:
    company = COMPANY

  auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
  auth.set_access_token(access_token, access_token_secret)
  api = tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)

  #Dates for scraping tweets: from `days` days ago until now, as naive UTC.
  # NOTE(review): tweepy 3.x appears to report created_at as naive UTC - confirm
  # for the installed version; timezone-aware values are normalised below.
  endDate = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
  startDate = endDate - datetime.timedelta(days=days)

  #Scrape the data
  text = []
  for tweet in tweepy.Cursor(api.search, q=company, lang="en").items(max_tweets):
    created_at = tweet.created_at
    if created_at.tzinfo is not None:
      created_at = created_at.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    #keep only the tweets inside the look-back window
    if startDate <= created_at <= endDate:
      text.append(tweet.text)

  #dump into a panda; the explicit dtype keeps the .str accessor usable
  #even when no tweet matched and the list is empty
  text_series = pd.Series(text, dtype='object')

  #Perform basic cleaning: remove URLs, @mentions, the RT marker and '#'
  text_series = _clean_tweet_text(text_series)

  #Return in the form of a dataframe
  text_df = pd.DataFrame({'text': text_series})

  return text_df

In [None]:
# TEXT CLEANING
# Everything matched by this pattern is replaced by a space in preprocess():
# @mentions, http/https links and any run of non-alphanumeric characters.
# Written as a raw string so that "\S" is not parsed as a (invalid) string escape;
# the resulting pattern is character-for-character the same as before.
# NOTE(review): the "http?:\S" alternative looks like a typo of the link rule next
# to it. It is left in place because the pattern presumably has to match the
# cleaning used when the model was trained - confirm before changing it.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import re

def preprocess(text, stem=False):
  '''This function does all the necessary preprocessing and cleaning of the texts
  so that they can be fed to the nlp for sentiment analysis.

  The text is converted to a lower-case string, every match of TEXT_CLEANING_RE
  is replaced by a space, English stop words are dropped and, optionally, the
  remaining tokens are stemmed.
  INPUT: text - a text (converted with str() first)
         stem - when True, each kept token is reduced to its Snowball stem
  OUTPUT:  the cleaned text (string), tokens joined by single spaces'''

  # A set gives constant-time membership tests; the NLTK corpus returns a list.
  stop_words = set(stopwords.words("english"))
  # The stemmer is only needed (and only built) when stemming was requested.
  stemmer = SnowballStemmer("english") if stem else None

  # Remove link,user and special characters
  text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()

  tokens = [token for token in text.split() if token not in stop_words]
  if stem:
    tokens = [stemmer.stem(token) for token in tokens]
  return " ".join(tokens)

In [None]:
# IMPORT
# File names of the saved artefacts. Only KERAS_MODEL and TOKENIZER_MODEL are
# referenced by the cells shown in this notebook.
KERAS_MODEL, WORD2VEC_MODEL = "model.h5", "model.w2v"
TOKENIZER_MODEL, ENCODER_MODEL = "tokenizer.pkl", "encoder.pkl"

# SENTIMENT
# Label names and the (lower, upper) score cut-offs meant for labelling:
# score <= lower is negative, score >= upper is positive, anything between is neutral.
POSITIVE, NEGATIVE, NEUTRAL = "POSITIVE", "NEGATIVE", "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# KERAS
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
# Length every tokenised tweet is padded/truncated to (passed as maxlen to pad_sequences).
# NOTE(review): presumably has to equal the sequence length the model was trained with - confirm.
SEQUENCE_LENGTH = 300

# UTILITY
import time
import pickle

# nltk
import nltk

# Download the NLTK stop-word corpus used during preprocessing
nltk.download('stopwords')

# UPDATE: this is the path where the models are stored.
# File names are appended to it by plain string concatenation, so keep the trailing '/'.
path = "/content/drive/Shared drives/NLP sentiment for stocks/model/"

def nlp():
  '''This function calls the scraper function, which collects recent tweets containing the selected company name.
  It performs a sentiment analysis of each tweet and gives the average score over all of them.
  INPUT: NONE
  OUTPUT: average model score over the scraped tweets (float);
          NaN when the scraper returned no tweets.'''

  # Loads the saved tokenizer and keras model
  def load_models():
    '''Return (tokenizer, model) loaded from the directory given by `path`.'''
    # Load the tokenizer. The with-block closes the file even if unpickling fails.
    # NOTE(security): pickle.load can execute code embedded in the file -
    # only load tokenizer files from a trusted location.
    with open(path + TOKENIZER_MODEL, 'rb') as file:
      tokenizer = pickle.load(file)
    # Load the model
    model = load_model(path + KERAS_MODEL)
    # Check its architecture
    model.summary()

    return tokenizer, model

  # Predicts the sentiment value for a batch of tweets
  def predict(texts):
    '''Return one model score (float) per text in `texts`.'''
    # Tokenize and pad all texts at once so the model is invoked a single time
    x_test = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=SEQUENCE_LENGTH)
    # One output row per input text; the first element of each row is the score
    return [float(row[0]) for row in model.predict(x_test)]

  # Loading the models.
  tokenizer, model = load_models()

  # Getting the texts
  all_text = scrape()

  # Preprocessing data (kept in a new list; the scraped frame is left untouched)
  cleaned_texts = [preprocess(text) for text in all_text["text"]]

  # Nothing scraped: there is no average to compute
  if not cleaned_texts:
    return float("nan")

  # Calculating the average score for the sentiment over all scraped tweets
  scores = predict(cleaned_texts)
  return sum(scores) / len(scores)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Run the full pipeline (load models, scrape, preprocess, score) and show the result.
sentiment_score = nlp()
print(sentiment_score)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          87125700  
_________________________________________________________________
dropout (Dropout)            (None, 300, 300)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               160400    
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 87,286,201
Trainable params: 160,501
Non-trainable params: 87,125,700
_________________________________________________________________
1
1
0.874923825263977
