# Tweet sentiment analysis

In [None]:
import json
import re
import time
from datetime import datetime

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import tweepy
from textblob import TextBlob

## 1. Setting up dataframes + preprocessing

### Loading the Twitter API credentials

In [None]:
# Read the Twitter API keys/tokens from a local JSON file so that no secret
# is hard-coded in the notebook itself.
with open("twitter_credentials.json", "r") as credentials_file:
    creds = json.load(credentials_file)

### Helper functions

In [None]:
# Preprocessing of tweet texts
def format_tweet(tweet):
    """Return a cleaned, lower-cased version of a tweet's text.

    Parameters
    ----------
    tweet : object
        Any object exposing the tweet text as a ``full_text`` string
        attribute (e.g. a tweepy status fetched with
        ``tweet_mode='extended'``).

    Returns
    -------
    str or None
        ``None`` when the tweet is a retweet (contains ``RT @``); otherwise
        the cleaned text. The cleaned text can be an empty string when
        nothing but URLs/punctuation/single letters was present, so callers
        should treat any falsy result as "skip this tweet".
    """
    text = tweet.full_text

    # Skip retweets. The previous test (`"RT @" or "RT" not in text`) was
    # always true because the non-empty literal "RT @" is truthy. The word
    # boundary avoids dropping tweets that merely contain e.g. "ALERT @user".
    if re.search(r'\bRT @', text):
        return None

    # The cleaning now runs on the whole text instead of word by word: on a
    # single whitespace-free word the whitespace-based steps could never
    # match anything.

    # Remove URLs (scheme://...) before punctuation is stripped.
    text = re.sub(r'\w+://\S+', ' ', text)

    # Drop every character that is not an ASCII letter, digit or whitespace
    # ("don't" -> "dont", "#heatwave" -> "heatwave").
    text = re.sub(r'[^0-9A-Za-z\s]', '', text)

    # Remove stand-alone single letters, including at the start/end of text.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Collapse runs of whitespace into one space and trim both ends.
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()
    

# Handles limit exception from twitter API
def limit_handler(cursor: "tweepy.cursor.ItemIterator"):
    """Yield items from a tweepy iterator, pausing when the rate limit is hit.

    Parameters
    ----------
    cursor :
        An object with a ``next()`` method that returns the next item and
        raises ``StopIteration`` when exhausted (in this notebook: the result
        of ``tweepy.Cursor(...).items(n)``).

    Yields
    ------
    The items produced by ``cursor.next()``, in order.

    Notes
    -----
    On ``tweepy.RateLimitError`` the error is printed and the generator
    sleeps for 15 minutes before retrying. This relies on the module-level
    ``import time``.
    """
    while True:
        try:
            yield cursor.next()

        except StopIteration:
            # Underlying iterator is exhausted: end this generator cleanly.
            return

        except tweepy.RateLimitError as error:
            # Print the caught instance (the original printed the class).
            print(error)
            time.sleep(15 * 60)


### Getting tweets from the Twitter API and saving them to a CSV file

In [None]:
# Authenticates and connects to API
# Uses the four values loaded into `creds` from twitter_credentials.json:
# the consumer key/secret pair builds the OAuth handler, the access
# token/secret pair is then attached to it.
auth = tweepy.OAuthHandler(creds['CONSUMER_KEY'], creds['CONSUMER_SECRET'])
auth.set_access_token(creds['ACCESS_TOKEN'], creds['ACCESS_SECRET'])
# `api` is the module-level client used by store_tweets_to_csv below.
# The two wait_on_rate_limit* flags are passed through to tweepy.API;
# presumably they make tweepy wait (and log) when rate-limited — TODO confirm
# against the installed tweepy version.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

def store_tweets_to_csv(start_date, end_date, output_path=None):
    """Fetch #heatwave tweets for a date range, clean them and save them as CSV.

    Parameters
    ----------
    start_date, end_date : str
        Dates formatted as ``'YYYY-MM-DD'``; passed to the search as
        ``since`` / ``until``. The month and day of ``start_date`` are also
        used to name the output file.
    output_path : str, optional
        Where to write the CSV. Defaults to the original hard-coded location
        ``D:\\dev\\python\\Climate-Perception\\datasets\\tweets_week<day>-<month>``.

    Returns
    -------
    pandas.DataFrame
        One row per kept tweet with columns ``tweet``, ``username`` and
        ``creation date``. Empty when nothing was found; in that case no
        file is written.
    """
    # Exclude retweets with the `-filter:retweets` search operator
    # (the original `-filter=retweets` is not a valid operator).
    query = "#heatwave -filter:retweets"
    search_results = limit_handler(
        tweepy.Cursor(
            api.search,
            q=query,
            lang="en",
            since=start_date,
            until=end_date,
            tweet_mode='extended',
        ).items(5000)
    )

    # Build each row from ONE tweet so text, user name and creation date stay
    # aligned. Previously the texts were filtered after formatting while the
    # user names and dates came from the unfiltered list, so zip() paired
    # values from different tweets whenever a tweet was dropped.
    rows = []
    for tweet in search_results:
        if tweet is None:
            continue
        text = format_tweet(tweet)
        if text:  # skips None and empty cleaned texts
            rows.append((text, tweet.user.name, tweet.created_at))

    df = pd.DataFrame(rows, columns=["tweet", "username", "creation date"])

    date_parts = start_date.split('-')
    month = date_parts[1]
    day = date_parts[2]

    if output_path is None:
        output_path = r'D:\dev\python\Climate-Perception\datasets\tweets_week{}-{}'.format(day, month)

    if df.shape[0] == 0:
        print("There has been an error. Dataframe tweets_week{}-{} is empty".format(day, month))
    else:
        # Write first, report afterwards, so the message is only shown when
        # the file was actually stored.
        df.to_csv(output_path)
        print("The dataframe tweets_week{}-{} has been stored in the datasets folder".format(day, month))

    return df
                                                                                                               
                                                                                                                                          
# Can't get tweets from longer than a week ago without premium twitter api,
# so the yearly pulls below are kept disabled.
# NOTE(review): `store_tweets_per_year` is not defined anywhere in the visible
# notebook — these lines look like leftovers from an earlier version.
#store_tweets_per_year('2016-01-01', '2016-12-31')
#store_tweets_per_year('2017-01-01', '2017-12-31')
#store_tweets_per_year('2018-01-01', '2018-12-31')

# Fetch and store the tweets for the week of 16-22 September 2019.
store_tweets_to_csv('2019-09-16', '2019-09-22')

                                                                                                                            

## 2. Sentiment analysis with TextBlob

In [None]:
# Load the tweets saved by store_tweets_to_csv('2019-09-16', ...). The lists
# built inside that function are local to it, so they are not available here
# and have to be read back from the stored file.
# NOTE(review): this mirrors the hard-coded location the collection cell
# writes to — adjust if the dataset lives elsewhere.
TWEETS_CSV_PATH = r'D:\dev\python\Climate-Perception\datasets\tweets_week16-09'
stored_tweets = pd.read_csv(TWEETS_CSV_PATH, index_col=0).dropna(subset=["tweet"])

tweets = stored_tweets["tweet"].astype(str).tolist()
usernames = stored_tweets["username"].tolist()
creation_dates = stored_tweets["creation date"].tolist()

# TextBlob polarity is a float in [-1, 1]; rounded to 2 decimals so that
# tweets can be grouped by polarity value in the plot below.
textblob_tweets = [TextBlob(tweet) for tweet in tweets]
sentiment_tweets = [round(blob.sentiment.polarity, 2) for blob in textblob_tweets]
zipped_list = list(zip(creation_dates, tweets, sentiment_tweets, usernames))

sentiment_df = pd.DataFrame(zipped_list, columns=["Creation Date", "tweet", "Sentiment", "Username"])
sentiment_df.head()

In [None]:
# Number of tweets per (rounded) polarity value, ordered from most negative
# to most positive. The polarity column of sentiment_df is named "Sentiment".
sentiment_count = sentiment_df["Sentiment"].value_counts().sort_index()
x = sentiment_count.index
y = sentiment_count.values

# Plain matplotlib is used because seaborn is not imported in this notebook.
fig, ax = plt.subplots(figsize=(20, 8))
positions = np.arange(len(x))  # one evenly spaced bar per polarity value
ax.bar(positions, y, alpha=0.8)
ax.set_xticks(positions)
ax.set_xticklabels(x, rotation=65)

# Let matplotlib pick a readable number of whole-number ticks; a fixed step
# of 2 produces thousands of ticks once there are thousands of tweets.
ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))

ax.set_ylabel("Amount of tweets")
ax.set_xlabel("Sentiment polarity")
ax.set_title("Tweets per TextBlob sentiment polarity")
fig.tight_layout()

plt.show()

As you can see, this sentiment analysis is very basic and not very accurate.
It suggests that most people who tweet about heatwaves have a positive or neutral sentiment about them.

* sentiment polarity < 0 &nbsp;&nbsp;&nbsp; => negative 
* sentiment polarity > 0 &nbsp;&nbsp;&nbsp; => positive 
* sentiment polarity = 0 &nbsp;&nbsp;&nbsp; => neutral

## 3. Sentiment analysis with scikit-learn (Random Forest)

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

### Lemmatization (reduces words to dictionary root form)

In [None]:
# WordNetLemmatizer looks words up in the WordNet corpus, which has to be
# downloaded once (only 'stopwords' is downloaded in the import cell).
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Lemmatize every cleaned tweet word by word and join the words back into
# one string per tweet. The cleaned texts are the "tweet" column of
# sentiment_df (the previously referenced `tweet_texts` was never defined).
documents = [
    ' '.join(lemmatizer.lemmatize(word) for word in tweet.split())
    for tweet in sentiment_df["tweet"]
]

### Vectorize words, filter stopwords and initialize training and testing sets

In [None]:
# tf-idf = term frequency - inverse document frequency: weighs a word by how
# often it occurs in a tweet, scaled down by how many tweets contain it.
vectorizer = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.80, stop_words=stopwords.words('english'))
processed_features = vectorizer.fit_transform(documents).toarray()

# Features: one tf-idf row per tweet. (The original `X, y = processed_features`
# tried to unpack the ROWS of this matrix into X and y, which fails for any
# corpus that does not contain exactly two documents.)
X = processed_features

# Labels: -1 = negative, 0 = neutral, 1 = positive, from the sign of the
# TextBlob polarity.
# NOTE(review): the notebook has no hand-labelled data, so these labels are
# derived from TextBlob; a model trained on them can only learn to imitate
# TextBlob. Replace with real labels when available.
y = np.sign(sentiment_df["Sentiment"].to_numpy()).astype(int)

if len(X) != len(y):
    raise ValueError(
        "Feature rows ({}) and sentiment labels ({}) differ in length".format(len(X), len(y))
    )

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
