# Tweet sentiment analysis

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import datetime

## Get dataframe from csv file

In [None]:
# Load the scraped tweets for the chosen day.
# Forward slashes keep the relative path valid on Windows, Linux and macOS alike;
# a backslash-separated path only resolves on Windows.
# To analyse the file scraped today instead, use:
# tweets_today = pd.read_csv("../datasets/raw_data/tweets{}.csv".format(datetime.date.today()))
tweets_today = pd.read_csv("../datasets/raw_data/tweets2019-10-08.csv")
# Work on the first 5000 rows only
tweets_today = tweets_today.head(5000)
# Raw tweet texts, taken from the "tweet" column
tweets_today_text = tweets_today["tweet"]

## Data cleaning

In [None]:
# Preprocessing of tweet texts
def format_tweet(tweet):
    """Normalise one raw tweet into lower-case ASCII alphanumeric words.

    Processing order:
      1. ``<br/>`` becomes a space and ``&amp`` becomes ``&``. This happens
         first because the punctuation stripping below would otherwise remove
         ``&``, ``<``, ``/`` and ``>`` before the markup could be recognised
         (leaving stray "amp" / "br" fragments in the text).
      2. The text is split on whitespace and each word is cleaned on its own:
         URLs and every character outside ``0-9A-Za-z`` are blanked out, a
         single letter isolated between two blanks is dropped, the remaining
         fragments are glued back together and the word is lower-cased.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Every cleaned word prefixed by one space, concatenated in order. The
        result therefore starts with a space, and a word that cleans to nothing
        (for example a URL) leaves two consecutive spaces. A tweet without any
        word yields the empty string.
    """
    # Markup first -- see step 1 of the docstring.
    text = tweet.replace('<br/>', ' ').replace('&amp', '&')

    cleaned_words = []
    for word in text.split():
        # Blank out URLs and anything that is not an ASCII letter or digit
        processed_word = re.sub(r'([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ' ', word)
        # Drop a single letter left isolated between two blanks
        processed_word = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_word)
        # Close the blanks introduced above so the word stays a single token
        processed_word = re.sub(r'\s+', '', processed_word)
        cleaned_words.append(processed_word.lower())

    # Build the result in one pass instead of repeated string concatenation
    return "".join(" " + word for word in cleaned_words)
    
# Clean every tweet, keeping exactly one cleaned text per row of `tweets_today`.
# Empty results are deliberately NOT filtered out: the labelling cell zips this list
# positionally with the `tweets_today` columns, so removing entries would attach the
# scores to the wrong tweets and silently truncate the resulting dataframe.
tweets_today_text = [format_tweet(tweet) for tweet in tweets_today_text]
assert len(tweets_today_text) == len(tweets_today), \
    "cleaned texts must stay aligned with the rows of tweets_today"
    


## Labeling tweets with NLTK vader

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the NLTK resource named 'vader_lexicon'
# (presumably the lexicon SentimentIntensityAnalyzer relies on -- confirm against the NLTK docs).
nltk.download('vader_lexicon')

In [None]:
# Analyzer instance reused by the labelling cells further down
sid = SentimentIntensityAnalyzer()

# Preview: print cleaned tweets 5 to 9 together with their full score mapping
for tweet in tweets_today_text[5:10]:
    scores = sid.polarity_scores(tweet)
    print("{0}: \n{1}\n\n".format(tweet, scores))

In [None]:
def get_sentiment(polarity):
    """Translate a polarity score into a sentiment class name.

    Returns 'neutral' for a score equal to zero, 'positive' for a score
    greater than zero and 'negative' for anything else.
    """
    if polarity == 0:
        return 'neutral'
    return 'positive' if polarity > 0 else 'negative'

# Compound score of every cleaned tweet, rounded to two decimals
polarity_tweets = [round(scores["compound"], 2)
                   for scores in map(sid.polarity_scores, tweets_today_text)]
# Sentiment class derived from each score
sentiment_tweets = list(map(get_sentiment, polarity_tweets))

# Pair each class/score with the original tweet text, its author and its creation date
zipped_list = list(zip(
    sentiment_tweets,
    polarity_tweets,
    tweets_today['tweet'],
    tweets_today['username'],
    tweets_today['creation date'],
))

# Store to new dataframe
sentiment_df = pd.DataFrame(
    zipped_list,
    columns=["Sentiment", "Polarity", "Tweet", "Username", "Creation Date"],
)
sentiment_df.tail(20)

In [None]:
# Occurrences of each distinct polarity value (not referenced again in this cell)
sentiment_count = round(sentiment_df["Polarity"].value_counts(), 2)

# Sentiment class of every row, re-derived from its polarity score
sentiments = list(map(get_sentiment, sentiment_df["Polarity"]))

labels = ["positive", "neutral", "negative"]
colors = ["green", "yellow", "red"]

# Number of tweets per class, in the same order as `labels`
sentiment_count_list = np.array(list(map(sentiments.count, labels)))

# Pie chart of the class shares
fig1, ax1 = plt.subplots()
ax1.pie(
    sentiment_count_list,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax1.axis('equal')
plt.title("Distribution of sentiment values")
plt.show()

# Absolute totals per class
print("total positive: {}".format(sentiments.count('positive')))
print("total neutral: {}".format(sentiments.count('neutral')))
print("total negative: {}".format(sentiments.count('negative')))

Using this NLTK VADER implementation for sentiment analysis, we get entirely different results. There are far fewer neutral tweets, and more than half of the tweets now have a negative sentiment assigned to them.

In [None]:
# Histogram of the polarity scores, grouped into five bins
polarity_ax = sentiment_df["Polarity"].hist(bins=5)
polarity_ax.set_title("Distribution of sentiment polarities")
polarity_ax.set_xlabel("Sentiment polarities")
polarity_ax.set_ylabel("Amount of tweets")
plt.show()

## Implementing machine learning algorithms

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk

### Divide into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Model input: the "Tweet" column; target: the "Sentiment" column
features, labels = sentiment_df["Tweet"], sentiment_df["Sentiment"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=1)

# Sizes of the two partitions, one per line
print(len(X_train), len(X_test), sep="\n")

### Preparing pipeline variables

In [None]:
# Download the NLTK resources named 'stopwords' and 'wordnet'
nltk.download('stopwords')
nltk.download('wordnet')

# Vectorizer instance handed to every pipeline below
tfidf = TfidfVectorizer()
# English stop-word list. It is read through `nltk.corpus.stopwords` rather than through
# the imported name `stopwords`, because this assignment rebinds `stopwords` to a plain
# list: going through the imported name would fail with AttributeError as soon as the
# cell is executed a second time.
stopwords = nltk.corpus.stopwords.words("english")

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens

# One stemmer instance shared by every call of tokenizer_porter
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of every piece."""
    return list(map(porter.stem, text.split()))

### Implementing Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# TF-IDF features followed by a multinomial naive Bayes classifier
mnb_pipeline = Pipeline([('vect', tfidf), ('clf', MultinomialNB())])

# Candidate settings; the `vect__` / `clf__` prefix routes each value to the matching
# pipeline step.
param_grid_mnb = {
                  'vect__use_idf': [True, False],
                  'vect__ngram_range': [(1,2)],
                  'vect__stop_words': [stopwords, None],
                  'vect__tokenizer': [tokenizer, tokenizer_porter],
                  'clf__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
                  'clf__fit_prior': [True, False]
                 }
# random_state fixes which 80 candidates are sampled, so a Restart & Run All
# reproduces the same search instead of drawing a new random subset each time.
grid_mnb = RandomizedSearchCV(mnb_pipeline, param_grid_mnb, n_iter=80, cv=5,
                              verbose=2, n_jobs=-1, random_state=1)
grid_mnb.fit(X_train, y_train)

In [None]:
# Winning parameter combination of the naive Bayes search
print("Best parameters: ", grid_mnb.best_params_)
# Its cross-validation score, expressed as a percentage
best_cv_pct_mnb = grid_mnb.best_score_*100
print("Best cross-validation score: {:.2f}%".format(best_cv_pct_mnb))

In [None]:
# Take the best pipeline of the search, fit it on the training split
# and predict the held-out tweets
mnb = grid_mnb.best_estimator_
predictions_mnb = mnb.fit(X_train, y_train).predict(X_test)

# Confusion matrix first, then the per-class report
print(confusion_matrix(y_test, predictions_mnb),
      classification_report(y_test, predictions_mnb),
      sep="\n")

### Implementing Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# notices about failed or non-converged fits -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# TF-IDF features followed by a logistic regression classifier
lr_pipeline = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=1))])

# Candidate settings; the `vect__` / `clf__` prefix routes each value to the matching
# pipeline step.
param_grid_lr ={
                'vect__use_idf': [True, False],
                'vect__ngram_range': [(1,1),(1,2)],
                'vect__stop_words': [stopwords, None],
                'vect__tokenizer': [tokenizer, tokenizer_porter],
                # Only the primal formulation is searched: the dual one is implemented
                # solely for the liblinear solver with an l2 penalty, and no solver is
                # selected here, so sampling dual=True makes the fit fail whenever the
                # library's default solver is a different one.
                'clf__dual': [False],
                'clf__tol': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2 ],
                'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'clf__fit_intercept': [True, False],
                'clf__max_iter' : [100, 110, 120, 130, 140],
                'clf__warm_start': [True, False]
              }
# random_state fixes which 500 candidates are sampled so the search is reproducible.
grid_lr = RandomizedSearchCV(lr_pipeline, param_grid_lr, n_iter=500, verbose=2, cv=5,
                             n_jobs=-1, random_state=1)
grid_lr.fit(X_train, y_train)

In [None]:
# Winning parameter combination of the logistic regression search
print("Best parameters: ", grid_lr.best_params_)
# Its cross-validation score, expressed as a percentage
best_cv_pct_lr = grid_lr.best_score_ *100
print("Best cross-validation score: {:.2f}%".format(best_cv_pct_lr))

In [None]:
# Take the best pipeline of the search, fit it on the training split
# and predict the held-out tweets
lr = grid_lr.best_estimator_
predictions_lr = lr.fit(X_train, y_train).predict(X_test)

# Confusion matrix first, then the per-class report
print(confusion_matrix(y_test, predictions_lr),
      classification_report(y_test, predictions_lr),
      sep="\n")

### Implementing Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# TF-IDF features followed by a support vector classifier
svc_pipeline = Pipeline([('vect', tfidf), ('clf', SVC(random_state=1))])

# Candidate settings; the `vect__` / `clf__` prefix routes each value to the matching
# pipeline step.
param_grid_svm = {
                  'vect__use_idf': [True, False],
                  'vect__ngram_range': [(1,2)],
                  'vect__stop_words': [stopwords, None],
                  'vect__tokenizer': [tokenizer, tokenizer_porter],
                  'clf__C': [0.1, 1, 10, 100],
                  'clf__kernel': ['linear', 'rbf', 'poly'],
                  'clf__gamma': [0.1, 1, 10, 100],
                  'clf__degree': [0, 1, 2, 3, 4, 5, 6]
                 }

# random_state fixes which 50 candidates are sampled so the search is reproducible.
grid_svc = RandomizedSearchCV(svc_pipeline, param_grid_svm, n_iter=50, cv=3,
                              verbose=2, n_jobs=-1, random_state=1)
grid_svc.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid_svc.best_score_))
print("Best parameters: ", grid_svc.best_params_)
print("Best estimator: ", grid_svc.best_estimator_)

In [None]:
# Take the best pipeline of the search and fit it on the training split
svm = grid_svc.best_estimator_
svm_fit = svm.fit(X_train, y_train)

# Predictions and score on the held-out tweets
predictions_svm = svm.predict(X_test)
score_svm = svm.score(X_test, y_test)

# Confusion matrix, per-class report, then the overall score as a percentage
print(confusion_matrix(y_test, predictions_svm),
      classification_report(y_test, predictions_svm),
      sep="\n")
print("The algorithm has reached an accuracy of: {:.2f}%".format(score_svm*100))

### Implementing Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# TF-IDF features followed by a random forest classifier
rf_pipeline = Pipeline([('vect', tfidf), ('clf', RandomForestClassifier(random_state=1))])

# Every key carries the `clf__` prefix: the search sets parameters on the Pipeline, and
# an un-prefixed name such as 'n_estimators' is not a Pipeline parameter, so the search
# would abort with an "invalid parameter" error before fitting anything.
param_grid_rf = {'clf__n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200],
                 # 'auto' was only an alias of 'sqrt' for classifiers and is no longer
                 # accepted by recent scikit-learn releases; 'log2' gives the search a
                 # second, genuinely different option.
                 'clf__max_features': ['sqrt', 'log2'],
                 # Integer depths 1..32 (np.linspace would produce floats, which are
                 # not valid depth values).
                 'clf__max_depth': list(range(1, 33)),
                 'clf__min_samples_split': [2, 5, 10],
                 'clf__min_samples_leaf': [1, 2, 4],
                 'clf__bootstrap': [True, False]}

# random_state fixes which 1000 candidates are sampled so the search is reproducible.
grid_rf = RandomizedSearchCV(rf_pipeline, param_grid_rf, cv=3, n_iter=1000,
                             verbose=2, n_jobs=-1, random_state=1)

grid_rf.fit(X_train, y_train)

print("Best cross-validation score: {:.2f}".format(grid_rf.best_score_))
print("Best parameters: ", grid_rf.best_params_)
print("Best estimator: ", grid_rf.best_estimator_)

In [None]:
# Take the best pipeline of the search, fit it on the training split
# and predict the held-out tweets
rf = grid_rf.best_estimator_
predictions_rf = rf.fit(X_train, y_train).predict(X_test)

# Confusion matrix, per-class report, then the accuracy as a percentage
print(confusion_matrix(y_test, predictions_rf),
      classification_report(y_test, predictions_rf),
      sep="\n")
accuracy_rf_pct = accuracy_score(y_test, predictions_rf)*100
print("The algorithm has reached an accuracy of: {:.2f}%".format(accuracy_rf_pct))

In [None]:
# Display the fitted random forest pipeline's score on the held-out split
rf.score(X_test, y_test)