# Tweet sentiment analysis

In [None]:
from textblob import TextBlob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import datetime

## 1. Get dataframe from CSV file

In [None]:
# Location of the tweet export and the number of rows to analyse.
# Change these two constants to point the notebook at another export
# (for example a file named after datetime.date.today()) or another sample size.
TWEETS_CSV_PATH = r"C:\stage\project\datasets\tweets2019-09-24.csv"
MAX_TWEETS = 5000

# Read the CSV file and keep only the first MAX_TWEETS rows
tweets_today = pd.read_csv(TWEETS_CSV_PATH).head(MAX_TWEETS)
# Tweet texts only; this is the input of both sentiment labellers
tweets_today_text = tweets_today["tweet"]

## 2. Labeling data with TextBlob

In [None]:
def get_sentiment(polarity):
    """Translate a numeric polarity score into a sentiment label.

    A strictly positive score yields 'positive', a score equal to zero yields
    'neutral', and any other value yields 'negative'.
    """
    if polarity > 0:
        return 'positive'
    return 'neutral' if polarity == 0 else 'negative'


# Wrap every tweet in a TextBlob, then read its polarity (rounded to two decimals)
textblob_tweets = list(map(TextBlob, tweets_today_text))
polarity_tweets_textblob = [
    round(blob.sentiment.polarity, 2) for blob in textblob_tweets
]
# Turn each polarity into its 'positive' / 'neutral' / 'negative' label
sentiment_tweets_textblob = list(map(get_sentiment, polarity_tweets_textblob))

# One tuple per tweet: (label, polarity, creation date, text, author)
zipped_list_textblob = list(
    zip(
        sentiment_tweets_textblob,
        polarity_tweets_textblob,
        tweets_today['creation date'],
        tweets_today['tweet'],
        tweets_today['username'],
    )
)

# Collect the labelled tweets in a new dataframe
sentiment_df_textblob = pd.DataFrame(
    zipped_list_textblob,
    columns=["Sentiment", "Polarity", "Creation Date", "Tweet", "Username"],
)

sentiment_df_textblob.tail()

In [None]:
# Print the last 20 labelled tweets in a readable form for a manual spot check
test_df = sentiment_df_textblob[["Tweet","Polarity", "Sentiment"]].tail(20)
for tweet_text, polarity, sentiment in test_df.itertuples(index=False, name=None):
    print("Tweet text: "+tweet_text)
    print("Polarity of the tweet: "+str(polarity))
    print("Sentiment of the tweet: "+str(sentiment))
    print("\n")

In [None]:
# How often each distinct polarity value occurs
sentiment_count = sentiment_df_textblob["Polarity"].value_counts()

# Sentiment label of every tweet, derived from its TextBlob polarity
sentiments = [get_sentiment(polarity) for polarity in sentiment_df_textblob["Polarity"]]

labels = ["positive", "neutral", "negative"]
colors = ["green", "yellow", "red"]
# One count per label, in the same order as `labels` so slices and labels line up
sentiment_count_list = np.array([sentiments.count(label) for label in labels])

fig1, ax1 = plt.subplots()
ax1.pie(sentiment_count_list, labels=labels, shadow=True, autopct='%1.1f%%', startangle=90, colors=colors)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title("Distribution of sentiment values")
plt.show()
print("total positive: {}".format(sentiments.count('positive')))
print("total neutral: {}".format(sentiments.count('neutral')))
print("total negative: {}".format(sentiments.count('negative')))

In [None]:
# Histogram of the TextBlob polarity scores, with a title and axis labels so the
# figure can be read on its own
sentiment_df_textblob["Polarity"].hist(bins=100)
plt.title("Distribution of sentiment polarities (TextBlob)")
plt.xlabel("Sentiment polarities")
plt.ylabel("Amount")
plt.show()

## 3. Implementing machine learning algorithms with TextBlob-labeled data

In [None]:
import nltk
# Corpora needed by the preprocessing cells: the stop-word list and the
# WordNet data that WordNetLemmatizer looks words up in
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

### Lemmatization (reduces words to dictionary root form) + filtering stop words

In [None]:
# Lemmatizer that reduces each token to its dictionary form.
# It keeps the name `stemmer` because later cells refer to it by that name.
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()

# Classification targets: the TextBlob-derived sentiment label of every tweet
labels = sentiment_df_textblob["Sentiment"].values
# Collects one preprocessed string per tweet
documents = list()

# Build the lookup once instead of calling stopwords.words() for every token;
# a frozenset also gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def filter_stopwords(word):
    """Return True when `word` should be kept, i.e. it is not an English stop word.

    The comparison is case-insensitive so capitalised stop words ("The", "And")
    are dropped as well; the NLTK stop-word list itself is lowercase.
    """
    return word.lower() not in _ENGLISH_STOPWORDS

# Extract the tweet column from the TextBlob-labeled dataframe
sentiment_df_text = sentiment_df_textblob["Tweet"]

for tweet in sentiment_df_text:
    # Split on whitespace, lemmatize every token, drop stop words, re-join
    lemmas = (stemmer.lemmatize(word) for word in tweet.split())
    documents.append(' '.join(filter(filter_stopwords, lemmas)))

# Show one sample document (only when that many tweets were loaded) and the total count
if len(documents) > 555:
    print(documents[555])
print(len(documents))

### Vectorize words, filter stopwords

In [None]:
# TF-IDF = term frequency, inverse document frequency.
# Keep at most 3000 terms that appear in at least 5 documents and in at most 80% of them.
vectorizer = TfidfVectorizer(max_features=3000, min_df=5, max_df=0.80)
processed_features = vectorizer.fit_transform(documents).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

processed_features

### Divide into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    processed_features, labels, test_size=0.2, random_state=1
)
# Sizes of the training and the test partition, one per line
print(len(X_train), len(X_test), sep="\n")

### Implementing Random Forest Classifier & making predictions

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train a 350-tree random forest on the TF-IDF features
text_classifier = RandomForestClassifier(n_estimators=350, random_state=0).fit(X_train, y_train)

# Predict the held-out tweets and report how well the labels are recovered
predictions = text_classifier.predict(X_test)

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(
    "The algorithm has reached an accuracy of: "
    + str(accuracy_score(y_test, predictions)*100)
    + "%"
)

### Implementing Logistic regression Classifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import warnings

# NOTE: this silences every warning for the rest of the session
warnings.filterwarnings('ignore')

# Candidate values for LogisticRegression's C, searched with 5-fold cross-validation
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)

print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)

# Total number of fitted coefficients of the best model
print(grid.best_estimator_.coef_.ravel().size)

In [None]:
# GridSearchCV (default refit=True) has already refitted the best configuration
# on the full training set, so best_estimator_ can be scored directly.
lr = grid.best_estimator_
# Mean accuracy on the held-out tweets
print("Score: {:.2f}".format(lr.score(X_test, y_test)))

## 4. Labeling tweets with NLTK VADER

In [None]:
# Download NLTK's 'vader_lexicon' resource, then import the VADER sentiment analyzer
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Create the VADER analyzer once; the labelling cell below reuses it
sid = SentimentIntensityAnalyzer()

# Spot check: show the full score output for the first five tweets
for tweet in tweets_today_text[:5]:
    scores = sid.polarity_scores(tweet)
    print("{0}: \n{1}\n\n".format(tweet, scores))

In [None]:
# VADER's "compound" score of every tweet, rounded to two decimals
polarity_tweets_vader = [
    round(sid.polarity_scores(text)["compound"], 2) for text in tweets_today_text
]
# Turn each score into its 'positive' / 'neutral' / 'negative' label
sentiment_tweets_vader = list(map(get_sentiment, polarity_tweets_vader))

# One tuple per tweet: (label, polarity, text, author, creation date)
zipped_list_vader = list(
    zip(
        sentiment_tweets_vader,
        polarity_tweets_vader,
        tweets_today['tweet'],
        tweets_today['username'],
        tweets_today['creation date'],
    )
)

# Collect the labelled tweets in a new dataframe
sentiment_df_vader = pd.DataFrame(
    zipped_list_vader,
    columns=["Sentiment", "Polarity", "Tweet", "Username", "Creation Date"],
)
sentiment_df_vader.tail(20)

In [None]:
# How often each distinct polarity value occurs
sentiment_count_vader = sentiment_df_vader["Polarity"].value_counts().round(2)

# Sentiment label of every tweet, derived from its VADER polarity
sentiments_vader = list(map(get_sentiment, sentiment_df_vader["Polarity"]))

labels = ["positive", "neutral", "negative"]
colors = ["green", "yellow", "red"]
# One count per label, in the same order as `labels`
sentiment_count_list_vader = np.array([sentiments_vader.count(label) for label in labels])

fig1, ax1 = plt.subplots()
ax1.pie(
    sentiment_count_list_vader,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
    shadow=True,
)
ax1.axis('equal')
plt.title("Distribution of sentiment values")
plt.show()
print("total positive: {}".format(sentiments_vader.count('positive')))
print("total neutral: {}".format(sentiments_vader.count('neutral')))
print("total negative: {}".format(sentiments_vader.count('negative')))

Using this NLTK VADER implementation for sentiment analysis, we get entirely different results. There are far fewer neutral tweets, and more than half of the tweets now have a negative sentiment assigned to them.

In [None]:
# Histogram of the VADER polarity scores; hist() returns the axes it drew on
polarity_axes = sentiment_df_vader["Polarity"].hist(bins=100)
polarity_axes.set_title("Distribution of sentiment polarities")
polarity_axes.set_xlabel("Sentiment polarities")
polarity_axes.set_ylabel("Amount")
plt.show()

## 5. Implementing machine learning algorithms with NLTK VADER-labeled data

### Lemmatization (reduces words to dictionary root form) + filtering stop words

In [None]:
# Start from an empty document list and take the VADER-derived sentiment labels
# as classification targets
documents = list()
labels = sentiment_df_vader["Sentiment"].values

# Build the lookup once instead of calling stopwords.words() for every token;
# a frozenset also gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def filter_stopwords(word):
    """Return True when `word` should be kept, i.e. it is not an English stop word.

    The comparison is case-insensitive so capitalised stop words ("The", "And")
    are dropped as well; the NLTK stop-word list itself is lowercase.
    """
    return word.lower() not in _ENGLISH_STOPWORDS

# Tweet texts of the VADER-labeled dataframe
sentiment_df_text = sentiment_df_vader["Tweet"]

for tweet in sentiment_df_text:
    # Split on whitespace, lemmatize every token, drop stop words, re-join
    lemmas = [stemmer.lemmatize(token) for token in tweet.split()]
    kept = [token for token in lemmas if filter_stopwords(token)]
    documents.append(' '.join(kept))
print(len(documents))

### Vectorize words, filter stopwords

In [None]:
# TF-IDF = term frequency, inverse document frequency.
# Keep at most 3000 terms that appear in at least 5 documents and in at most 80% of them.
vectorizer = TfidfVectorizer(max_features=3000, min_df=5, max_df=0.80)
processed_features = vectorizer.fit_transform(documents)
processed_features = normalize(processed_features)
processed_features = processed_features.toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

processed_features

### Divide into training and test sets

In [None]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    processed_features, labels, test_size=0.2, random_state=1
)

### Implementing Random Forest Classifier & making predictions

In [None]:
# Train a 300-tree random forest on the TF-IDF features
text_classifier = RandomForestClassifier(n_estimators=300, random_state=1).fit(X_train, y_train)

# Predict the held-out tweets and report how well the labels are recovered
predictions = text_classifier.predict(X_test)

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(
    "The algorithm has reached an accuracy of: "
    + str(accuracy_score(y_test, predictions)*100)
    + "%"
)

### Implementing Logistic regression Classifier

In [None]:
# Candidate values for LogisticRegression's C, searched with 5-fold cross-validation
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)

print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)

In [None]:
# GridSearchCV (default refit=True) has already refitted the best configuration
# on the full training set, so best_estimator_ can be scored directly.
lr = grid.best_estimator_
# Accuracy on the held-out tweets as a percentage, rounded to two decimals
print("Score: {}%".format(round(lr.score(X_test, y_test) * 100, 2)))