# <b>twitter sentiment detection - sentiment 140 - random forest & xgboost</b>
### by vidu widyalankara

### [slide deck with more info!](https://docs.google.com/presentation/d/e/2PACX-1vSOffim8oAY4UuielXu4wAQHRXD1puxc2szmvtBsZBtC6nnUoQOegfjHsK3UnQZdwZhD-QdjokBDBVx/pub?start=true&loop=false&delayms=5000)

# library imports

In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
# nltk's english stopword list plus extra tokens that should also be filtered out
extraStopwords = ["quot", "going", "twitpic", "I'm", "I've", "amp", "u", "im", "get", "got"]
adjustedStopwords = stopwords.words("english") + extraStopwords

# importing data

In [4]:
# column names are supplied explicitly when loading the csv
columnNames = ["Sentiment", "ID", "Date", "Query", "User", "Tweet"]
df = pd.read_csv('tweets.csv', names=columnNames, encoding='latin-1')
df.head(10)

Unnamed: 0,Sentiment,ID,Date,Query,User,Tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


# cleaning data

In [None]:
# remove the Query & ID columns, then preview a few random rows
df = df.drop(columns=['Query', 'ID'])
df.sample(5)

In [None]:
# formatting date to datetime object & converting timezone to EST
# raw values look like "Mon Apr 06 22:19:45 PDT 2009"; the zone abbreviation
# in front of the year is removed and an explicit format is used, so parsing
# does not depend on how the date parser treats "PDT" and no per-row format
# guessing is needed
naiveDates = df['Date'].str.replace(r'\s[A-Z]{3,4}(?=\s\d{4}$)', '', regex=True)
dateCol = pd.to_datetime(naiveDates, format='%a %b %d %H:%M:%S %Y')

# the parsed wall-clock times are interpreted as Pacific time, then converted
dateCol = dateCol.dt.tz_localize(tz='America/Los_Angeles').dt.tz_convert('US/Eastern')

# splitting date & hour column
df['Date'] = dateCol.dt.date
df['Hour'] = dateCol.dt.hour

# changing order of columns
df = df.reindex(columns = ['Sentiment', 'User', 'Tweet', 'Date', 'Hour'])

df.sample(10)

# exploratory data analysis (eda)

In [None]:
# earliest and latest values of the Date column
firstDate, lastDate = df['Date'].min(), df['Date'].max()
print(f"Date range of Tweets: {firstDate} -- {lastDate}")

In [None]:
# per-sentiment counts of non-null values in every column
tweets = df.groupby('Sentiment').count()

# bar chart of the Tweet counts, one bar per sentiment label
fig, ax = plt.subplots()
ax.bar(tweets.index, tweets["Tweet"])
ax.set_xlabel("Sentiment")
ax.set_ylabel("No. of Tweets")
ax.set_title("Number of Tweets per Sentiment")
plt.show()

In [None]:
# rows labelled happy (4) and sad (0)
happyRows = df.loc[df["Sentiment"] == 4]
sadRows = df.loc[df["Sentiment"] == 0]

# counts per posting hour for each label
happyTweets = happyRows.groupby(happyRows['Hour']).count()
sadTweets = sadRows.groupby(sadRows['Hour']).count()

# one line per label on a shared axis
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(happyTweets.index, happyTweets["Tweet"], label="happy tweets", color='#e3d800')
ax.plot(sadTweets.index, sadTweets["Tweet"], label="sad tweets", color='b')

# axis labels & title
ax.set_ylabel('No. of Tweets')
ax.set_xlabel('Time of Day (EST)')
ax.set_title("Time of Day of Happy & Sad Tweets")

# a tick on every second hour, rendered as "<hour>:00"
everyOtherHour = happyTweets.index[::2]
ax.set_xticks(everyOtherHour)
ax.set_xticklabels(everyOtherHour.astype(str) + ':00')
ax.legend(loc="upper center")

plt.show()

In [None]:
# using boolean filter, get only the text of happy / sad tweets
happyTweets = df.loc[df["Sentiment"] == 4, "Tweet"]
happyText = happyTweets.str.cat(sep=" ")
sadTweets = df.loc[df["Sentiment"] == 0, "Tweet"]
sadText = sadTweets.str.cat(sep=" ")

# average number of words per tweet, computed within each class:
# whitespace-split word count of every tweet, then the mean over that class
# (dividing a class total by the size of the whole dataset would understate it)
avgHappyWords = happyTweets.str.split().str.len().mean()
avgSadWords = sadTweets.str.split().str.len().mean()

# graph data
plt.bar(["Happy Tweets", "Sad Tweets"], [avgHappyWords, avgSadWords])
plt.ylabel("Number of Words")
plt.title("Average Number of Words for Happy & Sad Tweets")
plt.show()

In [None]:
from wordcloud import WordCloud

# render wordcloud from the concatenated happy-tweet text
wordcloud = WordCloud(
    width=1000, height=500,
    stopwords=set(adjustedStopwords),
    background_color="white"
).generate(happyText)

# show (plt.show takes no figure/image argument; the image is drawn by imshow)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# render wordcloud from the concatenated sad-tweet text
wordcloud = WordCloud(
    width=1000, height=500,
    stopwords=set(adjustedStopwords),
    background_color="black"
).generate(sadText)

# show (plt.show takes no figure/image argument; the image is drawn by imshow)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# modelling

In [None]:
# relabel sentiment 4 as 1, leaving every other label untouched
df["Sentiment"] = df["Sentiment"].replace({4: 1})
df.sample(10)

In [None]:
from sklearn.model_selection import train_test_split

# build a subset with an equal number of tweets per sentiment label
# rows are drawn at random within each label (seeded), so the balance does not
# depend on the file being sorted by sentiment or on a fixed row offset;
# sampling raises if a label has fewer than numberOfTweets // 2 rows
numberOfTweets = 500_000
balancedTweets = df.groupby('Sentiment').sample(n=numberOfTweets // 2, random_state=42)
x = balancedTweets['Tweet']
y = balancedTweets['Sentiment']

# split dataset into train & test, keeping the label proportions in both parts
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# bag of words: word-level counts, lowercased, stopwords removed,
# vocabulary capped at 30,000 features
bowSettings = dict(
    analyzer="word",
    lowercase=True,
    stop_words=adjustedStopwords,
    max_features=30_000,
)
vectorizer = CountVectorizer(**bowSettings)

# vocabulary is fitted on the training tweets and reused for the test tweets
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# train random forest model
# random_state is fixed so repeated runs build the same forest;
# n_jobs=-1 lets the trees be built on all available cores
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(x_train, y_train)

In [None]:
# accuracy of the random forest on the data it was fitted on
rfTrainAccuracy = rf.score(x_train, y_train)
rfTrainAccuracy

In [None]:
from sklearn.metrics import accuracy_score

# predict the test tweets with the random forest and score against the true labels
y_pred = rf.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion matrix of true labels (rows) vs predicted labels (columns)
rfcm = confusion_matrix(y_test, y_pred)

# annotated heatmap with the y axis flipped
ax = sns.heatmap(rfcm, annot=True, fmt='1,')
ax.invert_yaxis()
ax.set_xlabel("Y_pred")
ax.set_ylabel("Y_test")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
from sklearn.metrics import classification_report

# text report comparing the true test labels with the current predictions
rfReport = classification_report(y_test, y_pred)
print(rfReport)

### XGBoost

In [None]:
import xgboost as xgb

# fit an xgboost classifier with 2000 estimators on the vectorized training tweets
numEstimators = 2000
xg = xgb.XGBClassifier(n_estimators=numEstimators)
xg.fit(x_train, y_train)

In [None]:
# accuracy of the xgboost model on the data it was fitted on
xgTrainAccuracy = xg.score(x_train, y_train)
xgTrainAccuracy

In [None]:
# predict the test tweets with the xgboost model and score against the true labels
y_pred = xg.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# confusion matrix of true labels (rows) vs predicted labels (columns)
xgcm = confusion_matrix(y_test, y_pred)

# annotated heatmap with the y axis flipped
ax = sns.heatmap(xgcm, annot=True, fmt='1,')
ax.invert_yaxis()
ax.set_xlabel("Y_pred")
ax.set_ylabel("Y_test")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# text report comparing the true test labels with the current predictions
xgReport = classification_report(y_test, y_pred)
print(xgReport)

# classifying new input

In [None]:
# example phrase, turned into counts with the already-fitted vectorizer
examplePhrase = pd.Series(np.array(["i hate the rain"]))
sentence = vectorizer.transform(examplePhrase)

# predicted label: 0 = sad, 1 = happy
xg.predict(sentence)