# Tweet sentiment analysis

In [None]:
from textblob import TextBlob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

## 1.Sentiment analysis with TextBlob

In [None]:
# Load the exported tweets for the week of 16 September
tweets_week16september = pd.read_csv(r"C:\stage\project\datasets\tweets_week16-09.csv")

# Score every tweet with TextBlob; polarity is rounded to two decimals
tweets_week16september_text = tweets_week16september["tweet"]
textblob_tweets = list(map(TextBlob, tweets_week16september_text))
sentiment_tweets = [round(blob.sentiment.polarity, 2) for blob in textblob_tweets]

# Pair each score with the tweet's date, text and author (row order is preserved)
zipped_list = list(
    zip(
        tweets_week16september['creation date'],
        tweets_week16september['tweet'],
        sentiment_tweets,
        tweets_week16september['username'],
    )
)

# Collect the rows in a new dataframe with presentation-friendly column names
sentiment_columns = ["Creation Date", "Tweet", "Polarity", "Username"]
sentiment_df = pd.DataFrame(zipped_list, columns=sentiment_columns)

# Show the last rows as a quick sanity check
sentiment_df.tail()

In [None]:
# Count how many tweets received each (rounded) polarity score
sentiment_count = sentiment_df["Polarity"].value_counts()
x = sentiment_count.index
y = sentiment_count.values

plt.figure(figsize=(20, 8))
sns.set_palette("RdYlGn")
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
sns.barplot(x=x, y=y, alpha=0.8)

# Start the y-axis ticks at zero and extend them far enough to cover the
# tallest bar (np.arange excludes its stop value, hence the extra step).
tick_step = 10
highest_count = int(max(y, default=0))
plt.yticks(np.arange(0, highest_count + tick_step, tick_step))

# Rotate the polarity labels so neighbouring values do not overlap
plt.xticks(rotation=65)
plt.ylabel("Amount of tweets")
plt.xlabel("Sentiment polarity")
# Lay out the figure only after the labels exist, so they are not clipped
plt.tight_layout()

plt.show()

In [None]:
def get_sentiment(polarity):
    """Translate a numeric polarity score into a sentiment label.

    Returns 'neutral' when the score equals zero, 'positive' when it is
    greater than zero, and 'negative' for every other value.
    """
    if polarity == 0:
        return 'neutral'
    return 'positive' if polarity > 0 else 'negative'

# Label every tweet as positive / neutral / negative based on its polarity
sentiments = sentiment_df["Polarity"].map(get_sentiment).tolist()

# Slice order of the pie chart, with a matching colour per label
labels = ["positive", "neutral", "negative"]
colors = ["green", "yellow", "red"]

# Number of tweets per label, in the same order as `labels`
sentiment_count_list = np.array([sentiments.count(label) for label in labels])

fig1, ax1 = plt.subplots()
ax1.pie(
    sentiment_count_list,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
    shadow=True,
)
# Equal aspect ratio keeps the pie circular
ax1.axis('equal')
plt.show()

As you can see, this sentiment analysis is very basic and not very accurate.
This analysis suggests that most people who tweet about climate change have a positive or neutral sentiment about it. Each tweet is labelled from its polarity score as follows:
* sentiment polarity < 0 &nbsp;&nbsp;&nbsp; => negative 
* sentiment polarity > 0 &nbsp;&nbsp;&nbsp; => positive 
* sentiment polarity = 0 &nbsp;&nbsp;&nbsp; => neutral

## 2.Sentiment analysis with SciKit Learn (Random Forest)

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

### Lemmatization (reduces words to dictionary root form)

In [None]:
# WordNetLemmatizer looks words up in the WordNet corpus, which is not part
# of the 'stopwords' download; fetch it so the cell works in a fresh environment.
nltk.download('wordnet')
stemmer = WordNetLemmatizer()

# Read CSV file
tweets_week16september = pd.read_csv(r"C:\stage\project\datasets\tweets_week16-09.csv")

# Extract tweet column from df
tweets_week16september_text = tweets_week16september["tweet"]

# Lemmatize each whitespace-separated token and rebuild the tweet as one
# space-joined string; `documents` keeps the same order as the tweets.
documents = [
    ' '.join(stemmer.lemmatize(word) for word in tweet.split())
    for tweet in tweets_week16september_text
]

### Vectorize words, filter stopwords and initialize training and testing sets

In [None]:
# TF-IDF (term frequency - inverse document frequency) weighs a word by how
# often it occurs in a tweet, discounted by how many tweets contain it.
# Keep at most 1000 terms, drop terms found in fewer than 5 tweets or in more
# than 80% of them, and ignore English stopwords.
vectorizer = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.80, stop_words=stopwords.words('english'))
processed_features = vectorizer.fit_transform(documents).toarray()

# Features: one TF-IDF row per tweet. The whole matrix is X; unpacking it as
# `X, y = processed_features` would split it along the document axis instead.
X = processed_features
# Labels: the per-tweet positive / neutral / negative classes derived from the
# TextBlob polarity in section 1 (same CSV, same row order).
# NOTE(review): assumed to be the intended target since the dataset shows no
# other label column - swap in hand-labelled data if it is available.
y = np.asarray(sentiments)
if len(y) != X.shape[0]:
    raise ValueError(
        f"Expected one label per document, got {len(y)} labels for {X.shape[0]} documents"
    )

# Hold out 20% of the tweets for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


### Training the model