In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

About this Competition

Where is this data from?

The collection of this data was funded by a Canada Foundation for Innovation JELF Grant to Chris Bauch, University of Waterloo. The dataset aggregates tweets pertaining to climate change collected between Apr 27, 2015 and Feb 21, 2018. In total, 43,943 tweets were collected. Each tweet is labelled as one of 4 classes, which are described below.

Class Description

2 News: the tweet links to factual news about climate change

1 Pro: the tweet supports the belief of man-made climate change

0 Neutral: the tweet neither supports nor refutes the belief of man-made climate change

-1 Anti: the tweet does not believe in man-made climate change

Variable definitions

Features

sentiment: Which class a tweet belongs in (refer to Class Description above)

message: Tweet body

tweetid: Twitter unique id

The files provided

train.csv - You will use this data to train your model.

test.csv - You will use this data to test your model.

SampleSubmission.csv - is an example of what your submission file should look like. The order of the rows does not matter, but the names of the tweetid's must be correct.

Predict Overview: EA Twitter Sentiment Classification

Companies would like to determine how people perceive climate change and whether or not they believe it is a real threat. Our mission is to deliver a precise and durable solution to this objective, granting companies the ability to tap into a wide range of consumer sentiments across various demographics and geographic regions. This, in turn, enhances their understanding and empowers them to shape future marketing strategies based on valuable insights.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
%matplotlib inline
import nltk
import re
import string
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
nltk.download('vader_lexicon')

Our data set consists of two features and a label. The main feature is the message column, which contains a tweet related to global warming. The label, sentiment, categorizes tweets into four classes, namely news, neutral, pro and anti. Our aim is to create a machine learning model that can accurately classify any tweet into one of these four classes based only on the text of the tweet.

In [None]:
df_test = pd.read_csv('/kaggle/input/edsa-sentiment-classification/test.csv')
df_test.head()

In [None]:
df_train = pd.read_csv('/kaggle/input/edsa-sentiment-classification/train.csv')
df_train.head()

Exploratory Data Analysis(EDA)

In [None]:
df_train.shape

The training set has 15819 rows (tweets) and 3 columns.

In [None]:
df_test.shape

The test set has 10546 rows (tweets) and 2 columns.

In [None]:
df_train.isnull().sum()

In [None]:
df_test.isnull().sum()

There are no null values

The tweetid feature simply uniquely identifies each tweet and most probably will add no real value in classification machine model training.

In [None]:
df_train['tweetid'].nunique()

In [None]:
df_train = df_train.drop('tweetid', axis=1)
df_train.head()

In order to analyze the length of tweets, we created a new feature called size which is a count of the number of characters per tweet.

In [None]:
size = [len(tweet) for tweet in df_train['message']]
df_train['size'] = size

In [None]:
# Only the last expression of a cell is displayed, so print the mode explicitly
# and let the summary statistics render as the cell output.
print("Most common tweet length:", df_train['size'].mode()[0])
df_train['size'].describe()

The tweets range from 14 to 208 characters in length. The average size of a tweet is about 124 characters long. Most tweets are 140 characters in length.

In [None]:
# Histogram of tweet lengths (in characters) over the whole training set
fig, ax = plt.subplots(figsize=(9, 4))
ax.hist(df_train['size'])
ax.set_title("Distribution of Tweet Lengths")
ax.set_xlabel("Length of Tweet In Characters")
ax.set_ylabel("Number of Tweets")
plt.show()

Box Plots

Below we attempt to visualize the 5 number summary of each category of tweet as well as the dataset as a whole using box and whiskers diagrams.

In [None]:
# Create one subset per sentiment class.
# Filter first and copy afterwards: only the selected rows are duplicated,
# instead of copying the whole frame four times before masking it.
df_anti = df_train[df_train['sentiment'] == -1].copy()
df_neutral = df_train[df_train['sentiment'] == 0].copy()
df_pro = df_train[df_train['sentiment'] == 1].copy()
df_news = df_train[df_train['sentiment'] == 2].copy()

# Tweet lengths per class, plus the full dataset for reference
pro_len = df_pro['size']
neutral_len = df_neutral['size']
anti_len = df_anti['size']
news_len = df_news['size']
data_len = df_train['size']

# Keep every label next to its data so the tick labels cannot drift out of
# step with the boxes they describe.
length_by_label = {
    'pro': pro_len,
    'anti': anti_len,
    'neutral': neutral_len,
    'news': news_len,
    'main data': data_len,
}
len_data = list(length_by_label.values())
labels = list(length_by_label.keys())

# One horizontal box per class
fig, ax = plt.subplots(figsize=(9,4))
ax.boxplot(len_data, vert=False)
ax.set_yticklabels(labels)

ax.set_title('Box and Whiskers Diagram For Tweet Lengths Per Category')
ax.set_xlabel('Length In Characters')
ax.set_ylabel('Category of Tweet')

plt.show()

Cleaning Data
In order to process the tweet messages more effectively the tweets are cleaned using the clean function defined in the code cell below. The clean function does the following.

* Converts all tweet text to lowercase.
* Removes URLs.
* Removes punctuation.
* Removes numbers.
* Removes stopwords.
* Removes line-break code syntax.

In [None]:
stopword=set(stopwords.words('english'))  
def clean(text):
    """Normalise a tweet for bag-of-words style processing.

    Steps, in order: lowercase, drop ``[...]`` spans, drop URLs, drop
    ``<...>`` tags, strip punctuation, turn line breaks into spaces, drop
    every token that contains a digit, and finally remove the words found in
    the module-level ``stopword`` set.

    Parameters
    ----------
    text : object
        The raw tweet; it is converted with ``str()`` before processing.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace line breaks with a space: deleting them outright glued the last
    # word of one line onto the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() without an argument collapses runs of whitespace, so removed
    # tokens do not leave empty strings (and double spaces) behind.
    return " ".join(word for word in text.split() if word not in stopword)

In [None]:
df_train["message"] = df_train["message"].apply(clean)

In [None]:
df_train.head()

Generating Wordcloud To Analyse Commonly Used Phrases¶

In [None]:
from wordcloud import WordCloud,ImageColorGenerator
text = " ".join(i for i in df_train["message"])
text = str(text)

In [None]:
wordcloud = WordCloud()
tweet_cloud = wordcloud.generate(text)
plt.figure( figsize=(9,4))
plt.imshow(tweet_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

Using a word cloud, we visualize which words and phrases are most commonly used in tweets related to global warming. The most prominent phrases seem to be:

* Climate Change
* Global Warming
* RT
* Change RT
* Believe Climate

In [None]:
def has_rt(x):
    """Return True when the cleaned tweet contains the standalone token "rt".

    A plain substring test (``'rt' in x``) also fires on ordinary words such
    as "earth", "support" or "party", which inflates the retweet count, so
    the match is anchored on word boundaries.
    """
    return re.search(r'\brt\b', x) is not None

# 1 when the tweet carries a retweet marker, 0 otherwise
df_train['rt'] = [1 if has_rt(i) else 0 for i in df_train['message']]
df_train.head()

The RT (an abbreviation of 'retweet') could signal that many individuals either share the same sentiments surrounding climate change or are active in attempting to engage with others on the topic based on information shared by others.

In [None]:
rt_counts = df_train["rt"].value_counts()
rt_counts

Generating a Pie Chart for Retweet Analysis

Most tweets have an rt which means it is highly likely if not certain that they are Retweets.

According to twitter.com:

'A Retweet is a re-posting of a Tweet. Twitter's Retweet feature helps you and others quickly share that Tweet with all of your followers. You can Retweet your own Tweets or Tweets from someone else. Sometimes people type "RT" at the beginning of a Tweet to indicate that they are re-posting someone else's content. This isn't an official Twitter command or feature, but signifies that they are quoting another person's Tweet.'

In [None]:
# value_counts() orders by frequency, so which flag comes first depends on the
# data. Pin the order to (1, 0) so each slice is guaranteed to match its label.
rt_counts = df_train["rt"].value_counts().reindex([1, 0], fill_value=0)

fig, ax = plt.subplots(figsize=(9, 4))
ax.pie(rt_counts, labels=["Has RT", "Has No RT"], explode=[0.05, 0], autopct='%1.1f%%')
ax.set_title("Pie Chart of Percentage Tweets with 'RT' Vs Without")
ax.axis('equal')  # keep the pie circular
plt.show()

Plotting the distribution of classes in our dataset, we see that most tweets are Pro (display belief in) global warming and climate change, with more than twice as many tweets as any other class. After that, a lot of the tweets are News related. Fewer tweets are Neutral, and the fewest are Anti (show little or no sign of belief in) global warming or climate change.

In [None]:
# Count the tweets in each sentiment class of the training data
senti_counts = df_train["sentiment"].value_counts()
# Look the counts up by sentiment code: 2 = news, 1 = pro, 0 = neutral, -1 = anti
news, pro, neutral, anti = (senti_counts[code] for code in (2, 1, 0, -1))

# Horizontal bar chart comparing the size of the four classes
class_names = ['News ','Pro','Neutral','Anti']
class_sizes = [news,pro,neutral,anti]

plt.figure(figsize=(9,4))
plt.barh(class_names, class_sizes)
plt.title('Distribution of Classes In The Dataset')
plt.xlabel('Count of Tweets')
plt.ylabel('Tweet Classification')
plt.show()

Analyzing The Overall Sentiment of The Data

The sentiment_score function is used to get a better idea of the underlying sentiment of each class and to see whether any of the classes correlate in terms of sentiment. The sentiment analyzer gives each tweet a score between 0 and 1 for each category: positive, negative and neutral. If the primary overall sentiment of a class is neutral, then the function will output a secondary sentiment to give the best overall picture of the general sentiment of each class.

In [None]:
def sentiment_score(df):
    """Score every tweet with the sentiment analyzer and total the scores.

    Each message is analysed once and its ``pos``, ``neg`` and ``neu`` scores
    are stored in new ``Positive``, ``Negative`` and ``Neutral`` columns.
    Note that ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``message`` column holding the tweet text.

    Returns
    -------
    tuple
        ``(positive_total, negative_total, neutral_total)`` summed over all
        rows of ``df``.
    """
    sentiments = SentimentIntensityAnalyzer()
    # Analyse each tweet a single time and reuse the result for all three
    # columns instead of running the analyzer three times per tweet.
    scores = [sentiments.polarity_scores(message) for message in df["message"]]
    df["Positive"] = [score["pos"] for score in scores]
    df["Negative"] = [score["neg"] for score in scores]
    df["Neutral"] = [score["neu"] for score in scores]
    a = sum(df["Positive"])
    b = sum(df["Negative"])
    c = sum(df["Neutral"])
    return (a, b, c)

def senti_score_analyzer(score):
    """Turn a (positive, negative, neutral) score triple into category labels.

    The first label is the dominant sentiment: positive or negative only when
    that score is strictly greater than both others, neutral otherwise.
    When the dominant sentiment is neutral, a second label names the stronger
    of positive and negative (negative on a tie).

    Returns a list with one or two labels.
    """
    positive, negative, neutral = score[0], score[1], score[2]

    if positive > negative and positive > neutral:
        return ["Positive 😊"]
    if negative > positive and negative > neutral:
        return ["Negative 😠"]

    # Neutral dominates (or there is a tie): report the underlying lean too
    secondary = "Positive 😊" if positive > negative else "Negative 😠"
    return ["Neutral 🙂", secondary]

First we apply the sentiment_score function to the dataset as a whole.

In [None]:
set_score = sentiment_score(df_train)
set_sentiment = senti_score_analyzer(set_score)

print("The dataset is mostly", set_sentiment[0], "in sentiment")
print("The dataset as a whole has an overall underlying sentiment of", set_sentiment[-1])

In [None]:
df_news = df_train.copy()[df_train['sentiment'] == 2]
news_score = sentiment_score(df_news)
news_sentiment = senti_score_analyzer(news_score)

print("The news class is mostly", news_sentiment[0], "in sentiment")
print("The news class has an overall underlying sentiment of", news_sentiment[-1])

In [None]:
df_pro = df_train.copy()[df_train['sentiment'] == 1]
pro_score = sentiment_score(df_pro)
pro_sentiment = senti_score_analyzer(pro_score)

print("The pro class is mostly", pro_sentiment[0], "in sentiment")
print("The pro class has an overall underlying sentiment of", pro_sentiment[-1])

In [None]:
df_neutral = df_train.copy()[df_train['sentiment'] == 0]
neutral_score = sentiment_score(df_neutral)
neutral_sentiment = senti_score_analyzer(neutral_score)

print("The neutral class is mostly", neutral_sentiment[0], "in sentiment")
print("The neutral class has an overall underlying sentiment of", neutral_sentiment[-1])

In [None]:
df_anti = df_train.copy()[df_train['sentiment'] == -1]
anti_score = sentiment_score(df_anti)
anti_sentiment = senti_score_analyzer(anti_score)

print("The anti class is mostly", anti_sentiment[0], "in sentiment")
print("The anti class has an overall underlying sentiment of", anti_sentiment[-1])

Although the entire dataset is mostly neutral in sentiment, it is slightly biased towards the negative side in all of the four classes. The neutral class is the only exception. It has an underlying overall sentiment of positive.

In [None]:
# One (positive, negative, neutral) total per data set, in the order of `titles`
data = [set_score, pro_score, anti_score, neutral_score, news_score]
titles = ["All Tweets", "Pro", "Anti", "Neutral", "News"]

# Rows = sentiment category, columns = data set
accumulated_data = np.array(data, dtype=float).T

fig, ax = plt.subplots(figsize=(9,4))

# Left edge position of every group of bars
x = np.arange(len(data))
bar_width = 0.2

# Draw the three categories side by side inside each group
for idx, variable in enumerate(('Positive', 'Negative', 'Neutral')):
    ax.bar(x + idx * bar_width, accumulated_data[idx, :], width=bar_width, label=variable)

# Put each tick under the middle bar so the label is centred on its group
# rather than sitting under the first bar only.
ax.set_xticks(x + bar_width)
ax.set_xticklabels(titles)
ax.set_xlabel('Data Sets')
ax.set_ylabel('Sentiment Score')
ax.set_title('Sentiment Score Comparison Across Different Classes')
ax.legend()

plt.show()

In [None]:
# Count how often every word occurs in the combined, cleaned tweet text
freq_dict = {}
words = text.split(" ")
for word in words:
    # Skip the empty / blank tokens left behind by consecutive spaces
    if word in (" ", ""):
        continue
    freq_dict[word] = freq_dict.get(word, 0) + 1

In [None]:
# Sort the (word, count) pairs by count in descending order and keep the first 30.
# NOTE: the variable is named `top_20_words`, but the slice keeps 30 entries.
sorted_freq_dict = sorted(freq_dict.items(), key=lambda x:x[1], reverse=True)[:30]
top_20_words = dict(sorted_freq_dict)

In [None]:
# Words go on the x-axis and their counts on the y-axis
x_labels = list(top_20_words.keys())
values = list(top_20_words.values())

fig, ax = plt.subplots(figsize=(9,4))
ax.bar(x_labels, values)

# The x-axis shows the words and the y-axis how often each one occurs
ax.set_xlabel('Word')
ax.set_ylabel('Frequency')
ax.set_title('Top 30 Most Used Words')

# Rotate the words so neighbouring labels do not overlap
ax.tick_params(axis='x', rotation=90)

plt.show()

In [None]:
# Summing the counts gives the total number of word occurrences;
# the number of distinct words is the number of keys.
total_words = sum(freq_dict.values())
unique_words = len(freq_dict)
print("The dataset has", total_words, "words in total, of which", unique_words, "are unique.")

Feature Engineering¶

In [None]:
from nltk.tokenize import TreebankWordTokenizer
from nltk import SnowballStemmer

tokeniser = TreebankWordTokenizer()
df_train['tokens'] = df_train['message'].apply(tokeniser.tokenize)

In [None]:
def bag_of_words_count(words, word_dict=None):
    """Add the occurrences of each word in ``words`` to a count dictionary.

    Parameters
    ----------
    words : iterable of str
        The tokens to count.
    word_dict : dict, optional
        Existing counts to update in place. When omitted, a new empty
        dictionary is created for this call; a shared ``{}`` default would
        silently carry counts over from one call to the next.

    Returns
    -------
    dict
        ``word_dict`` (or the newly created dictionary) mapping each word to
        the number of times it has been seen.
    """
    if word_dict is None:
        word_dict = {}
    for word in words:
        word_dict[word] = word_dict.get(word, 0) + 1
    return word_dict

In [None]:
# Build one bag-of-words dictionary per sentiment class
type_labels = list(df_train['sentiment'].unique())
# Group once, outside the loop: the grouping does not depend on `kind`
tokens_by_sentiment = df_train.groupby('sentiment')['tokens']
sentiment = {}
for kind in type_labels:
    sentiment[kind] = {}
    for row in tokens_by_sentiment.get_group(kind):
        sentiment[kind] = bag_of_words_count(row, sentiment[kind])

In [None]:
stemmer = SnowballStemmer('english')
def tweet_stemmer(words, stemmer):
    """Return a list with ``stemmer.stem`` applied to every item of ``words``, in order."""
    stems = []
    for token in words:
        stems.append(stemmer.stem(token))
    return stems

In [None]:
df_train['stem'] = df_train['tokens'].apply(tweet_stemmer, args=(stemmer, ))

In [None]:
plt.hist([word for word in freq_dict.values() if word < 10],bins=10)
plt.ylabel("# of words")
plt.xlabel("word frequency")
plt.show()

In [None]:
print(len([v for v in freq_dict.values() if v >= 5])) # number of distinct words that appear at least 5 times
occurs_more_than_5_times = sum([v for v in freq_dict.values() if v >= 5]) # total occurrences contributed by those words (the threshold is >= 5, despite the variable name)
print(occurs_more_than_5_times)

In [None]:
occurs_more_than_5_times / total_words

In [None]:
df_train.head()

Building Model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample

In [None]:
rlen = 2000
news_resampled = resample(df_news, random_state=2,n_samples = rlen) # reproducible results
pro_resampled = resample(df_pro, random_state=2,n_samples = rlen) # reproducible results
anti_resampled = resample(df_anti, random_state=2,n_samples = rlen) # reproducible results
neutral_resampled = resample(df_neutral, random_state=2,n_samples = rlen) # reproducible results

resampled = pd.concat([news_resampled,pro_resampled,anti_resampled,neutral_resampled])
resampled['sentiment'].value_counts()

In [None]:
# Class names and their sentiment codes, listed in the same order as the counts
# below. (Taking the x positions from `unique()` used order of appearance in
# the data, which does not match this order and mislabelled the bars.)
classes = ['news', 'pro', 'anti', 'neutral']
labels = [2, 1, -1, 0]
len_unsampled = [news, pro, anti, neutral]
resampled_len = [rlen, rlen, rlen, rlen]

plt.bar(labels, len_unsampled, color='grey')
plt.bar(labels, resampled_len, color='orange')
# Place the ticks at the numeric codes and show the class names as their text
plt.xticks(labels, classes)
plt.ylabel("# of observations")
plt.legend(['original','resampled'])
plt.show()

In [None]:
vect = CountVectorizer(stop_words='english', 
                             min_df=5, 
                             max_df=2000, 
                             ngram_range=(2, 3))
x=vect.fit_transform(df_train['message'])
y=df_train['sentiment']

In [None]:
#Split your data into training and testing sets:
X_train,X_test,y_train,y_test =train_test_split(x,y,random_state=26)

In [None]:
# Number of trees in random forest
n_estimators = [int(n) for n in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' has been removed from scikit-learn and, for classifiers, was only an
# alias of 'sqrt', so search two genuinely different settings instead.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(d) for d in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Search space handed to the randomized hyperparameter search
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rfr = RandomForestRegressor()
rfc = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rfc_random = RandomizedSearchCV(estimator = rfc, 
                               param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, n_jobs = -1)

In [None]:
# Tune on the training split only, so X_test / y_test stay unseen until the
# final evaluation.
rfc_random.fit(X_train, y_train)

In [None]:
rfc_random.best_params_

In [None]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rfc, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Final model with the tuned hyperparameters.
# - max_features='sqrt' replaces 'auto', which newer scikit-learn releases
#   reject (for classifiers it meant 'sqrt' anyway).
# - random_state makes the forest reproducible between runs.
rfc_best_params = RandomForestClassifier(n_estimators=1800,
                                         min_samples_split=2,
                                         min_samples_leaf=2,
                                         max_features='sqrt',
                                         max_depth=None,
                                         bootstrap=True,
                                         random_state=42,
                                         n_jobs=-1)
# Fit on the training split only: fitting on all of x, y and then scoring on
# X_test would evaluate the model on rows it has already seen.
rfc_best_params.fit(X_train, y_train)

In [None]:
y_pred=rfc_best_params.predict(X_test)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Use one averaging / zero-division policy for all three metrics. Scoring an
# undefined per-class metric as 0 is the conservative choice: a class that is
# never predicted must not push the macro average up.
metric_options = dict(average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred, **metric_options)
precision = precision_score(y_test, y_pred, **metric_options)
recall = recall_score(y_test, y_pred, **metric_options)
print("Macro F1:", f1)
print("Macro precision:", precision)
print("Macro recall:", recall)

In [None]:
# Build the submission from predictions on the real test set.
# `y_pred` holds predictions for the validation split of the training data, so
# it cannot be paired with df_test. Instead, push the test tweets through the
# same cleaning step and the already-fitted vectorizer, then predict.
test_messages = df_test['message'].apply(clean)
test_features = vect.transform(test_messages)
output = pd.DataFrame({
    'tweetid': df_test['tweetid'],
    'sentiment': rfc_best_params.predict(test_features),
})
output.to_csv('submission.csv', index=False)
output.head()

In [None]:
from nltk import SnowballStemmer
stemmer = SnowballStemmer('english')
def tweet_stemmer(words, stemmer):
    """Return a list with ``stemmer.stem`` applied to every item of ``words``, in order."""
    stems = []
    for token in words:
        stems.append(stemmer.stem(token))
    return stems

In [None]:
# Stem the token lists only. Applying the stemmer to the whole frame would pass
# every column (including numeric ones) to it, and `args=(stemmer)` is not a
# tuple; the extra argument must be passed as `(stemmer,)`.
df_train['stem'] = df_train['tokens'].apply(tweet_stemmer, args=(stemmer,))
df_train.head()