# Importing Libraries

In [1]:
from sklearn.pipeline import Pipeline
from nltk.tokenize import RegexpTokenizer
import re
import numpy as np

In [2]:
#importing the CSV files into Jupyter using Pandas
import pandas as pd

training_data = pd.read_csv('twitter_training.csv')
display(training_data)
print(training_data.dtypes)

Unnamed: 0,Tweet ID,Theme,Sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


Tweet ID      int64
Theme        object
Sentiment    object
Tweet        object
dtype: object


# Checking for null values and removing them if found

In [3]:
np.sum(training_data.isnull().any(axis=1))

686

In [4]:
training_data = training_data.dropna()

In [5]:
training_data = training_data[['Tweet', 'Sentiment']]
training_data.head()

Unnamed: 0,Tweet,Sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


In [6]:
import pandas as pd
import numpy as np
from collections import OrderedDict

training_data['Tweet'] = training_data['Tweet'].str.lower()

In [27]:
#importing and initalizing all pre-proccessing tools
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import RegexpTokenizer
import string


#lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [8]:
print(stop_words)

{'did', 'isn', "shan't", 'having', 'am', 'most', 'on', 'any', 'haven', 'has', 'these', 'yourselves', 'have', "haven't", 'the', 'can', 'my', "you're", "weren't", 'over', 'your', 'hers', 'during', 'that', "aren't", 'out', 'their', 'no', 'because', 'nor', 'than', 'herself', 'being', 'mightn', 'an', "you'd", 'you', 'ourselves', 'should', 'couldn', 'wasn', 'where', 'i', 'whom', "couldn't", 'here', 'now', 'myself', 'ma', 'weren', "hadn't", 'theirs', 'those', "don't", 'some', 'him', 'a', 'they', 'other', 'who', 'wouldn', 'll', "you'll", 'this', 'doesn', "that'll", 'such', "needn't", 'been', 'she', 'which', 'themselves', 'when', 'is', 'to', 'itself', 'had', 'was', 'were', 'once', 'our', "hasn't", 'but', 'while', 'why', 'not', 'too', 'there', 'more', 'shouldn', 'how', "wouldn't", 'them', 'doing', 'does', 'before', 'own', 'ours', "mustn't", 'then', "didn't", 'hasn', 'or', "it's", 'into', 'yours', 'in', 'me', 'd', 'about', "shouldn't", 'didn', 'himself', "you've", 'don', 'mustn', "mightn't", 'unt

# Preprocessing Tweets

The inspiration for the methods below to clean the data came from course homework and from the article "Twitter Sentiment Analysis- A NLP Use-Case for Beginners" by Gunjan Goyal

https://www.analyticsvidhya.com/blog/2021/06/twitter-sentiment-analysis-a-nlp-use-case-for-beginners/

In [9]:
def cleaning_stopwords(tweet):
    return " ".join([word for word in str(tweet).split() if word not in stop_words])

training_data['Tweet'] = training_data['Tweet'].apply(lambda tweet: cleaning_stopwords(tweet))

training_data.head()

Unnamed: 0,Tweet,Sentiment
0,"im getting borderlands murder ,",Positive
1,"coming borders kill all,",Positive
2,"im getting borderlands kill all,",Positive
3,"im coming borderlands murder all,",Positive
4,"im getting borderlands 2 murder all,",Positive


In [10]:
string_punc = string.punctuation

def cleaning_punc(tweet):
    translator = str.maketrans('', '', string_punc)
    return tweet.translate(translator)


training_data['Tweet'] = training_data['Tweet'].apply(lambda x: cleaning_punc(x))

training_data.head()

Unnamed: 0,Tweet,Sentiment
0,im getting borderlands murder,Positive
1,coming borders kill all,Positive
2,im getting borderlands kill all,Positive
3,im coming borderlands murder all,Positive
4,im getting borderlands 2 murder all,Positive


In [11]:
def cleaning_URLs(tweet):
    return re.sub('((www.[^s]+) | (https?://[^s]+))',' ', tweet)

training_data['Tweet'] = training_data['Tweet'].apply(lambda x: cleaning_URLs(x))

training_data.head()

Unnamed: 0,Tweet,Sentiment
0,im getting borderlands murder,Positive
1,coming borders kill all,Positive
2,im getting borderlands kill all,Positive
3,im coming borderlands murder all,Positive
4,im getting borderlands 2 murder all,Positive


In [12]:
def cleaning_numbers(tweet):
    return re.sub('[0-9]+', '', tweet)

training_data['Tweet'] = training_data['Tweet'].apply(lambda x: cleaning_numbers(x))

training_data.head()

Unnamed: 0,Tweet,Sentiment
0,im getting borderlands murder,Positive
1,coming borders kill all,Positive
2,im getting borderlands kill all,Positive
3,im coming borderlands murder all,Positive
4,im getting borderlands murder all,Positive


In [13]:
tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
training_data['Tweet'] = training_data['Tweet'].apply(tokenizer.tokenize)
training_data.tail()

Unnamed: 0,Tweet,Sentiment
74677,"[realized, windows, partition, mac, like, year...",Positive
74678,"[realized, mac, window, partition, years, behi...",Positive
74679,"[realized, windows, partition, mac, years, beh...",Positive
74680,"[realized, windows, partition, mac, like, year...",Positive
74681,"[like, windows, partition, mac, like, years, b...",Positive


In [14]:
import nltk

stemmer = nltk.PorterStemmer()

def stemming_text(data):
    text = [stemmer.stem(word) for word in data]
    return text

training_data['Tweet'] = training_data['Tweet'].apply(lambda x: stemming_text(x))
training_data['Tweet'].head()

0          [im, get, borderland, murder]
1              [come, border, kill, all]
2       [im, get, borderland, kill, all]
3    [im, come, borderland, murder, all]
4     [im, get, borderland, murder, all]
Name: Tweet, dtype: object

In [15]:
lemmatizer = nltk.WordNetLemmatizer()

def lemmatizing_text(data):
    text = [lemmatizer.lemmatize(word) for word in data]
    return text

training_data['Tweet'] = training_data['Tweet'].apply(lambda x: lemmatizing_text(x))
training_data['Tweet'].head()

0          [im, get, borderland, murder]
1              [come, border, kill, all]
2       [im, get, borderland, kill, all]
3    [im, come, borderland, murder, all]
4     [im, get, borderland, murder, all]
Name: Tweet, dtype: object

# Splitting Dataset into different Sentiment Values

In [16]:
positive_tweets = training_data.loc[training_data['Sentiment'] == 'Positive']
negative_tweets = training_data.loc[training_data['Sentiment'] == 'Negative']

In [17]:
negative_tweets

Unnamed: 0,Tweet,Sentiment
24,"[biggest, dissappoin, life, came, year, ago, f...",Negative
25,"[biggest, disappoint, life, came, year, ago]",Negative
26,"[biggest, disappoint, life, came, year, ago]",Negative
27,"[biggest, dissappoin, life, come, year, ago, f...",Negative
28,"[biggest, male, dissappoin, life, came, hang, ...",Negative
...,...,...
74665,"[nvidia, realli, delay, week]",Negative
74666,"[nvidia, delay, week]",Negative
74667,"[nvidia, realli, delay, sever, week]",Negative
74668,"[nvidia, realli, delay, flight, week]",Negative


# Training Model

The inspiration for this next part of the project came from the youtube tutorial located at:

https://www.youtube.com/watch?v=OsSkjrNjqNI

Titled: Twitter Sentiment Analysis (Naive Bayes Classifier) by Artificial Intelligence at UCI

In [18]:
test_positive = positive_tweets[:10327].Tweet
train_positive = positive_tweets[:10327].Tweet

test_negative = negative_tweets[:11179].Tweet
train_negative = negative_tweets[:11179].Tweet

In [19]:
X_train = pd.concat([train_positive, train_negative])
X_test = pd.concat([test_positive, test_negative])

y_train = np.append(np.ones(len(train_positive)), np.zeros(len(train_negative)))
y_test = np.append(np.ones(len(test_positive)), np.zeros(len(test_negative)))

# Frequency Dictionary

In [20]:
def process_tweets(tweet):
    
    tweet = cleaning_stopwords(tweet)
    tweet = cleaning_punc(tweet)
    tweet = cleaning_URLs(tweet)
    tweet = cleaning_numbers(tweet)
    tweet = tokenizer.tokenize(tweet)
    tweet = stemming_text(tweet)
    tweet = lemmatizing_text(tweet)
    
    return tweet

In [21]:
def create_frequency(tweets, y_value):
    
    frequency_dictionary = {}
    
    for tweet, y in zip(tweets, y_value):
        for word in process_tweets(tweet):
            
            pair = (word, y)
            
            if pair in frequency_dictionary:
                frequency_dictionary[pair] += 1
                
            else:
                frequency_dictionary[pair] = frequency_dictionary.get(pair, 1)
        
    return frequency_dictionary

In [22]:
freqs = create_frequency(X_train, y_train)

In [23]:
def train_naive_bayes(freqs, X_train, y_train):
    
    loglikelihood = {}
    logprior = 0
    
    unique_words = set([pair[0] for pair in freqs.keys()])
    V = len(unique_words)
    
    N_pos = N_neg = 0
    for pair in freqs.keys():
        
        if pair[1] > 0:
            N_pos += freqs[(pair)]
            
        else:
            N_neg += freqs[(pair)]
            
    D = y_train.shape[0]
    
    D_pos = sum(y_train)
    
    D_neg = D - sum(y_train)
    
    logprior = np.log(D_pos) - np.log(D_neg)
    
    for word in unique_words:
        
        frequency_positive = freqs.get((word, 1), 0)
        frequency_negative = freqs.get((word, 0), 0)
        
        probability_word_positive = (frequency_positive + 1) / (N_pos + V)
        probability_word_negative = (frequency_negative + 1) / (N_neg + V)
        
        loglikelihood[word] = np.log(probability_word_positive / probability_word_negative)
        
    return logprior, loglikelihood

In [24]:
logprior, loglikelihood = train_naive_bayes(freqs, X_train, y_train)

# Predicting Tweets

In [25]:
def naive_bayes_predict(tweet, logprior, loglikelihood):

    word_list = process_tweets(tweet)

    probability = 0

    probability += logprior

    for word in word_list:

        if word in loglikelihood:

            probability += loglikelihood[word]

        return probability

In [26]:
for tweet in ['I am happy', 'angry', 'sad', 'I am so excited', 'good']:
    
    probability = naive_bayes_predict(tweet, logprior, loglikelihood)
    
    print(f'{tweet} -> {probability:.2f}')

I am happy -> 0.43
angry -> -1.37
sad -> -0.75
I am so excited -> 0.43
good -> 1.19
