# ***Loading and Cleaning Data***

In [None]:
import numpy as np
import pandas as pd
import math
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [None]:
df = pd.read_csv('/content/twitter_training.csv')
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [None]:
df['Positive'] = np.where(df['Positive']=='Positive',1,
                          np.where(df['Positive']=='Neutral',0,-1))
df = df.drop("2401",axis=1)
df = df.drop('Borderlands',axis=1)
df = df.dropna()
df = df.drop(df[df['Positive']=='Irrelevant'].index)
df.rename(columns = {'Positive':'Sentiment', 'im getting on borderlands and i will murder you all ,':'Tweet'},inplace=True)
df.head()

Unnamed: 0,Sentiment,Tweet
0,1,I am coming to the borders and I will kill you...
1,1,im getting on borderlands and i will kill you ...
2,1,im coming on borderlands and i will murder you...
3,1,im getting on borderlands 2 and i will murder ...
4,1,im getting into borderlands and i can murder y...


# ***Preprocessing the tweets***

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def preprocess(tweet):

  tweet = re.sub(r'^RT[\s]+','',tweet)
  tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet)
  tweet = re.sub(r'#','',tweet)

  tokenizer = TweetTokenizer(
                  preserve_case = False,
                  strip_handles = True,
                  reduce_len = True
              )
  tweet_tokens = tokenizer.tokenize(tweet)
  tweet_tokens

  stopwords_eng = stopwords.words('english')
  tweets_clean = []
  for word in tweet_tokens:
    if (word not in stopwords_eng and word not in string.punctuation):
      tweets_clean.append(word)
  tweets_clean

  stemmer = PorterStemmer()
  tweets_stem = []
  for word in tweets_clean:
    stem_word = stemmer.stem(word)
    tweets_stem.append(stem_word)
  tweets_stem

  return tweets_stem

In [None]:
df['Tokenized_tweet'] = df['Tweet'].apply(preprocess)
df.head()

Unnamed: 0,Sentiment,Tweet,Tokenized_tweet
0,1,I am coming to the borders and I will kill you...,"[come, border, kill]"
1,1,im getting on borderlands and i will kill you ...,"[im, get, borderland, kill]"
2,1,im coming on borderlands and i will murder you...,"[im, come, borderland, murder]"
3,1,im getting on borderlands 2 and i will murder ...,"[im, get, borderland, 2, murder]"
4,1,im getting into borderlands and i can murder y...,"[im, get, borderland, murder]"


# ***Implementing Naive's Bayes***

In [None]:
def count_tweets(tweets, y):
  result = {}
  for i,tweet in zip(y,tweets):
    for word in tweet:
      pair = (word,i)
      result[pair] = result.get(pair,0)+1
  return result

In [None]:
freqs = count_tweets(df['Tokenized_tweet'],df['Sentiment'])

In [None]:
def Train_Naive_Bayes(x,y,freqs):

  log_likelihood = {}

  vocab = set()
  for pair in freqs:
    vocab.add(pair[0])
  V = len(vocab)

  N_total = 0
  N_pos = 0
  N_neg = 0
  N_neu = 0

  for pair in freqs:
    if pair[1]==1 :
      N_pos += freqs[pair]
    elif pair[1]==0:
      N_neu += freqs[pair]
    else:
      N_neg += freqs[pair]

  N_total = N_pos + N_neg + N_neu
  log_prior_pos = math.log(N_pos) - math.log(N_total)
  log_prior_neg = math.log(N_neg) - math.log(N_total)
  log_prior_neu = math.log(N_neu) - math.log(N_total)
  log_prior = [log_prior_neg,log_prior_neu,log_prior_pos]

  for word in vocab:
    P_pos = math.log((freqs.get((word,1),0) + 1)/(N_pos + V))
    P_neu = math.log((freqs.get((word,0),0) + 1)/(N_neu + V))
    P_neg = math.log((freqs.get((word,-1),0) + 1)/(N_neg + V)) + math.log((N_total+V)/(freqs.get((word,-1),0)+ 1))
    log_likelihood[word] = [P_neg, P_neu, P_pos]

  return log_prior,log_likelihood

In [None]:
log_prior, log_likelihood = Train_Naive_Bayes(df['Tokenized_tweet'],df['Sentiment'],freqs)

# ***Predicting the Class***

In [None]:
def class_probablity(tokens,y,log_prior,log_likelihood):
  likelihood = log_prior[y]
  for word in tokens:
    likelihood += log_likelihood.get(word,[0,0,0])[y]
  return likelihood

In [None]:
def predict_class(tweet):
  likelihood = [0,0,0]
  tokens = preprocess(tweet)
  for i in range (3):
    likelihood[i] = class_probablity(tokens,i,log_prior,log_likelihood)
  sentiments = ['Negative', 'Neutral', 'Positive']
  sentiment = sentiments[np.argmax(likelihood)]
  return sentiment

In [None]:
df['Predicted_Sentiment'] = df['Tweet'].apply(predict_class)
df['Predicted_Sentiment'] = np.where(df['Predicted_Sentiment']=='Positive',1,
                          np.where(df['Predicted_Sentiment']=='Neutral',0,-1))
count = sum(df['Sentiment']==df['Predicted_Sentiment'])
accuracy = (count/df.shape[0])*100
accuracy

28.254611798094466

# ***Checking Accuracy on Validaton Dataset***

In [None]:
df_test = pd.read_csv('/content/twitter_validation.csv')
df_test.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [None]:
df_test = df_test.drop('3364',axis=1)
df_test = df_test.drop('Facebook',axis=1)
df_test = df_test.dropna()
df_test = df.drop(df_test[df_test['Irrelevant']=='Irrelevant'].index)
df_test.rename(columns = {'Irrelevant':'Sentiment',
                          'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣':'Tweet'},inplace=True)
df_test['Predicted_Sentiment'] = df_test['Tweet'].apply(predict_class)
df.head()

Unnamed: 0,Sentiment,Tweet,Tokenized_tweet,Predicted_Sentiment
0,1,I am coming to the borders and I will kill you...,"[come, border, kill]",1
1,1,im getting on borderlands and i will kill you ...,"[im, get, borderland, kill]",1
2,1,im coming on borderlands and i will murder you...,"[im, come, borderland, murder]",1
3,1,im getting on borderlands 2 and i will murder ...,"[im, get, borderland, 2, murder]",1
4,1,im getting into borderlands and i can murder y...,"[im, get, borderland, murder]",1


In [None]:
df_test['Predicted_Sentiment'] = df_test['Tweet'].apply(predict_class)

In [None]:
df_test.head()

Unnamed: 0,Sentiment,Tweet,Tokenized_tweet,Predicted_Sentiment
0,1,I am coming to the borders and I will kill you...,"[come, border, kill]",Positive
1,1,im getting on borderlands and i will kill you ...,"[im, get, borderland, kill]",Positive
2,1,im coming on borderlands and i will murder you...,"[im, come, borderland, murder]",Positive
3,1,im getting on borderlands 2 and i will murder ...,"[im, get, borderland, 2, murder]",Positive
4,1,im getting into borderlands and i can murder y...,"[im, get, borderland, murder]",Positive


In [None]:
lis = ['Negative','Neutral','Positive']
df_test['Sentiment'] = df_test['Sentiment'].apply(lambda x: lis[x+1])
count = sum(df_test['Sentiment']==df_test['Predicted_Sentiment'])
accuracy = (count/df_test.shape[0])*100
accuracy

28.227947550931948