<a href="https://colab.research.google.com/github/Nourhan-Adell/Natural-Language-Processing-Specialization/blob/main/1.Natural%20Language%20Processing%20with%20Classification%20and%20Vector/week_2/TweetCalassification_NaiiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Classify tweets with the Naive Bayes algorithm**

In [2]:
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer
from os import getcwd

# Fetch the NLTK resources used below: the sample tweets corpus and the
# stopword lists.
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## **1. Get and inspect the data**

In [6]:
# Let NLTK also search a "week2" folder under the current working directory
# when it looks up its data.
filePath = f"{getcwd()}/week2/"
# Guard the append so that re-running this cell does not keep adding the same
# entry to NLTK's search path.
if filePath not in nltk.data.path:
  nltk.data.path.append(filePath)

In [7]:
# Number of tweets of each class used for training; whatever is left in each
# class is held out for testing. Change the split in this one place only.
TRAIN_SIZE_PER_CLASS = 4000

# get the sets of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# split the data into two pieces, one for training and one for testing (validation set)
train_pos = all_positive_tweets[:TRAIN_SIZE_PER_CLASS]
train_neg = all_negative_tweets[:TRAIN_SIZE_PER_CLASS]

test_pos = all_positive_tweets[TRAIN_SIZE_PER_CLASS:]
test_neg = all_negative_tweets[TRAIN_SIZE_PER_CLASS:]

# positives first, then negatives -- the label arrays below use the same order
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# labels are 1 for positive and 0 for negative; sizes come from the actual
# splits, so no assumption is made about the length of the corpora
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

## **2. Preprocessing the data**

In [8]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import string

def preprocess_tweets(tweet):
  '''
    Clean, tokenize and stem a single tweet.
    Input:
        tweet: a string containing the raw text of one tweet
    Output:
        tweets_clean: a list of stemmed tokens, with English stopwords and
                      punctuation tokens removed
  '''
  stemmer = PorterStemmer()

  # A set gives constant-time membership tests in the loop below; the
  # corpus reader returns a list, which would be scanned once per token.
  stopwords_english = frozenset(stopwords.words('english'))

  # remove stock market tickers like $GE
  tweet = re.sub(r'\$\w*', '', tweet)
  # remove old style retweet text "RT" (only at the very start of the tweet)
  tweet = re.sub(r'^RT[\s]+', '', tweet)
  # remove hyperlinks (everything from the scheme up to the next whitespace)
  tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
  # remove only the hash sign, keeping the hashtag word itself
  tweet = re.sub(r'#', '', tweet)

  # tokenize the tweet
  tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  tweets_clean = []
  for word in tweet_tokens:
      if word in stopwords_english:  # remove stopwords
          continue
      # NOTE: `in` on a string is a substring test, so besides single
      # punctuation characters this also drops runs such as '()' that occur
      # inside string.punctuation. Kept as-is to preserve the token output.
      if word in string.punctuation:  # remove punctuation
          continue
      tweets_clean.append(stemmer.stem(word))  # stemming word
  return tweets_clean

In [9]:
# count_tweets: build the frequency table of (word, label) pairs

def count_tweets(result, tweets, ys):
  '''
    Tally how often each (word, sentiment) pair occurs in a labelled corpus.
    Input:
        result: a dictionary that accumulates the counts (it is updated in place)
        tweets: a list of tweets
        ys: a list with the sentiment of each tweet (either 0 or 1), in the
            same order as tweets
    Output:
        result: the same dictionary, mapping each (word, label) pair to its frequency
  '''
  for label, text in zip(ys, tweets):
    for token in preprocess_tweets(text):
      key = (token, label)
      # start from 0 for a pair that has not been seen yet
      result[key] = result.get(key, 0) + 1

  return result

In [11]:
def lookup(freqs, word, label):
  '''
    Return how many times a word was counted with a given label.
    Input:
        freqs: a dictionary mapping (word, label) pairs to frequencies
        word: the word to look up
        label: the label paired with the word
    Output:
        the stored frequency, or 0 when the pair is not in freqs
  '''
  return freqs.get((word, label), 0)

## **3. Train the Naive Bayes model**

In [10]:
# Build the (word, label) -> count table from the training tweets.
# `results` is filled in place and is the same object as `freqs`.
results = dict()
freqs = count_tweets(result=results, tweets=train_x, ys=train_y)

In [13]:
def train_NaiveBayes_model(freqs, train_x, train_y):
  '''
    Fit a two-class Naive Bayes model from (word, label) frequencies.
    Input:
        freqs: a dictionary mapping (word, label) pairs to their frequency;
               pairs whose label is > 0 count as positive, all others as negative
        train_x: a list of training tweets (not used by the computation; kept
                 so that existing calls keep working)
        train_y: the labels of the training tweets; values > 0 are counted
                 as positive documents, values <= 0 as negative documents
    Output:
        logprior: log(D_pos) - log(D_neg), where D_pos / D_neg are the numbers
                  of positive / negative training documents
        loglikelihood: a dictionary mapping every word of the vocabulary to
                       log(P(word | positive) / P(word | negative)), using
                       Laplacian (add-one) smoothing
  '''
  # One pass over freqs collects the vocabulary and the total number of word
  # occurrences in each class (N_pos, N_neg).
  vocab = set()
  N_pos = N_neg = 0
  for (word, label), count in freqs.items():
    vocab.add(word)
    if label > 0:
      N_pos += count
    else:
      N_neg += count
  V = len(vocab)

  # Count the documents of each class without building throw-away lists.
  labels = np.asarray(train_y)
  D_pos = np.count_nonzero(labels > 0)
  D_neg = np.count_nonzero(labels <= 0)

  # Calculate logprior
  logprior = np.log(D_pos) - np.log(D_neg)

  loglikelihood = {}
  for word in vocab:
    # a word that never occurred with a label has frequency 0 for it
    freq_pos = freqs.get((word, 1), 0)
    freq_neg = freqs.get((word, 0), 0)

    # probability of the word in each class, with Laplacian smoothing so that
    # neither probability can be zero
    prob_word_pos = (freq_pos + 1) / (N_pos + V)
    prob_word_neg = (freq_neg + 1) / (N_neg + V)

    # log likelihood ratio of the word
    loglikelihood[word] = np.log(prob_word_pos / prob_word_neg)

  return logprior, loglikelihood

In [14]:
# Fit the model, then show the log prior and the number of words that
# received a log likelihood (one value per line).
logprior, loglikelihood = train_NaiveBayes_model(freqs, train_x, train_y)
print(logprior, len(loglikelihood), sep="\n")

0.0
9162


## **4. Test the Naive Bayes model**

In [15]:
def NaiveBayes_predict(tweet, logprior, loglikelihood):
  '''
    Score one tweet with a trained Naive Bayes model.
    Input:
        tweet: a string containing the raw tweet
        logprior: the log prior of the model
        loglikelihood: a dictionary mapping words to their log likelihood
    Output:
        the log prior plus the log likelihood of every processed token of
        the tweet that is present in loglikelihood
  '''
  score = 0 + logprior

  for token in preprocess_tweets(tweet):
    if token not in loglikelihood:
      continue  # tokens without a log likelihood contribute nothing
    score = score + loglikelihood[token]

  return score

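If the returned score is greater than 0, the tweet is classified as positive; otherwise it is classified as negative. The threshold is 0 (not 1) because the score is a sum of log ratios.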

In [16]:
def test_NaiveBayes_model(test_x, test_y, logprior, loglikelihood, NaiveBayes_predict = NaiveBayes_predict):
  '''
    Measure the accuracy of the Naive Bayes model on labelled tweets.
    Input:
        test_x: a list of tweets
        test_y: the label (0 or 1) of each tweet, as a list or a numpy array
        logprior: the log prior of the model
        loglikelihood: a dictionary mapping words to their log likelihood
        NaiveBayes_predict: the scoring function, called as
                            NaiveBayes_predict(tweet, logprior, loglikelihood)
    Output:
        accuracy: 1 minus the mean absolute difference between the predicted
                  labels and test_y
  '''
  # a score above 0 is predicted as positive (1), anything else as negative (0)
  y_predict = np.array(
      [1 if NaiveBayes_predict(tweet, logprior, loglikelihood) > 0 else 0
       for tweet in test_x])

  # Convert the labels too, so that a plain Python list works as well as an
  # ndarray in the subtraction below.
  y_true = np.asarray(test_y)

  # mean absolute error; for 0/1 labels this is the share of wrong predictions
  error = np.mean(np.abs(y_predict - y_true))

  return 1 - error

In [17]:
# Evaluate on the held-out tweets and print the accuracy with four decimals.
nb_accuracy = test_NaiveBayes_model(test_x, test_y, logprior, loglikelihood)
print("Naive Bayes accuracy = %0.4f" % nb_accuracy)

Naive Bayes accuracy = 0.9955


# So, the Naive Bayes model classifies the held-out test tweets with an accuracy of 99.55%