Steps:


1.   get a dataset with positive and negative tweets
2.   split into train and test set
3.   Preprocess the tweets
4.   Compute freq(w,class)
5.   Train the model
6.   Make prediction
7.   Find accuracy for test tweets




In [9]:
# import dependencies

import numpy as np
import pandas as pd
import string

import nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer

In [10]:
nltk.download('stopwords')
nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [11]:
from google.colab import files
files.upload()

Saving utils.py to utils.py


{'utils.py': b'# -*- coding: utf-8 -*-\n"""utils.ipynb\n\nAutomatically generated by Colaboratory.\n\nOriginal file is located at\n    https://colab.research.google.com/drive/14Fp_ARD8EDcQ4_AtdG3jCXoc9RODzHTV\n"""\n"""Import Dependencies"""\n\nimport numpy as np\nimport string\nimport re\n\nimport nltk\nfrom nltk.corpus import stopwords\nfrom nltk.stem import PorterStemmer\nfrom nltk.tokenize import TweetTokenizer\n\nimport matplotlib.pyplot as plt\nfrom matplotlib.patches import Ellipse\nimport matplotlib.transforms as transforms\n\n"""Preprocessing function"""\n\ndef process_tweet(tweet):\n  """\n  Input: a string\n  output: list of words\n  """\n\n  stemmer = PorterStemmer()\n  tokenizer = TweetTokenizer(preserve_case = False, reduce_len = True, strip_handles = True)\n  stopword_english = stopwords.words(\'english\')\n\n  # remove retweet sign, stock tickers, urls, #\n\n  tweet = re.sub(r\'^RT[\\s]+\', \'\', tweet)\n  tweet = re.sub(r\'^\\$\\w*\', \'\', tweet)\n  tweet = re.sub(r\'h

In [12]:
from utils import process_tweet, lookup, count_freqs

In [13]:
# get all the positive and negative tweets

positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [14]:
# split into train and test set

pos_train = positive_tweets[:4000]
pos_test = positive_tweets[4000:]

neg_train = negative_tweets[:4000]
neg_test = negative_tweets[4000:]

train_x = pos_train + neg_train
test_x = pos_test + neg_test

train_y = np.append(np.ones(len(pos_train)), np.zeros(len(neg_train)))
test_y = np.append(np.ones(len(pos_test)), np.zeros(len(neg_test)))

In [16]:
print(type(train_y))
print(train_y.shape)
print(test_y.shape)

<class 'numpy.ndarray'>
(8000,)
(2000,)


In [17]:
# test the process_tweet function with a sample tweet

import random

value = random.randint(0,len(train_x))

sample = train_x[value]
clean_text = process_tweet(sample)

print(sample)
print(clean_text)

My cats have forgotten who I am and they hate me :(
['cat', 'forgotten', 'hate', ':(']


In [41]:
# Test the count_freq your function

tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_freqs(tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

Training model with Naive Bayes

Steps: 

1. Calculate V: number of unique words that appear in the freqs dictionary
2. Calc N(pos), N(neg): total number of positive and negative words
3. Calc D,D(pos),D(neg): no of tweets, #positive tweets, #negative tweets
4. Calculate Logprior: log(D(pos)) -log(D(neg))
5. Calc freq(pos),freq(neg): positive and negative frequency of each word
6. Calc P(Wpos),P(Wneg): positive and negative probability of each word
7. Calculate log likeliood: log(P(Wpos)/P(Wneg))

P(Wpos) = (freq(pos) + 1)/( N(pos) + V)

P(Wneg) = (freq(neg) + 1)/( N(neg) + V)



Fancy way of calculating D_pos and D_neg


1.   D_pos = len(list(filter(lambda x: x==1, train_y)))
2.   D_neg = len(list(filter(lambda x: x==0, train_y)))


In [19]:
# build the frequency dictionary

freqs = count_freqs(train_x, train_y)
print(freqs)

{('followfriday', 1.0): 23, ('top', 1.0): 30, ('engag', 1.0): 7, ('member', 1.0): 14, ('commun', 1.0): 27, ('week', 1.0): 72, (':)', 1.0): 2960, ('hey', 1.0): 60, ('jame', 1.0): 7, ('odd', 1.0): 2, (':/', 1.0): 5, ('pleas', 1.0): 81, ('call', 1.0): 27, ('contact', 1.0): 4, ('centr', 1.0): 1, ('02392441234', 1.0): 1, ('abl', 1.0): 6, ('assist', 1.0): 1, ('mani', 1.0): 28, ('thank', 1.0): 522, ('listen', 1.0): 15, ('last', 1.0): 39, ('night', 1.0): 55, ('bleed', 1.0): 2, ('amaz', 1.0): 41, ('track', 1.0): 5, ('scotland', 1.0): 2, ('congrat', 1.0): 15, ('yeaaah', 1.0): 1, ('yipppi', 1.0): 1, ('accnt', 1.0): 2, ('verifi', 1.0): 2, ('rqst', 1.0): 1, ('succeed', 1.0): 1, ('got', 1.0): 57, ('blue', 1.0): 8, ('tick', 1.0): 1, ('mark', 1.0): 1, ('fb', 1.0): 4, ('profil', 1.0): 2, ('15', 1.0): 4, ('day', 1.0): 187, ('one', 1.0): 92, ('irresist', 1.0): 2, ('flipkartfashionfriday', 1.0): 16, ('like', 1.0): 187, ('keep', 1.0): 55, ('love', 1.0): 336, ('custom', 1.0): 4, ('wait', 1.0): 55, ('long', 

In [22]:
def train_naive_bayes(freqs, train_x, train_y):

  """ 
  Input: 
  freqs: dictionary containing (word,label): frequency of the word {output of count_freq function}
  train_x = list of tweets
  train_y = list of labels (0,1)

  Output: log prior and log likelihood
  """

  # Step 1: Calculate unique words
  vocab = set([pair[0] for pair in freqs.keys()])
  V = len(vocab)

  # Step 2: Calculate total number of positive and negative words
  n_pos = 0
  n_neg = 0

  for pair in freqs.keys():
    if pair[1] == 1:
      n_pos += freqs[pair]
    else:
      n_neg += freqs[pair]

  # Step 3: Calculate total number of tweets, total number of positive tweets, totla number of negative tweeets
  D = len(train_y)
  D_pos = sum(train_y)
  D_neg = D - D_pos

  # Step 4: Calculate log prior
  logprior = np.log(D_pos) - np.log(D_neg)

  loglikelihood = {}

  # Step 5: Calculate positive and negative frequency of each word
  for word in vocab:
    f_pos = lookup(freqs,word,1)
    f_neg = lookup(freqs,word,0)

    p_w_pos = (f_pos + 1)/(n_pos + V)
    p_w_neg = (f_neg + 1)/(n_neg + V)

    loglikelihood[word] = np.log(p_w_pos/p_w_neg)

  return logprior, loglikelihood

In [23]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)

In [24]:
print(logprior)
print(len(loglikelihood))
print(loglikelihood)

0.0
9161
{'pledg': -0.70410469329048, 'x43': 0.6821896678294106, 'elbow': 0.3945075953776297, 'brook': -0.70410469329048, 'closer': -0.70410469329048, 'felton': -0.70410469329048, '⚡': 0.6821896678294106, 'fbc': -1.1095698013986444, 'desir': 0.6821896678294106, 'fuuuck': -1.1095698013986444, 'queenesth': 0.6821896678294106, 'evelineconrad': -0.70410469329048, 'pleas': -1.1014064907594834, 'suck': -0.8582553731177381, 'araw': -0.70410469329048, 'leftov': -1.1095698013986444, 'j': 0.3945075953776297, 'schoolsoutforsumm': 0.6821896678294106, 'marula': 0.6821896678294106, 'steven': -0.70410469329048, 'satyajit': 0.6821896678294106, 'teenchoic': 0.27672455972124627, 'wtfff': -0.70410469329048, 'repeatedli': -0.70410469329048, 'counti': -1.1095698013986444, 'retweet': 1.860844664171057, 'burn': -0.5217831364965253, 'viabl': 0.6821896678294106, "deosn't": -0.70410469329048, '333505': 0.6821896678294106, '7/29': 0.6821896678294106, 'keep': 0.5486582752048881, 'brotheeerrr': 0.6821896678294106,

In [25]:
# predict function

def predict_tweet(tweet,logprior,loglikelihood):
  word_list = process_tweet(tweet)

  p = 0
  p += logprior 

  for word in word_list:
    if word in loglikelihood:
      p += loglikelihood[word]
  
  return p

In [29]:
# Check if the function works
my_tweet = 'She smiled.'
p = predict_tweet(my_tweet, logprior, loglikelihood)
print(p)

1.5576584051833107


In [36]:
# build the test function

def test_naive_bayes(test_x, test_y, logprior, loglikelihood):

  y_hat = []

  for tweet in test_x:
    p = predict_tweet(tweet,logprior,loglikelihood)

    if p>0:
      y_hat.append(1)
    else:
      y_hat.append(0)

  error = np.mean(np.absolute(np.array(y_hat)-np.array(test_y)))
  accuracy = 1-error

  return accuracy

In [37]:
Accuracy = test_naive_bayes(test_x, test_y, logprior, loglikelihood)
print(Accuracy)

0.9955


So, our naive bayes model has a test accuracy of 99.55%

In [40]:
# For a new tweet

my_tweet = 'I am sad because I am learning :('

p = predict_tweet(my_tweet, logprior, loglikelihood)
print(p)

if p > 0:
  print("It's a positive tweet")
else:
  print("It's a negative tweet")

-9.813051100785017
It's a negative tweet
