In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [19]:
dataset = pd.read_csv('sentiment_analysis.csv')

In [20]:
dataset.head(5)

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [21]:
dataset.drop(columns={'id'}, inplace=True)

In [22]:
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
def process_tweet(tweet):
  tweet = re.sub(r'#', '', tweet)
  tweet = re.sub(r'http\S+', '', tweet)
  tweet = re.sub(r'[^a-zA-Z0-9 ]', '', tweet)
  stemmer = PorterStemmer()
  tokenizer = TweetTokenizer()
  clean_tweet = []
  for word in tokenizer.tokenize(tweet):
    clean_tweet.append(stemmer.stem(word))
  return clean_tweet

In [23]:
def freq(tweets, labels):
  labels = np.squeeze(labels).tolist()
  freqs = {}
  for label, tweet in zip(labels, tweets):
    for word in process_tweet(tweet):
      pair = (word, label)
      if pair in freqs:
        freqs[pair] += 1
      else:
        freqs[pair] = 1

  return freqs

In [24]:
freqs = freq(dataset.tweet, dataset.label)

In [25]:
print(freqs)

{('fingerprint', 0): 4, ('pregnanc', 0): 1, ('test', 0): 12, ('android', 0): 364, ('app', 0): 323, ('beauti', 0): 331, ('cute', 0): 328, ('health', 0): 118, ('iger', 0): 150, ('iphoneonli', 0): 133, ('iphonesia', 0): 138, ('iphon', 0): 3589, ('final', 0): 174, ('a', 0): 755, ('transpar', 0): 5, ('silicon', 0): 4, ('case', 0): 344, ('thank', 0): 190, ('to', 0): 880, ('my', 0): 1246, ('uncl', 0): 4, ('yay', 0): 66, ('soni', 0): 770, ('xperia', 0): 58, ('s', 0): 39, ('sonyexperia', 0): 2, ('we', 0): 159, ('love', 0): 460, ('thi', 0): 462, ('would', 0): 95, ('you', 0): 568, ('go', 0): 104, ('talk', 0): 21, ('makememori', 0): 1, ('unplug', 0): 2, ('relax', 0): 41, ('smartphon', 0): 128, ('wifi', 0): 37, ('connect', 0): 9, ('im', 0): 183, ('wire', 0): 5, ('i', 0): 1063, ('know', 0): 61, ('georg', 0): 1, ('wa', 0): 70, ('made', 0): 46, ('that', 0): 178, ('way', 0): 36, ('daventri', 0): 1, ('home', 0): 137, ('what', 1): 79, ('amaz', 1): 7, ('servic', 1): 25, ('appl', 1): 1307, ('wont', 1): 71,

In [29]:
def logprior(y):
  D_pos = (len(list(filter(lambda x: x > 0, y))))
  D_neg = (len(list(filter(lambda x: x <= 0, y))))
  logprior = np.log(D_pos) - np.log(D_neg)

  return logprior


In [31]:
log_prior = logprior(dataset.label.astype(int))

In [32]:
print(log_prior)

-1.0678714784890113


In [44]:
def lookup(freq, word, label):
  n = 0  # freqs.get((word, label), 0)
  pair = (word, label)
  if (pair in freq):
      n = freq[pair]
  return n

In [54]:
def loglikelihood(freq):
  vocab = set()
  for pair in freq.keys():
    vocab.add(pair[0])
  v = len(vocab)
  N_pos= N_neg = 0
  for pair in freq.keys():
    if pair[1] == '1':
      N_pos += freq[pair]
    else:
      N_neg += freq[pair]
  loglikelihood ={}
  for word in vocab:
    count_pos = lookup(freq, word, '1')
    count_neg = lookup(freq, word, '0')

    prob_pos = (count_pos + 1) / (N_pos + v)
    prob_neg = (count_neg + 1) / (N_neg + v)

    loglikelihood[word] = np.log(prob_pos/prob_neg)

  return loglikelihood

In [50]:
log_likelihood = loglikelihood(freqs)

In [51]:
print(log_likelihood)

{'summerinstagramhub': 2.1557372167602282, '2203': 2.1557372167602282, 'ive': 2.1557372167602282, 'wesc': 2.1557372167602282, 'datchet': 2.1557372167602282, '137': 2.1557372167602282, 'digitalinfluencerpictwittercomya': 2.1557372167602282, 'columbia': 2.1557372167602282, 'myeveryth': 2.1557372167602282, 'truck': 2.1557372167602282, 'better': 2.1557372167602282, 'joshbat': 2.1557372167602282, 'momth': 2.1557372167602282, 'beetroot': 2.1557372167602282, 'tooklongenough': 2.1557372167602282, 'airmax': 2.1557372167602282, 'jimgoldman': 2.1557372167602282, 'minnesota': 2.1557372167602282, 'bribe': 2.1557372167602282, 'fame': 2.1557372167602282, 'codww': 2.1557372167602282, '4u5rdxhw': 2.1557372167602282, 'vivir': 2.1557372167602282, '6xsv': 2.1557372167602282, 'notahappycustom': 2.1557372167602282, '5wctqerxmg': 2.1557372167602282, 'member': 2.1557372167602282, 'torn': 2.1557372167602282, 'muahh': 2.1557372167602282, 'ipadpropictwittercomko': 2.1557372167602282, '3d': 2.1557372167602282, '0

In [52]:
def predict(logprior, loglikelihood, tweet):
  p = 0
  p += logprior
  for word in process_tweet(tweet):
    p += loglikelihood[word]
  return p

In [60]:
p = predict(log_prior, log_likelihood, dataset.tweet[2])

In [61]:
dataset.tweet[2]

'We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu'

In [62]:
print(p)

29.11244955615419
