## Twitter - Text Classification

#### Question - I

Create a step-by-step tutorial to build a text classifier using Naïve Bayes algorithm in a Jupyter
Notebook. Define each step and evaluate the model

In [36]:
import nltk
from nltk.corpus import twitter_samples
import numpy as np
import sklearn

#### Download the dataset

In [12]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\rashm\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [17]:
print(twitter_samples)

<TwitterCorpusReader in 'C:\\Users\\rashm\\AppData\\Roaming\\nltk_data\\corpora\\twitter_samples'>


In [7]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [13]:
print('Number of positive tweets: ', len(all_positive_tweets))
print('Number of negative tweets: ', len(all_negative_tweets))

print('\nThe type of all_positive_tweets is: ', type(all_positive_tweets))
print('The type of a tweet entry is: ', type(all_negative_tweets[0]))

Number of positive tweets:  5000
Number of negative tweets:  5000

The type of all_positive_tweets is:  <class 'list'>
The type of a tweet entry is:  <class 'str'>


In [14]:
print("Positive Tweet Example:")
print(all_positive_tweets[0])

print("\nNegative Tweet Example:")
print(all_negative_tweets[0])

Positive Tweet Example:
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Negative Tweet Example:
hopeless for tmr :(


### Preprocess tweets

In [16]:
import re  # regular expression operations
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

### Remove hyperlinks, Twitter marks and styles

In [18]:
def remove_hyperlinks_marks_styles(tweet):
    
    # remove old style retweet text "RT"
    new_tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    new_tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', new_tweet)

    # remove hashtags, only the sign
    new_tweet = re.sub(r'#', '', new_tweet)
    
    return new_tweet

### Tokenize the string

In [19]:
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)

def tokenize_tweet(tweet):
    
    tweet_tokens = tokenizer.tokenize(tweet)
    
    return tweet_tokens

### Remove stop words


In [20]:
nltk.download('stopwords')

#Import the english stop words list from NLTK
stopwords_english = stopwords.words('english')

punctuations = string.punctuation

def remove_stopwords_punctuations(tweet_tokens):
    
    tweets_clean = []
    
    for word in tweet_tokens:
        if (word not in stopwords_english and word not in punctuations):
            tweets_clean.append(word)
            
    return tweets_clean

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rashm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Stemming

In [21]:
stemmer = PorterStemmer()

def get_stem(tweets_clean):
    
    tweets_stem = []
    
    for word in tweets_clean:
        stem_word = stemmer.stem(word)
        tweets_stem.append(stem_word)
        
    return tweets_stem

In [23]:
# example to understand
tweet_example = all_positive_tweets[2278]
print(tweet_example)

processed_tweet = remove_hyperlinks_marks_styles(tweet_example)
print("\nRemoved hyperlinks, Twitter marks and styles:")
print(processed_tweet)

tweet_tokens = tokenize_tweet(processed_tweet)
print("\nTokenize the string:")
print(tweet_tokens)

tweets_clean = remove_stopwords_punctuations(tweet_tokens)
print("\nRemove stop words and punctuations:")
print(tweets_clean)

tweets_stem = get_stem(tweets_clean)
print("\nGet stem of each word:")
print(tweets_stem)

Hello :) Get Youth Job Opportunities follow &gt;&gt; @tolajobjobs @AlettaPetros

Removed hyperlinks, Twitter marks and styles:
Hello :) Get Youth Job Opportunities follow &gt;&gt; @tolajobjobs @AlettaPetros

Tokenize the string:
['hello', ':)', 'get', 'youth', 'job', 'opportunities', 'follow', '>', '>']

Remove stop words and punctuations:
['hello', ':)', 'get', 'youth', 'job', 'opportunities', 'follow']

Get stem of each word:
['hello', ':)', 'get', 'youth', 'job', 'opportun', 'follow']


### Combine all preprocess techniques

In [24]:
def process_tweet(tweet):
    
    processed_tweet = remove_hyperlinks_marks_styles(tweet)
    tweet_tokens = tokenize_tweet(processed_tweet)
    tweets_clean = remove_stopwords_punctuations(tweet_tokens)
    tweets_stem = get_stem(tweets_clean)
    
    return tweets_stem

In [25]:
tweet_example = all_negative_tweets[1000]
print(tweet_example)

processed_tweet = process_tweet(tweet_example)
print(processed_tweet)

@seanactual You mean you're not offering? :(
['mean', 'offer', ':(']


### split the data

In [38]:

test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

### Freq table

In [39]:
def create_frequency(tweets, ys):
    
    freq_d = {}

   
    for tweet, y in zip(tweets, ys):
        for word in process_tweet(tweet):
            pair=(word,y)
            if pair in freq_d:
                freq_d[pair]+=1
            else:
                freq_d[pair]=freq_d.get(pair, 1)
    
    return freq_d

In [40]:
# testing function

tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]

freq_d = create_frequency(tweets, ys)
print(freq_d)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}


In [41]:
freqs = create_frequency(train_x, train_y)

In [56]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels correponding to the tweets (0,1)
    Output:
        logprior: the log prior. (equation 3 above)
        loglikelihood: the log likelihood of you Naive bayes equation. (equation 6 above)
    '''
    
    loglikelihood = {}
    logprior = 0
    
    # calculate the number of unique words in vocab
    unique_words = set([pair[0] for pair in freqs.keys()])
    V = len(unique_words)
    
    # calculate N_pos and N_neg
    N_pos = N_neg = 0
    for pair in freqs.keys():
        
        
        
        
        # TODO: get N_pos and N_get
        
        if pair[1]>0:
            N_pos+=freqs[(pair)]
            
        else:
            N_neg+=freqs[(pair)]
    # TODO: calculate the number of documents (tweets)
    D = train_y.shape[0]
    
    # TODO: calculate D_pos, the number of positive documents (tweets)
    D_pos = sum(train_y)
    
    # TODO: calculate D_neg, the number of negative documents (tweets)
    D_neg =  D- sum(train_y)
    
    # TODO: calculate logprior
    logprior = np.log(D_pos)-np.log(D_neg)
    
    
    # for each unqiue word
    for word in unique_words:
        
        # get the positive and negative frequency of the word
        freq_pos = freqs.get((word,1),0)
        freq_neg = freqs.get((word,0),0)
        
        # calculate the probability that word is positive, and negative
        p_w_pos = (freq_pos+1)/(N_pos+V)
        p_w_neg = (freq_neg+1)/(N_neg+V)
        
        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos/p_w_neg)
        
    return logprior, loglikelihood


In [57]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9083


In [58]:
# UNQ_C4 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the logliklihoods of each word in the tweet (if found in the dictionary) + logprior (a number)

    '''

    # TODO: process the tweet to get a list of words
    word_l = process_tweet(tweet)

    # TODO: initialize probability to zero
    p = 0

    # TODO: add the logprior
    p += logprior

    for word in word_l:

        # TODO: get log likelihood of each keyword
        if word in loglikelihood:
            p+=loglikelihood[word]

    return p

In [62]:
# Run this cell to test your function
for tweet in ['I am happy','You are bad and dangerous' ,'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great', 'bad bad bad bad']:
    # print( '%s -> %f' % (tweet, naive_bayes_predict(tweet, logprior, loglikelihood)))
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
#     print(f'{tweet} -> {p:.2f} ({p_category})')
    print(f'{tweet} -> {p:.2f}')

I am happy -> 2.15
You are bad and dangerous -> -0.60
I am bad -> -1.29
this movie should have been great. -> 2.14
great -> 2.14
great great -> 4.28
great great great -> 6.41
great great great great -> 8.55
bad bad bad bad -> -5.18


# Higher the value , more negative the word is

### This way, we have build a naive bayes classifier from scratch

#### Question - II 

Define which text preprocessing and text transformation steps did you use for the
above. 

In [None]:
Text Preprocessing

* Removed Hyperlinks
* Removed Hastags
* Removed stop words
* Removed unnecssary symbols


Text transformation

* Tokenizing
* Stemming