In [11]:
# Uncomment the line below to install the dependencies
# %pip install numpy pandas scikit-learn nltk

In [None]:
# Uncomment the lines below to download the datasets
# import nltk
# nltk.download('stopwords')
# nltk.download('twitter_samples')

In [17]:
import string, re
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer

In this post, we'll implement a Naive Bayes classifier on the Twitter sentiment dataset that ships with the `nltk` package.

I will follow [this](https://virksaab.github.io/2020/08/20/worktree_mlcore_naivebayes_intro.html) explanation to implement Naive Bayes. To train a Naive Bayes classifier, **the first step is to get the training and test datasets**. Let's import and load the dataset first.

In [2]:
# Pull the raw tweet strings for each sentiment class out of the NLTK corpus.
positive_tweets, negative_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ("positive_tweets.json", "negative_tweets.json")
)
print(f"# tweets: positive: {len(positive_tweets)}, negative: {len(negative_tweets)}")

# tweets: positive: 5000, negative: 5000


Split the dataset into training and test sets.

In [10]:
# Number of tweets taken from EACH class for training; the remainder of each
# class goes to the test set.
split = 4000

pos_train, pos_test = positive_tweets[:split], positive_tweets[split:]
neg_train, neg_test = negative_tweets[:split], negative_tweets[split:]

# Positive tweets come first, then negative ones, in both splits.
X_train = pos_train + neg_train
X_test = pos_test + neg_test

# Along with the data, we need target values as well.
# We will assume *positive tweets = 1* and *negative tweets = 0*.
# The labels are sized from the actual per-class slices (not len(X) // 2), so
# they stay aligned with the data even if the two classes differ in size.
y_train = [1] * len(pos_train) + [0] * len(neg_train)
y_test = [1] * len(pos_test) + [0] * len(neg_test)

# Every sample must have exactly one label.
assert len(X_train) == len(y_train), "training data and labels are misaligned"
assert len(X_test) == len(y_test), "test data and labels are misaligned"

print(f"# Training: {len(X_train)}, {len(y_train)}")
print(f"# Testing:  {len(X_test)}, {len(y_test)}")

# Training: 8000, 8000
# Testing:  2000, 2000


**Step 2: Preprocess the tweets**

After splitting the data, let's clean it by removing noise, punctuation, stopwords, and the `#` sign of hashtags (the hashtag word itself is kept).
Along with cleaning, we will tokenize each tweet as well. Splitting a sentence into smaller units (here, word-like substrings) is called `tokenization`.

In [22]:
# Resources shared by every preprocess() call; filled lazily on first use so
# that defining the function does not require the NLTK data to be present.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return (stopword set, punctuation set, tokenizer), building them once."""
    if not _PREPROCESS_RESOURCES:
        # Load the stopwords first: if the corpus is missing this raises and
        # the cache stays empty, so a later call retries cleanly.
        _PREPROCESS_RESOURCES["stopwords"] = frozenset(stopwords.words('english'))
        # A set of single characters: `token in string.punctuation` on the raw
        # str would be a *substring* test and also match runs such as '<=>'.
        _PREPROCESS_RESOURCES["punctuation"] = frozenset(string.punctuation)
        _PREPROCESS_RESOURCES["tokenizer"] = TweetTokenizer(
            preserve_case=False, strip_handles=True, reduce_len=True)
    return (_PREPROCESS_RESOURCES["stopwords"],
            _PREPROCESS_RESOURCES["punctuation"],
            _PREPROCESS_RESOURCES["tokenizer"])


def preprocess(tweet):
    """Clean a raw tweet and split it into a list of tokens.

    Steps, in order:
      1. strip ``$``-prefixed stock tickers (e.g. ``$GE``),
      2. strip a leading old-style retweet marker ``RT``,
      3. strip hyperlinks (only the URL itself; text after it is kept),
      4. drop the ``#`` sign of hashtags while keeping the word,
      5. tokenize with ``TweetTokenizer`` (configured with
         ``preserve_case=False``, ``strip_handles=True``, ``reduce_len=True``),
      6. drop English stopwords and single punctuation characters.

    Multi-character tokens such as the emoticon ``:)`` are kept.

    Args:
        tweet: the raw tweet text (``str``).

    Returns:
        A list of cleaned token strings, in their original order.
    """
    stopwords_english, punctuation, tokenizer = _preprocess_resources()

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks; \S* stops at the first whitespace so that words
    # following the link on the same line are not thrown away with it
    tweet = re.sub(r'https?://\S*', '', tweet)
    # remove hashtags: only the hash # sign is removed from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize, then filter out stopwords and punctuation
    return [
        word
        for word in tokenizer.tokenize(tweet)
        if word not in stopwords_english and word not in punctuation
    ]

# Sanity-check the pipeline on the first training tweet
example_tokens = preprocess(X_train[0])
print("Example:", example_tokens)

Example: ['followfriday', 'top', 'engaged', 'members', 'community', 'week', ':)']


**Step 3: Compute the table of word frequencies per class, freq(word, class)**

This means counting the number of times each word occurs in each class/label (0 for negative, 1 for positive).
We will build a dictionary where each key is a (word, class) pair and each value is the corresponding count. The training data will be used for this step.

In [25]:
def build_freq_table(processed_tweets, labels):
    """Count how often each word occurs in each class.

    Args:
        processed_tweets: iterable of tokenized tweets, each an iterable of
            word strings.
        labels: iterable of class labels, matched to ``processed_tweets`` by
            position (the notebook uses 0 for negative and 1 for positive).

    Returns:
        A dict mapping ``(word, label)`` to the number of times ``word``
        appears in tweets carrying that ``label``. Empty input gives ``{}``.
    """
    freqs = {}
    # zip pairs each tweet with its label; any unpaired trailing items in the
    # longer of the two inputs are ignored.
    for tokens, label in zip(processed_tweets, labels):
        for word in tokens:
            key = (word, label)
            freqs[key] = freqs.get(key, 0) + 1
    return freqs

In [26]:
# Clean and tokenize every training tweet, keeping the original order
processed_tweets = list(map(preprocess, X_train))

[['followfriday', 'top', 'engaged', 'members', 'community', 'week', ':)'],
 ['hey',
  'james',
  'odd',
  ':/',
  'please',
  'call',
  'contact',
  'centre',
  '02392441234',
  'able',
  'assist',
  ':)',
  'many',
  'thanks'],
 ['listen', 'last', 'night', ':)', 'bleed', 'amazing', 'track', 'scotland'],
 ['congrats', ':)'],
 ['yeaaah',
  'yipppy',
  'accnt',
  'verified',
  'rqst',
  'succeed',
  'got',
  'blue',
  'tick',
  'mark',
  'fb',
  'profile',
  ':)',
  '15',
  'days'],
 ['one', 'irresistible', ':)', 'flipkartfashionfriday'],
 ['like',
  'keep',
  'lovely',
  'customers',
  'waiting',
  'long',
  'hope',
  'enjoy',
  'happy',
  'friday',
  'lwwf',
  ':)'],
 ['second',
  'thought',
  '’',
  'enough',
  'time',
  'dd',
  ':)',
  'new',
  'shorts',
  'entering',
  'system',
  'sheep',
  'must',
  'buying'],
 ['jgh', 'go', 'bayan', ':D', 'bye'],
 ['act',
  'mischievousness',
  'calling',
  'etl',
  'layer',
  'in-house',
  'warehousing',
  'app',
  'katamari',
  'well',
  '…',
 