In [39]:
import nltk
import re                                  # library for regular expression operations
import string                              # for string operations
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
nltk.download('twitter_samples', quiet=True)
nltk.download('stopwords', quiet=True)
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [39]:
def remove_hyperlinks_marks_styles(tweet):
    new_tweet = re.sub(r'^RT[\s]+', '', tweet)
    new_tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', new_tweet)
    new_tweet = re.sub(r'#', '', new_tweet)
    return new_tweet
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)

In [39]:
def tokenize_tweet(tweet):
    tweet_tokens = tokenizer.tokenize(tweet)
    return tweet_tokens
stopwords_english = stopwords.words('english')
punctuations = string.punctuation

In [39]:
def remove_stopwords_punctuations(tweet_tokens):
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and word not in punctuations):
            tweets_clean.append(word)
    return tweets_clean
stemmer = PorterStemmer()

def get_stem(tweets_clean):
    tweets_stem = []
    for word in tweets_clean:
        stem_word = stemmer.stem(word)
        tweets_stem.append(stem_word)
    return tweets_stem

def process_tweet(tweet):
    processed_tweet = remove_hyperlinks_marks_styles(tweet)
    tweet_tokens = tokenize_tweet(processed_tweet)
    tweets_clean = remove_stopwords_punctuations(tweet_tokens)
    tweets_stem = get_stem(tweets_clean)
    return tweets_stem

In [39]:
print('Training NaiveBayes Classification Model....................')
train_pos = all_positive_tweets
train_neg = all_negative_tweets
train_x = train_pos + train_neg
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
def create_frequency(tweets, ys):
    freq_d = {}
    for tweet, y in zip(tweets, ys): 
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freq_d:
                freq_d[pair] += 1 
            else: 
                freq_d[pair] = freq_d.get(pair, 1)
    return freq_d
freqs = create_frequency(train_x, train_y)

In [39]:
def train_naive_bayes(freqs, train_x, train_y):
    loglikelihood = {}
    logprior = 0
    unique_words = set([pair[0] for pair in freqs.keys()])
    V = len(unique_words)
    N_pos = N_neg = 0
    for pair in freqs.keys():
        if pair[1]>0: 
            N_pos += freqs[(pair)] 
        else: 
            N_neg += freqs[(pair)]
            
    D = train_y.shape[0]
    D_pos = sum(train_y)
    D_neg = D - sum(train_y)
    logprior = np.log(D_pos) - np.log(D_neg)
    for word in unique_words:
        freq_pos = freqs.get((word, 1),0)
        freq_neg = freqs.get((word, 0),0)
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg +V)
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)
    return logprior, loglikelihood
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y) 

In [39]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    word_l = process_tweet(tweet)
    p = 0
    p += logprior
    for word in word_l:
        if word in loglikelihood: 
            p += loglikelihood[word]
    return p
print('Training of NaiveBayes Classification Model Completed !!!!')
print('Finding Sentiment of the given Tweets..........')
print()
df = pd.read_csv('Tweets.csv') 
df = df.dropna()
sentiment = []
for index, tweet in df.iterrows():
    # print( '%s -> %f' % (tweet, naive_bayes_predict(tweet, logprior, loglikelihood)))
    p = naive_bayes_predict(tweet['text'], logprior, loglikelihood)
    sentiment.append(round(p,1))
df['sentiment'] = sentiment
df.where(df['sentiment'] <0).dropna()
df.to_csv('NaiveResult.csv')
positiveTweets = df.where(df['sentiment'] > 0)
negativeTweets = df.where(df['sentiment'] < 0.0)
neutralTweets = df.where(df['sentiment'] == 0.0)
positiveTweets = positiveTweets.dropna()
negativeTweets = negativeTweets.dropna() 
neutralTweets = neutralTweets.dropna() 
print('---------------------Result-------------------------')
print(f"Positive tweets percentage: {(100*len(positiveTweets)/len(df)):.2f} %")
print(f"Negative tweets percentage: {(100*len(negativeTweets)/len(df)):.2f} %")
print(f"Neutral tweets percentage: {(100*len(neutralTweets)/len(df)):.2f} %")

Training NaiveBayes Classification Model....................
Training of NaiveBayes Classification Model Completed !!!!
Finding Sentiment of the given Tweets..........

---------------------Result-------------------------
Positive tweets percentage: 63.46 %
Negative tweets percentage: 33.69 %
Neutral tweets percentage: 2.84 %
