<a href="https://colab.research.google.com/github/Sensrdt/Sentiment-Analyzer/blob/master/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLTK Install and Downloading the Data

In [0]:
import nltk

In [66]:
# Fetch the NLTK resources this notebook relies on.
nltk.download('twitter_samples')  # tweet corpus read through nltk.corpus.twitter_samples
nltk.download('punkt')  # presumably required by word_tokenize -- TODO confirm
nltk.download('wordnet')  # presumably backs WordNetLemmatizer -- TODO confirm
nltk.download('averaged_perceptron_tagger')  # presumably backs pos_tag -- TODO confirm
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Tokenizing the Data


In [0]:
from nltk.corpus import twitter_samples
# Load the three files of the twitter_samples corpus via .strings()
# (presumably the raw tweet text -- TODO confirm against the NLTK docs).
tp = twitter_samples.strings('positive_tweets.json')
tn = twitter_samples.strings('negative_tweets.json')
ns = twitter_samples.strings('tweets.20150430-223406.json')
# Tokenized form of the positive tweets.
# NOTE(review): tp, tn, ns and break_into_tokens are not referenced again in
# the visible notebook; later cells re-tokenize the corpus themselves.
break_into_tokens = twitter_samples.tokenized('positive_tweets.json')

In [68]:
from nltk.tag import pos_tag
# Tokenized positive tweets; mapping[0] is the token list of the first tweet.
mapping = twitter_samples.tokenized('positive_tweets.json')
# Show the (token, part-of-speech tag) pairs produced for the first tweet.
print(pos_tag(mapping[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


# Normalisation of the Data

In [69]:
from nltk.stem.wordnet import WordNetLemmatizer
def lemmatizing_sentence(tt):
  """Lemmatize every token of an already-tokenized sentence.

  Each token is tagged with ``pos_tag``. Tags starting with ``NN`` are
  lemmatized as nouns ('n'), tags starting with ``VB`` as verbs ('v'),
  and every other tag falls back to adjective ('a').

  Args:
    tt: sequence of string tokens making up one sentence.

  Returns:
    A list with one lemmatized token per input token, in the same order.
  """
  wnl = WordNetLemmatizer()

  def wordnet_pos(tag):
    # Collapse the tagger's tag into the single-letter code passed to lemmatize().
    if tag.startswith('NN'):
      return 'n'
    if tag.startswith('VB'):
      return 'v'
    return 'a'

  return [wnl.lemmatize(word, wordnet_pos(tag)) for word, tag in pos_tag(tt)]

# Demo: lemmatize the tokens of the first positive tweet.
print(lemmatizing_sentence(mapping[0]))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


# Removing/cleaning the noise 


In [0]:
import string, re
def remove_noise(mapping, stop_words = ()):
    """Clean and normalise the tokens of a single tweet.

    For every token the function:
      1. strips http/https URLs and @mentions,
      2. lemmatizes it using the part-of-speech tag computed on the
         *original* token ('n' for NN*, 'v' for VB*, 'a' otherwise),
      3. drops it if it is empty, is found inside ``string.punctuation``
         (a substring test, so a contiguous run of those characters is
         dropped as well as a single character), or its lowercase form
         is a stop word,
      4. lowercases whatever is kept.

    Args:
        mapping: sequence of string tokens for one tweet.
        stop_words: collection of lowercase words to discard. Defaults to
            an empty tuple, i.e. no stop-word filtering.

    Returns:
        A list of cleaned, lowercased tokens in their original order.
    """
    # A list/tuple of stop words would be scanned linearly for every token;
    # a frozenset makes each membership test O(1). Other container types are
    # left untouched so their own `in` semantics are preserved.
    if isinstance(stop_words, (list, tuple)):
        stop_words = frozenset(stop_words)

    # The lemmatizer does not depend on the token, so build it once per call
    # instead of once per token.
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []

    for token, tag in pos_tag(mapping):
        # Raw strings: the pattern contains `\(` and `\)`, which are invalid
        # escape sequences in a normal string literal. The regex itself is
        # unchanged.
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        if len(token) > 0 and token not in string.punctuation and lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

In [71]:
from nltk.corpus import stopwords
# Inspect the English stop-word list that later cells pass to remove_noise.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [72]:
# Sanity check: clean the first positive tweet, filtering English stop words.
print(remove_noise(mapping[0], stopwords.words('english')))

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [0]:
# English stop words plus the tokenized tweets of each sentiment class.
stpw = stopwords.words('english')
tpt = twitter_samples.tokenized('positive_tweets.json')
npt = twitter_samples.tokenized('negative_tweets.json')

# One cleaned token list per tweet; positive and negative are kept separate
# so they can be labelled later.
pctl = [remove_noise(tweet_tokens, stpw) for tweet_tokens in tpt]
nctl = [remove_noise(tweet_tokens, stpw) for tweet_tokens in npt]

# Frequency of words

In [0]:
def get_all_words(cleaned_tokens_list):
    """Lazily flatten a list of token lists into a single stream of tokens.

    Args:
        cleaned_tokens_list: iterable whose items are iterables of tokens
            (one inner iterable per tweet).

    Yields:
        Every token, tweet by tweet, in the original order.
    """
    for tweet_tokens in cleaned_tokens_list:
        yield from tweet_tokens

# Lazy stream of every cleaned positive-tweet token; being a generator, it can
# be consumed only once.
all_pos_words = get_all_words(pctl)

In [75]:
from nltk import FreqDist

# Count how often each cleaned positive token occurs (this consumes the
# all_pos_words generator).
freq_dist_pos = FreqDist(all_pos_words)
# Show the ten most frequent tokens together with their counts.
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


# Preparing the data 

In [0]:
def get_all_words(cleaned_tokens_list):
    """Yield each token from each token list, in order.

    NOTE(review): this re-defines ``get_all_words`` with the same behaviour as
    the definition in the "Frequency of words" section; one of the two
    definitions is redundant.

    Args:
        cleaned_tokens_list: iterable of per-tweet token iterables.

    Yields:
        The individual tokens, flattened across all tweets.
    """
    for tweet_tokens in cleaned_tokens_list:
        yield from tweet_tokens

# Fresh generator over the cleaned positive tokens.
# NOTE(review): all_pos_words is not referenced again in the visible notebook.
all_pos_words = get_all_words(pctl)

In [0]:
def get_tweets_for_model(cleaned_tokens_list):
    """Turn each tweet's tokens into a feature dictionary.

    Args:
        cleaned_tokens_list: iterable of per-tweet token iterables.

    Yields:
        One dict per tweet mapping every distinct token to ``True``.
    """
    for tweet_tokens in cleaned_tokens_list:
        yield {token: True for token in tweet_tokens}

# Lazy streams of per-tweet feature dicts for each class. Both are generators,
# so each can be iterated only once.
positive_tokens_for_model = get_tweets_for_model(pctl)
negative_tokens_for_model = get_tweets_for_model(nctl)

## Splitting data for training and testing

In [0]:
import random

# Fixed seed so the shuffle, and therefore the train/test split and the
# reported accuracy, is the same on every "Restart & Run All".
SPLIT_SEED = 42
# Number of shuffled examples used for training; the rest is held out for testing.
TRAIN_SIZE = 7000

# Attach the class label to every feature dict.
positive_dataset = [(tweet_dict, "Positive")
                    for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                    for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# The two classes are concatenated back to back, so shuffle before slicing.
# A dedicated Random instance keeps the global `random` state untouched.
random.Random(SPLIT_SEED).shuffle(dataset)

train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

# Train the model, then test it on sample data

In [79]:
from nltk import classify
from nltk import NaiveBayesClassifier
# Fit a Naive Bayes classifier on the training split.
classifier = NaiveBayesClassifier.train(train_data)

# Accuracy of the classifier on the held-out split.
print("Accuracy is :", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print() only appended a stray "None" to the output.
classifier.show_most_informative_features(10)

Accuracy is : 0.997
Most Informative Features
                      :( = True           Negati : Positi =   2046.4 : 1.0
                      :) = True           Positi : Negati =   1657.3 : 1.0
                follower = True           Positi : Negati =     35.9 : 1.0
                     x15 = True           Negati : Positi =     19.5 : 1.0
                     bam = True           Positi : Negati =     19.1 : 1.0
                 welcome = True           Positi : Negati =     17.5 : 1.0
                     sad = True           Negati : Positi =     17.3 : 1.0
              appreciate = True           Positi : Negati =     15.1 : 1.0
                    blog = True           Positi : Negati =     15.1 : 1.0
               community = True           Positi : Negati =     13.8 : 1.0
None


In [81]:
from nltk.tokenize import word_tokenize

# Hand-written sample texts for trying out the trained classifier.
# NOTE(review): only ctweet3 is classified below; ctweet, ctweet1 and ctweet2
# are defined but not used in the visible notebook.
ctweet = "Just In: 38 CISF personnel from Kolkata have tested positive. They were all quarantined as they were contacts of one CISF personnel who had died of Covid-19. Total 39 positive cases so far."
ctweet1 = "Many countries are taking drastic measures in a bid to halt the spread of the virus, including social distancing, closing bars, restaurants and schools or, in the case of places like Italy, putting the entire community on lockdown."
ctweet2 = "The Indian Army is considering a proposal to attract young working professionals to join the force for a three-year tenure as officers and in other ranks for a variety of roles. The Tour of Duty (ToD) for three years is being mulled in the wake of resurgence of nationalism and patriotism in the country. "
ctweet3 = "Security forces in Burkina Faso killed 20 terrorists in a clash near the border with Niger, the army said Wednesday. "

# Clean the sample with the same stop-word list (stpw) that was applied to the
# training tweets, so inference-time features are built like the training ones.
# NOTE(review): the training tokens came from twitter_samples.tokenized while
# this text is split with word_tokenize -- confirm the two tokenizers are
# compatible enough for the model's features.
ctoken = remove_noise(word_tokenize(ctweet3), stpw)

# Build the {token: True} feature dict expected by the classifier and print
# the predicted label.
print(classifier.classify(dict([token, True] for token in ctoken)))

Negative
