In [1]:
import nltk

from nltk.corpus import stopwords, wordnet
from nltk import sent_tokenize, word_tokenize

import pickle

In [2]:
stop_words = set(stopwords.words('english'))

#### Reading in reviews in text file.

In [3]:
negative_reviews = open('negative.txt') #opening negative reviews
positive_reviews = open('positive.txt') #opening positive reviews

#### Splitting text file into lines where each line represents a Tweet.

In [4]:
negative_reviews_lines = [negative_review.rstrip('\n') for negative_review in negative_reviews]
positive_reviews_lines = [positive_review.rstrip('\n') for positive_review in positive_reviews]

#### Splitting each line into words through tokenization.

In [5]:
negative_reviews_words = [word_tokenize(str(negative_review_words)) for negative_review_words in negative_reviews_lines]
positive_reviews_words = [word_tokenize(str(positive_review_words)) for positive_review_words in positive_reviews_lines]

#### Note: It is interesting to note that str() function creates a list within a list. This is desirable because it helps to keep the words broken down per tweet.

In [7]:
broken_negative_reviews_words = []

for each_broken_tweet in negative_reviews_words:
    
    for each_character_in_each_tweet in each_broken_tweet:
        
        broken_negative_reviews_words.append(each_character_in_each_tweet)
        
broken_positive_reviews_words = []

for each_broken_tweet in positive_reviews_words:
    
    for each_character_in_each_tweet in each_broken_tweet:
        
        broken_positive_reviews_words.append(each_character_in_each_tweet)

#### Removing stopwords from the whole dataset.

In [8]:
cleaned_negative_reviews_words = []
cleaned_positive_reviews_words = []

for negative_words in broken_negative_reviews_words:
    
    if negative_words not in stop_words:
        
        cleaned_negative_reviews_words.append(negative_words)
        
for positive_words in broken_positive_reviews_words:
    
    if positive_words not in stop_words:
        
        cleaned_positive_reviews_words.append(positive_words)

#### Even after cleaning up the dataset with stopwords, we still have words like names of vehicles, people etc. which might be redundant for us. Therefore, we will use part-of-speech tagging.

In [10]:
tagged_negative = []
tagged_positive = []

tagged_negative.append(nltk.pos_tag(cleaned_negative_reviews_words))
tagged_positive.append(nltk.pos_tag(cleaned_positive_reviews_words))

#### Finding the positive and negative adjectives in the datasets.

In [17]:
negative_adjectives = []
positive_adjectives = []

for i in range(len(tagged_negative[0])):
    
    if tagged_negative[0][i][1] == 'JJ':
        
        try:
            negative_adjectives.append(tagged_negative[0][i][0])
            
        except Exception as e:
            print(str(e))
            
for i in range(len(tagged_positive[0])):
    
    if tagged_positive[0][i][1] == 'JJ':
        
        try:
            positive_adjectives.append(tagged_positive[0][i][0])
            
        except Exception as e:
            print(str(e))

#### Removing duplicate words that are found in both positive and negative adjective lists.

In [19]:
for adjective in negative_adjectives:
    
    if adjective in positive_adjectives:
        
        negative_adjectives.remove(adjective)
        positive_adjectives.remove(adjective)

#### Finding synonyms of adjectives.

In [23]:
negative_synonyms = []
positive_synonyms = []

for adjectives in negative_adjectives:
    
    for synonyms_set in wordnet.synsets(adjectives):
        
        for synonyms in synonyms_set.lemmas():
            
            negative_synonyms.append(synonyms.name())
            
for adjectives in positive_adjectives:
    
    for synonyms_set in wordnet.synsets(adjectives):
        
        for synonyms in synonyms_set.lemmas():
            
            positive_synonyms.append(synonyms.name())

#### Removing any duplicate synonyms.

In [27]:
negative_synonyms = list(set(negative_synonyms)) #set is an unordered collection with no duplicate elements
positive_synonyms = list(set(positive_synonyms))

#### It is possible to have duplicate synonyms because we are searching through multiple sets of synonyms which might already have the same synonyms in different sets.

#### Combining Adjectives and Synonyms together.

In [28]:
for adjectives in negative_adjectives:
    
    negative_synonyms.append(adjectives)
    
for adjectives in positive_adjectives:
    
    positive_synonyms.append(adjectives)

#### Performing Frequency Distribution

In [29]:
negative_FreqDist = dict(nltk.FreqDist(negative_synonyms))
positive_FreqDist = dict(nltk.FreqDist(positive_synonyms))

#### We are able to put it into a dictionary form where the key is the adjectives and the value is the number of occurrences.

In [30]:
nltk.FreqDist(negative_synonyms)

FreqDist({'bad': 181, 'much': 84, 'good': 61, 'little': 58, 'dull': 43, 'many': 38, 'flat': 37, 'new': 36, 'pretentious': 32, 'funny': 30, ...})

In [31]:
negative_FreqDist

{'Labor': 1,
 'skeletal': 1,
 'shoddy': 5,
 'debase': 1,
 'break_up': 1,
 'encomium': 1,
 'mellowed': 1,
 'elucidate': 1,
 'drunken': 2,
 'give_it_the_deep_six': 1,
 'undeveloped': 3,
 'brother': 1,
 'envelope': 1,
 'gumption': 1,
 'heterosexual': 1,
 'thingmabob': 1,
 'cypher': 1,
 'unexpended': 1,
 'painkiller': 1,
 'ambiance': 1,
 'unmindful': 1,
 'blow_up': 1,
 'Boche': 1,
 'silent': 2,
 'needed': 1,
 'circle': 1,
 'shadow': 1,
 'draw_in': 1,
 'progress': 1,
 'showing': 1,
 'obligatory': 5,
 'enrol': 1,
 'blind': 2,
 'fictitious_character': 1,
 'movement': 1,
 'invidia': 1,
 'crystallise': 1,
 'prisoner_of_war': 1,
 'unceasing': 2,
 'hackneyed': 2,
 'throw_off': 1,
 'can': 1,
 'geared': 2,
 'trench': 1,
 'Holy_Joe': 1,
 'tenor': 1,
 'open_up': 1,
 'call_off': 1,
 'stocked_with': 1,
 'drop_back': 1,
 'one-time': 1,
 'darkness': 1,
 'required': 2,
 'mainstream': 3,
 'unkindly': 1,
 'commutable': 1,
 'mercurial': 1,
 'crass': 3,
 'specialise': 1,
 'unending': 2,
 'out_of_the_question'

#### Searching for same words in both dictionary and removing while adjusting values

In [32]:
negative_dictionary = {}
positive_dictionary = {}

count = 0

for neg_word_key, neg_word_count_value in negative_FreqDist.items():
    
    for pos_word_key, pos_word_count_value in positive_FreqDist.items():
        
        if neg_word_key == pos_word_key:
            
            count += 1
            
            if (neg_word_count_value > pos_word_count_value):
                
                neg_word_count_value = neg_word_count_value - pos_word_count_value
                
                pos_word_count_value = 0
                
                negative_dictionary.update({neg_word_key : neg_word_count_value})
                
            elif (pos_word_count_value > neg_word_count_value):
                
                pos_word_count_value = pos_word_count_value - neg_word_count_value
                
                neg_word_count_value = 0
                
                positive_dictionary.update({pos_word_key : pos_word_count_value})