# Sentiment analysis : word features for frequency (tf-idf)

### import required modules

In [7]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords as sw
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

### define some variables

In [9]:
data = load_data_set('data.csv')
stopwords = sw.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
frequency_of_words = dict()
frequency_per_sentiment = dict()
sentiments = set([datum[0] for datum in data])
for sentiment in sentiments:
    frequency_per_sentiment[sentiment] = dict()

### function to process a word

In [10]:
def process_word(word):
    word = lemmatizer.lemmatize(word)
    word = stemmer.stem(word)
    return word

### function to process a sentence

In [11]:
def get_significant_words(sentence):
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    sentence = sentence.lower()
    words = word_tokenize(sentence)
    words = list(filter(lambda word: word not in stopwords, words))
    words = list(map(process_word, words))
    return words

### function to update the bag of words

In [12]:
def update_words_list(sentiment, words):
    for word in words:
        if word not in frequency_of_words.keys():
            frequency_of_words[word] = 1
        else:
            frequency_of_words[word] += 1

        if word not in frequency_per_sentiment[sentiment].keys():
            frequency_per_sentiment[sentiment][word] = 1
        else:
            frequency_per_sentiment[sentiment][word] += 1

In [14]:
for datum in data[:10]:
    features = get_significant_words(datum[1])
    update_words_list(datum[0], features)
print(frequency_of_words)
print(frequency_per_sentiment)

{'day': 2, 'feel': 10, 'close': 4, 'partner': 4, 'friend': 2, 'peac': 2, 'also': 2, 'experi': 2, 'contact': 4, 'peopl': 4, 'regard': 2, 'greatli': 2, 'everi': 2, 'time': 8, 'imagin': 2, 'someon': 2, 'love': 2, 'could': 2, 'seriou': 2, 'ill': 2, 'even': 2, 'death': 4, 'obvious': 2, 'unjustli': 2, 'treat': 2, 'possibl': 2, 'elucid': 2, 'think': 4, 'short': 4, 'live': 2, 'relat': 2, 'period': 2, 'life': 2, 'use': 2, 'gather': 2, 'found': 2, 'involuntarili': 2, 'sit': 2, 'next': 2, 'two': 2, 'express': 2, 'opinion': 2, 'consid': 4, 'low': 2, 'discrimin': 2, 'realiz': 6, 'direct': 2, 'discont': 2, 'way': 2, 'tri': 2, 'put': 2, 'blame': 2, 'instead': 2, 'sort': 2, 'feeli': 2, 'guilti': 2, 'materi': 2, 'thing': 2, 'import': 2, 'care': 2, 'rel': 2, 'selfcent': 2, 'girlfriend': 2, 'taken': 2, 'exam': 2, 'went': 2, 'parent': 2, 'place': 2, 'first': 2, 'mean': 2, 'car': 2, 'overtak': 2, 'anoth': 2, 'forc': 2, 'drive': 2, 'road': 2}
{'disgust': {'gather': 2, 'found': 2, 'involuntarili': 2, 'sit': 