# Sentiment analysis: word-frequency features (tf-idf)

### import required modules

In [18]:
import csv
import string

from nltk.corpus import stopwords as sw
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

### define some variables

In [19]:
# Labelled rows: each datum is indexed as datum[0] = sentiment, datum[1] = text.
data = load_data_set('data.csv')

# Text-normalisation helpers shared by the processing functions.
stopwords = sw.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Global word counts, plus one independent word-count table per sentiment.
frequency_of_words = {}
frequency_per_sentiment = {}
sentiments = {datum[0] for datum in data}
for sentiment in sentiments:
    frequency_per_sentiment[sentiment] = {}

### function to load the dataset from the file

In [20]:
def load_data_set(filename):
    """Load ``(sentiment, content)`` pairs from a CSV file.

    The first row is treated as a header and discarded. In every other
    row the first column is the sentiment label, and all remaining
    columns are joined back together with commas to form the content.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of ``(sentiment, content)`` tuples in file order. The list
        is empty when the file contains no data rows.
    """
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields containing newlines are parsed correctly. The context manager
    # guarantees the file is closed even if parsing raises.
    with open(filename, newline='') as csv_file:
        csv_reader = csv.reader(csv_file)
        # Skip the header row; the default keeps an empty file from raising.
        next(csv_reader, None)
        return [
            (row[0], ','.join(row[1:]))
            for row in csv_reader
            if row  # blank lines come back as [] and carry no label
        ]

### function to process a word

In [21]:
def process_word(word):
    """Return *word* lemmatized and then stemmed.

    Uses the module-level ``lemmatizer`` and ``stemmer`` objects; the
    lemmatizer runs first and its result is passed to the stemmer.
    """
    return stemmer.stem(lemmatizer.lemmatize(word))

### function to process a sentence

In [22]:
def get_significant_words(sentence):
    """Turn a raw sentence into a list of processed, non-stopword tokens.

    Steps, in order: remove every character in ``string.punctuation``,
    lower-case the text, tokenize it with ``word_tokenize``, drop tokens
    found in the module-level ``stopwords`` list, and pass each remaining
    token through ``process_word``.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = sentence.translate(punctuation_remover).lower()
    return [
        process_word(token)
        for token in word_tokenize(cleaned)
        if token not in stopwords
    ]

### function to update the bag of words

In [23]:
def update_words_list(sentiment, words):
    """Add one occurrence of each word to the global frequency tables.

    Every word in *words* increments its count in ``frequency_of_words``
    and in ``frequency_per_sentiment[sentiment]``. A word seen for the
    first time starts at 1. The table for *sentiment* must already exist
    in ``frequency_per_sentiment``; otherwise a ``KeyError`` is raised.
    """
    for token in words:
        frequency_of_words[token] = frequency_of_words.get(token, 0) + 1

        # Looked up per word, so an empty *words* never touches the table.
        per_sentiment = frequency_per_sentiment[sentiment]
        per_sentiment[token] = per_sentiment.get(token, 0) + 1

In [24]:
# Build the frequency tables from the first ten rows only, then show them.
for datum in data[:10]:
    label, text = datum[0], datum[1]
    update_words_list(label, get_significant_words(text))

print(frequency_of_words)
print(frequency_per_sentiment)

{'day': 1, 'feel': 5, 'close': 2, 'partner': 2, 'friend': 1, 'peac': 1, 'also': 1, 'experi': 1, 'contact': 2, 'peopl': 2, 'regard': 1, 'greatli': 1, 'everi': 1, 'time': 4, 'imagin': 1, 'someon': 1, 'love': 1, 'could': 1, 'seriou': 1, 'ill': 1, 'even': 1, 'death': 2, 'obvious': 1, 'unjustli': 1, 'treat': 1, 'possibl': 1, 'elucid': 1, 'think': 2, 'short': 2, 'live': 1, 'relat': 1, 'period': 1, 'life': 1, 'use': 1, 'gather': 1, 'found': 1, 'involuntarili': 1, 'sit': 1, 'next': 1, 'two': 1, 'express': 1, 'opinion': 1, 'consid': 2, 'low': 1, 'discrimin': 1, 'realiz': 3, 'direct': 1, 'discont': 1, 'way': 1, 'tri': 1, 'put': 1, 'blame': 1, 'instead': 1, 'sort': 1, 'feeli': 1, 'guilti': 1, 'materi': 1, 'thing': 1, 'import': 1, 'care': 1, 'rel': 1, 'selfcent': 1, 'girlfriend': 1, 'taken': 1, 'exam': 1, 'went': 1, 'parent': 1, 'place': 1, 'first': 1, 'mean': 1, 'car': 1, 'overtak': 1, 'anoth': 1, 'forc': 1, 'drive': 1, 'road': 1}
{'disgust': {'gather': 1, 'found': 1, 'involuntarili': 1, 'sit': 1