# Setup

In [1]:
%load_ext autoreload
%autoreload 2

# modules
import numpy as np 
import os

# Build the Term Frequency-Inverse Document Frequency (TF-IDF) weights

In [2]:
def build_tfidf(filename_vocab, filename_tweet, min_tf, min_idf):
    """Build log-smoothed tf and idf weights for every word of a vocabulary.

    Parameters
    ----------
    filename_vocab : str
        Path of a text file holding one ``<count> <word>`` pair per line,
        separated by whitespace. Lines with fewer than two fields are
        skipped. If a word occurs on several lines (for example once with
        a stray byte-order mark), its counts are summed.
    filename_tweet : str
        Path of a text file holding one whitespace-tokenized tweet per line.
    min_tf : int or float
        A word gets a non-zero tf weight only when its count is strictly
        greater than this threshold.
    min_idf : int or float
        A word gets a non-zero idf weight only when it passed the tf
        threshold and the number of tweets containing it is strictly
        greater than this threshold.

    Returns
    -------
    dict
        Maps each word to a two-item list ``[tf_weight, idf_weight]`` with
        ``tf_weight = 1 + ln(tf)`` and ``idf_weight = ln(1 + N / df)``,
        where ``N`` is the number of lines of the tweet file and ``df`` the
        number of those lines containing the word. A weight whose
        threshold is not passed is 0.

    Raises
    ------
    ValueError
        If the first field of a vocabulary line is not an integer.
    """
    # term frequency (tf)
    print("Read the term frequencies...")
    term_counts = {}
    # 'utf-8-sig' drops a byte-order mark at the very start of the file and
    # reads BOM-less files exactly like 'utf-8'.
    with open(filename_vocab, 'r', encoding='utf-8-sig') as datafile:
        for line in datafile:
            fields = line.split()
            if len(fields) < 2:
                # blank line or a count without a word: nothing to register
                continue
            tf = int(fields[0])
            # A byte-order mark glued to a token is an encoding artefact,
            # not part of the word.
            word = fields[1].replace('\ufeff', '')
            if not word:
                continue
            # Sum instead of overwrite so a duplicate spelling cannot wipe
            # out the count read earlier for the same word.
            term_counts[word] = term_counts.get(word, 0) + tf

    # inverse document frequency (idf)
    print("Compute the inverse document frequencies...")
    doc_counts = dict.fromkeys(term_counts, 0)
    N_tweet = 0
    with open(filename_tweet, 'r', encoding='utf-8-sig') as datafile:
        for line in datafile:
            N_tweet += 1
            # split() also strips the trailing newline, so the last word of
            # the tweet is matched; set() counts each tweet once per word.
            for word in set(line.split()):
                if word in doc_counts:
                    doc_counts[word] += 1
    print("Number of tweets analized: ", N_tweet)

    # apply normalization
    print("Apply log smooth normalization to the tf and idf...")
    tfidf_dict = {}
    for word, tf in term_counts.items():
        df = doc_counts[word]
        tf_weight = 0
        idf_weight = 0
        # the extra "> 0" guards keep log() and the division well defined
        # when a caller passes a negative threshold
        if tf > min_tf and tf > 0:
            tf_weight = 1 + np.log(tf)
            if df > min_idf and df > 0:
                idf_weight = np.log(1 + N_tweet / df)
        tfidf_dict[word] = [tf_weight, idf_weight]
    print("Finished.")

    return tfidf_dict


In [3]:
# adapt path
# Make the folder with the processed tweet data the working directory; the
# relative file names passed to build_tfidf in the next cell resolve against it.
# NOTE(review): absolute, machine-specific path -- edit before running elsewhere.
os.chdir(r'D:/Documents/etudes/epfl/MA1/cours/MachineLearning/Project2/data/twitter_datasets_epfl/full/')

In [4]:
# Count thresholds forwarded to build_tfidf (compared there with a strict ">").
min_tf = 100
min_idf = 100

# Input files, given relative to the current working directory.
filename_vocab = 'vocab_all_full_processed.txt'
filename_tweet = 'all_full_processed.txt'

# Read the word occurrences into a dictionary: word -> [tf, idf].
tfidf = build_tfidf(
    filename_vocab=filename_vocab,
    filename_tweet=filename_tweet,
    min_tf=min_tf,
    min_idf=min_idf,
)

Read the term frequencies...
Compute the inverse document frequencies...
Number of tweets analized:  2280482
Apply log smooth normalization to the tf and idf...
Finished.


# Save the TF-IDF weights to a text file

In [5]:
# adapt path
# Make the output folder the working directory; the relative file name
# 'tfidf.txt' opened for writing in the next cell resolves against it.
# NOTE(review): absolute, machine-specific path -- edit before running elsewhere.
os.chdir(r'D:/Documents/etudes/epfl/MA1/cours/MachineLearning/Project2/data/tfidf/')

In [None]:
# Output file name, relative to the current working directory.
filenameout = 'tfidf.txt'

# Write a header row followed by one "<word> <tf> <idf>" row per dictionary
# entry. 'utf-8-sig' puts a byte-order mark at the start of the file.
with open(filenameout, 'w', encoding='utf-8-sig') as datafile:
    # The header needs its own line break; without it the first data row is
    # appended to the header line.
    datafile.write("word tf idf\n")
    for word, freq in tfidf.items():
        line = word + ' ' + str(freq[0]) + ' ' + str(freq[1])
        datafile.write(line + "\n")