In [1]:
import os
import glob


def read_data(data_dir='data'):
    """Read every ``*.txt`` file directly inside ``data_dir`` as UTF-8 text.

    Args:
        data_dir: Directory to search (non-recursive). Defaults to ``'data'``.

    Returns:
        list[str]: The full contents of each matching file, ordered by
        sorted file path. Empty when no file matches.
    """
    pattern = os.path.join(data_dir, '*.txt')
    # glob.glob() yields matches in an unspecified, file-system dependent
    # order. Downstream code identifies documents by their list index, so
    # sort the paths to keep those indices stable across runs and machines.
    paths = sorted(glob.glob(pattern))

    documents = []
    for path in paths:
        with open(path, encoding="utf8") as document:
            documents.append(document.read())

    return documents

In [2]:
# Load the text of every *.txt file found in the default 'data' directory.
data = read_data()

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re

def document_to_words(document):
    """Convert raw text into a list of stemmed, stop-word-free tokens.

    The text is lower-cased, every character outside ``[a-zA-Z0-9]`` is
    replaced by a space, the result is split on whitespace, English stop
    words are dropped and the remaining tokens are Porter-stemmed.

    Args:
        document: The document text as a single string.

    Returns:
        list[str]: Stemmed tokens in their original order.
    """
    # Make sure the stop-word corpus is available before it is read.
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()
    # Build the stop-word collection once per document. Calling
    # stopwords.words() inside the filter repeats the lookup for every token
    # and makes each membership test a linear scan of a list.
    stop_words = set(stopwords.words("english"))

    text = re.sub(r"[^a-zA-Z0-9]", " ", document.lower())
    # Reuse the single stemmer instead of constructing one per token.
    words = [stemmer.stem(w) for w in text.split() if w not in stop_words]

    return words

In [13]:
# Tokenise every loaded document, keeping the documents in their original order.
data_in_words = [document_to_words(document) for document in data]
data_in_words

[['object',
  'orient',
  'program',
  'inherit',
  'way',
  'form',
  'new',
  'class',
  'instanc',
  'call',
  'object',
  'use',
  'class',
  'alreadi',
  'defin',
  'inherit',
  'concept',
  'invent',
  '1967',
  'simula',
  'new',
  'class',
  'known',
  'deriv',
  'class',
  'take',
  'inherit',
  'attribut',
  'behavior',
  'pre',
  'exist',
  'class',
  'refer',
  'base',
  'class',
  'ancestor',
  'class',
  'intend',
  'help',
  'reus',
  'exist',
  'code',
  'littl',
  'modif',
  'inherit',
  'provid',
  'support',
  'represent',
  'categor',
  'comput',
  'languag',
  'categor',
  'power',
  'mechan',
  'number',
  'inform',
  'process',
  'crucial',
  'human',
  'learn',
  'mean',
  'gener',
  'known',
  'specif',
  'entiti',
  'appli',
  'wider',
  'group',
  'given',
  'belong',
  'relat',
  'establish',
  'cognit',
  'economi',
  'less',
  'inform',
  'need',
  'store',
  'specif',
  'entiti',
  'particular',
  'inherit',
  'also',
  'sometim',
  'call',
  'gener',
  '

In [5]:
def term_frequency(data):
    """Count how many times each word occurs across the whole corpus.

    Args:
        data: Iterable of documents, each an iterable of word tokens.

    Returns:
        dict: Maps each word to its total number of occurrences, with keys
        in order of first appearance.
    """
    totals = {}
    for tokens in data:
        for token in tokens:
            # A missing key counts as zero, so the first sighting stores 1.
            totals[token] = totals.get(token, 0) + 1
    return totals

In [7]:
# Corpus-wide occurrence count for every stemmed token.
freq = term_frequency(data_in_words)

In [8]:
def build_aux(data):
    """Build a positional index of every word in the corpus.

    Args:
        data: List of documents, each a list of word tokens.

    Returns:
        dict: Maps each word to a list with one entry per document; entry
        ``d`` lists the positions at which the word occurs in document ``d``
        and is empty when the word does not occur there.
    """
    n_docs = len(data)
    positions_by_word = {}
    for doc_idx, tokens in enumerate(data):
        for position, token in enumerate(tokens):
            if token not in positions_by_word:
                # First sighting: allocate one independent list per document.
                positions_by_word[token] = [[] for _ in range(n_docs)]
            positions_by_word[token][doc_idx].append(position)
    return positions_by_word

In [9]:
# Positional index: word -> one list of token positions per document.
aux = build_aux(data_in_words)

In [10]:
# Display the positional index as the cell output.
aux

{'object': [[0, 10, 90], [], [11], [], []],
 'orient': [[1], [], [], [], []],
 'program': [[2, 135], [], [], [], [4, 60, 67, 69, 73, 78, 82, 94, 95]],
 'inherit': [[3, 15, 26, 44, 81, 116, 125, 136, 151, 167, 168],
  [],
  [],
  [],
  []],
 'way': [[4], [270], [48], [64, 168], []],
 'form': [[5], [], [], [166], [65]],
 'new': [[6, 20, 162], [], [], [], []],
 'class': [[7, 12, 21, 24, 31, 34, 36, 89], [], [], [], []],
 'instanc': [[8, 91], [], [], [], [87]],
 'call': [[9, 84, 141], [40], [], [5, 139], [93, 265]],
 'use': [[11, 169],
  [4, 79, 95, 289],
  [18, 27],
  [19, 33],
  [24, 109, 129, 142, 155, 161, 197]],
 'alreadi': [[13], [], [], [], [245, 253, 261]],
 'defin': [[14], [155], [], [], []],
 'concept': [[16], [259], [], [], []],
 'invent': [[17], [286], [], [], []],
 '1967': [[18], [], [], [], []],
 'simula': [[19], [], [], [], []],
 'known': [[22, 62], [217], [52, 58], [], []],
 'deriv': [[23], [194], [], [142], []],
 'take': [[25], [], [], [128], [16]],
 'attribut': [[27], [],

In [11]:
# Copy of the index with keys in ascending order. Keys are strings, so
# numeric tokens sort lexicographically (e.g. '10' comes before '2').
sorted_aux = dict(sorted(aux.items()))

In [19]:
# Display the key-sorted index as the cell output.
sorted_aux

{'0': [[], [183], [], [], []],
 '1': [[], [73], [94], [], [136, 146]],
 '10': [[], [184], [], [], []],
 '1940': [[], [], [], [], [25]],
 '1953': [[], [], [], [], [39]],
 '1967': [[18], [], [], [], []],
 '2': [[], [], [107], [43], [151, 181]],
 '2005': [[], [83], [], [], []],
 '285': [[], [56], [], [], []],
 '3': [[], [], [122], [], [160]],
 '336': [[], [84], [], [], []],
 '4': [[], [], [136], [], []],
 '6': [[], [55], [], [], []],
 '8': [[], [74], [], [], []],
 '999': [[], [57], [], [], []],
 'abstract': [[102], [], [], [], []],
 'academ': [[], [247], [], [], []],
 'accept': [[], [], [], [], [99]],
 'accomplish': [[153], [], [], [], []],
 'accord': [[], [], [], [73], []],
 'account': [[], [], [], [129], []],
 'act': [[], [], [], [160], []],
 'action': [[], [], [], [], [85, 101]],
 'actual': [[], [225], [], [], []],
 'ad': [[161], [], [], [], []],
 'adjac': [[], [], [], [], [127]],
 'advanc': [[], [], [], [], [288]],
 'advantag': [[124], [], [], [], []],
 'algebra': [[], [], [6], [], []

In [66]:
import numpy as np
def idf(aux, n):
    """Compute the inverse document frequency of every indexed term.

    For each term the score is ``log10(n / df)``, where ``df`` is the number
    of documents whose position list for that term is non-empty.

    Args:
        aux: Mapping of term -> list of per-document position lists.
        n: Number of documents used as the numerator.

    Returns:
        dict: Maps each term to its IDF score.
    """
    scores = {}
    for term, postings in aux.items():
        # Document frequency: documents with at least one occurrence.
        doc_freq = sum(1 for positions in postings if len(positions) > 0)
        scores[term] = np.log10(n / doc_freq)
    return scores

In [67]:
# IDF score per term, using the number of loaded documents as n; the bare
# name on the last line displays the result.
idf_dict = idf(aux,len(data))
idf_dict

{'object': 0.3979400086720376,
 'orient': 0.6989700043360189,
 'program': 0.3979400086720376,
 'inherit': 0.6989700043360189,
 'way': 0.09691001300805642,
 'form': 0.2218487496163564,
 'new': 0.6989700043360189,
 'class': 0.6989700043360189,
 'instanc': 0.3979400086720376,
 'call': 0.09691001300805642,
 'use': 0.0,
 'alreadi': 0.3979400086720376,
 'defin': 0.3979400086720376,
 'concept': 0.3979400086720376,
 'invent': 0.3979400086720376,
 '1967': 0.6989700043360189,
 'simula': 0.6989700043360189,
 'known': 0.2218487496163564,
 'deriv': 0.2218487496163564,
 'take': 0.2218487496163564,
 'attribut': 0.6989700043360189,
 'behavior': 0.6989700043360189,
 'pre': 0.6989700043360189,
 'exist': 0.6989700043360189,
 'refer': 0.3979400086720376,
 'base': 0.3979400086720376,
 'ancestor': 0.6989700043360189,
 'intend': 0.6989700043360189,
 'help': 0.3979400086720376,
 'reus': 0.3979400086720376,
 'code': 0.6989700043360189,
 'littl': 0.6989700043360189,
 'modif': 0.6989700043360189,
 'provid': 0.39

In [68]:
def tf_idf(aux, idf):
    """Compute a log-scaled TF-IDF weight for every term in every document.

    The weight of a term in a document is ``log10(1 + tf) * idf[term]``,
    where ``tf`` is the number of positions recorded for the term in that
    document.

    Args:
        aux: Mapping of term -> list of per-document position lists.
        idf: Mapping of term -> IDF score.

    Returns:
        dict: Maps each term to a list with one single-element list per
        document, holding that document's weight. Terms whose per-document
        list is empty are left out.
    """
    weights = {}
    for term, postings in aux.items():
        if len(postings) == 0:
            # No document slots at all: the term gets no entry.
            continue
        term_idf = idf[term]
        # Each weight stays wrapped in its own one-element list.
        weights[term] = [
            [np.log10(1 + len(positions)) * term_idf] for positions in postings
        ]
    return weights

In [69]:
# TF-IDF weight of every term in every document; the bare name on the last
# line displays the result.
tf_idf_dict = tf_idf(aux,idf_dict)
tf_idf_dict

{'object': [[0.23958375817013625], [0.0], [0.11979187908506812], [0.0], [0.0]],
 'orient': [[0.21041093737452468], [0.0], [0.0], [0.0], [0.0]],
 'program': [[0.18986563624075592], [0.0], [0.0], [0.0], [0.3979400086720376]],
 'inherit': [[0.7543153202292586], [0.0], [0.0], [0.0], [0.0]],
 'way': [[0.029172820795611586],
  [0.029172820795611586],
  [0.029172820795611586],
  [0.04623782700130269],
  [0.0]],
 'form': [[0.06678312813507141],
  [0.0],
  [0.0],
  [0.06678312813507141],
  [0.06678312813507141]],
 'new': [[0.42082187474904936], [0.0], [0.0], [0.0], [0.0]],
 'class': [[0.6669868909604184], [0.0], [0.0], [0.0], [0.0]],
 'instanc': [[0.18986563624075592],
  [0.0],
  [0.0],
  [0.0],
  [0.11979187908506812]],
 'call': [[0.05834564159122317],
  [0.029172820795611586],
  [0.0],
  [0.04623782700130269],
  [0.04623782700130269]],
 'use': [[0.0], [0.0], [0.0], [0.0], [0.0]],
 'alreadi': [[0.11979187908506812],
  [0.0],
  [0.0],
  [0.0],
  [0.23958375817013625]],
 'defin': [[0.11979187908