In [46]:
import re
import os
import nltk
import math

In [193]:
def words(path):
    """Tokenize every file in ``path`` and save a cleaned copy of each.

    Each regular file directly inside ``path`` is read as UTF-8 and passed
    through ``clean_text``. The resulting tokens are joined with single
    spaces and written, under the same file name, to the directory
    ``path + 'copy'`` (created here if it does not exist yet).

    Args:
        path: directory containing the subtitle files.

    Returns:
        dict mapping each file name to the list of tokens produced by
        ``clean_text`` for that file.
    """
    out_dir = path + 'copy'
    # Without this the very first write fails on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    lex = {}
    for file in os.listdir(path):
        source = os.path.join(path, file)
        # os.listdir also returns sub-directories (and, when `path` ends with a
        # separator, the output directory itself); open() would fail on those.
        if not os.path.isfile(source):
            continue
        with open(source, 'r', encoding='utf-8') as f:
            subtitles = f.read()
        lines = clean_text(subtitles)
        lex[file] = lines
        with open(os.path.join(out_dir, file), 'w', encoding='utf-8') as f:
            f.write(' '.join(lines))
    return lex

In [192]:
def clean_text(text):
    """Turn raw subtitle text into a list of lowercase, letters-only tokens.

    Steps:
      1. Remove cue headers: a line of digits followed by a line containing
         ``-->`` (the cue index and its timing line).
      2. Remove markup tags such as ``<i>...</i>``.
      3. Split on whitespace, strip every character that is not an ASCII
         letter from each token, drop tokens that become empty, and
         lowercase the rest.

    Args:
        text: full contents of one subtitle file.

    Returns:
        list of str tokens, in order of appearance.
    """
    # re.MULTILINE is required: without it `^` / `$` only match at the very
    # start / end of the whole string, so the header pattern could never match
    # and any letters on a timing line (e.g. positioning hints) leaked through.
    text = re.sub(r'\d+\r?\n^.*?-->.*?$', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)

    output = []
    for token in text.split():
        letters = re.sub(r'[^A-Za-z]', '', token)
        if letters:
            output.append(letters.lower())
    return output

In [70]:
def tfidf_count(texts):
    """Compute TF-IDF scores for every text in a corpus.

    Args:
        texts: dict mapping a text name to its list of tokens.

    Returns:
        A pair ``(tfidfs, words)``: ``tfidfs`` maps each text name to the
        dict returned by ``tfidf_text`` for that text, and ``words`` is the
        corpus-wide tally accumulated by ``add_words``.
    """
    D = len(texts)

    # Per-text frequency tables first, then fold them into the corpus tally.
    words_by_text = {name: count_words(tokens) for name, tokens in texts.items()}
    words = {}
    for counts in words_by_text.values():
        words = add_words(counts, words)

    tfidfs = {
        name: tfidf_text(D, tokens, words_by_text[name], words)
        for name, tokens in texts.items()
    }
    return tfidfs, words

In [55]:
def tfidf_text(D, text, text_words, words):
    """Score every distinct word of one text with TF-IDF.

    Args:
        D: number of texts in the corpus.
        text: token list of the text; only its length is used.
        text_words: dict mapping word -> occurrences in this text.
        words: dict mapping word -> tuple whose element ``[1]`` is used as
            the number of texts containing the word.

    Returns:
        dict mapping word -> ``(count / len(text)) * log(D / words[word][1])``.
    """
    text_size = len(text)
    return {
        word: (count / text_size) * math.log(D / words[word][1])
        for word, count in text_words.items()
    }

In [53]:
def add_words(new_words, words):
    """Fold one text's word counts into the running corpus tally.

    Args:
        new_words: dict mapping word -> occurrences in a single text.
        words: dict mapping word -> ``(total occurrences, number of texts)``;
            updated in place.

    Returns:
        The same ``words`` dict that was passed in, after the update.
    """
    for token, occurrences in new_words.items():
        total, doc_freq = words.get(token, (0, 0))
        words[token] = (total + occurrences, doc_freq + 1)
    return words

In [8]:
def count_words(text):
    """Count how many times each item occurs in ``text``.

    Args:
        text: iterable of hashable items (a token list in this notebook).

    Returns:
        dict mapping each distinct item to its number of occurrences.
    """
    frequency = {}
    for token in text:
        frequency[token] = frequency.get(token, 0) + 1
    return frequency

In [195]:
# Tokenize every file in the 'movies' directory; `texts` maps file name -> token list.
texts = words('movies')

In [157]:
# Per-text TF-IDF scores, plus the corpus-wide tally word -> (total count, number of texts).
tfidfs, word_count = tfidf_count(texts)

In [165]:
import statistics
# Number of distinct scored words in each text.
lens = [len(scores) for scores in tfidfs.values()]

In [168]:
# Smallest per-text vocabulary size across the corpus.
min(lens)

835

In [189]:
# For each word, count in how many texts it ranks among that text's
# 400 highest-scoring TF-IDF words.
words_from_tops = {}
for name in tfidfs:
    ws = get_top_words(tfidfs[name], 400)
    for word in ws:
        words_from_tops[word] = words_from_tops.get(word, 0) + 1

In [191]:
# Order words by the number of per-text top lists they appear in (descending)
# and save them as tab-separated "word<TAB>count" lines.
top_sorted = sorted(words_from_tops.items(), key=lambda pair: pair[1], reverse=True)
with open('top_common_top400_words.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(word + '\t' + str(hits) for word, hits in top_sorted))

In [119]:
# Load the stop-word list as whitespace-separated tokens.
# NOTE(review): despite the .csv extension the file is split on whitespace only,
# not on commas — confirm it holds one word per line / space-separated words.
with open('top100words.csv', 'r', encoding='utf-8') as f:
    stops = f.read().split()

In [134]:
# Load reference frequencies from tab-separated "word<TAB>count" lines and
# divide each count by one million.
with open('freqs.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# Blank lines (typically the empty string left by a trailing newline) have no
# second field and would raise IndexError, so they are skipped. Each line is
# split only once.
freqs = {
    fields[0]: int(fields[1]) / 1000000
    for fields in (line.split('\t') for line in lines if line.strip())
}

In [160]:
# Corpus tally without stop words. `stops` is a list, so testing membership
# against it is a linear scan per vocabulary word; build a set once instead.
stop_set = set(stops)
word_count_wo_stops = {w: counts for w, counts in word_count.items() if w not in stop_set}

In [99]:
def get_top_words(words, top=10):
    """Return the keys of ``words`` with the largest values.

    Args:
        words: dict mapping word -> sortable score.
        top: maximum number of words to return (default 10).

    Returns:
        list of at most ``top`` words ordered by descending score; words with
        equal scores keep their order from ``words``.
    """
    ranked = sorted(words.items(), key=lambda pair: pair[1], reverse=True)
    return [word for word, _score in ranked[:top]]

In [187]:
# Total occurrences of the words that appear in 50 to 70 texts (inclusive);
# the 100 most frequent of them are saved as "word<TAB>count" lines.
words_amount = {
    x: counts[0]
    for x, counts in word_count.items()
    if 50 <= counts[1] <= 70
}
with open('50-70_films_top_words.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(
        word + '\t' + str(total)
        for word, total in sorted(words_amount.items(), key=lambda pair: pair[1], reverse=True)[:100]
    ))

In [162]:
# Non-stop words ordered by total corpus occurrences, most frequent first.
sorted(word_count_wo_stops.items(), key=lambda entry: entry[1][0], reverse=True)

[('is', (6912, 88)),
 ('im', (4632, 87)),
 ('are', (4085, 88)),
 ('dont', (3767, 88)),
 ('was', (3363, 88)),
 ('here', (3054, 88)),
 ('youre', (2881, 88)),
 ('right', (2702, 88)),
 ('were', (2344, 88)),
 ('yeah', (2210, 88)),
 ('oh', (2096, 87)),
 ('thats', (2020, 88)),
 ('got', (1999, 88)),
 ('hey', (1774, 87)),
 ('okay', (1606, 80)),
 ('gonna', (1590, 83)),
 ('did', (1427, 88)),
 ('hes', (1418, 88)),
 ('why', (1339, 88)),
 ('need', (1297, 87)),
 ('going', (1294, 88)),
 ('man', (1273, 88)),
 ('where', (1273, 88)),
 ('cant', (1248, 88)),
 ('down', (1166, 88)),
 ('ill', (1165, 86)),
 ('been', (1150, 88)),
 ('yes', (1101, 88)),
 ('something', (1047, 88)),
 ('tell', (1046, 88)),
 ('little', (997, 87)),
 ('never', (980, 88)),
 ('lets', (974, 87)),
 ('more', (971, 87)),
 ('ive', (933, 88)),
 ('sorry', (908, 87)),
 ('let', (900, 88)),
 ('really', (897, 88)),
 ('has', (895, 88)),
 ('whats', (874, 86)),
 ('please', (864, 87)),
 ('help', (853, 88)),
 ('had', (848, 88)),
 ('didnt', (844, 88)),
 

In [146]:
# Total number of tokens in the corpus: the sum of every word's overall count.
total_word_count = 0
for word, counts in word_count.items():
    total_word_count += counts[0]
total_word_count

641679

In [147]:
# Number of texts in the corpus.
D = len(texts)

In [153]:
# Corpus-level score per word: overall relative frequency multiplied by
# log(D / number of texts containing the word).
tfidf_words_all_docs = {}
for word, (occurrences, doc_freq) in word_count.items():
    tf = occurrences / total_word_count
    idf = math.log(D / doc_freq)
    tfidf_words_all_docs[word] = tf * idf

In [154]:
# Show the 100 words with the highest corpus-level score.
get_top_words(tfidf_words_all_docs, 100)

['rogers',
 'stark',
 'clark',
 'peter',
 'tony',
 'logan',
 'spiderman',
 'superman',
 'reed',
 'lois',
 'charles',
 'fuck',
 'bruce',
 'gotham',
 'uh',
 'wayne',
 'ben',
 'victor',
 'diana',
 'romanoff',
 'wade',
 'batman',
 'erik',
 'johnny',
 'asgard',
 'parker',
 'jean',
 'hank',
 'quill',
 'baymax',
 'fury',
 'mutants',
 'harvey',
 'john',
 'loki',
 'thor',
 'ok',
 'l',
 'fucking',
 'scott',
 'im',
 'groot',
 'jane',
 'april',
 'grunting',
 'harry',
 'luthor',
 'zorro',
 'lane',
 'captain',
 'hiro',
 'professor',
 'thanos',
 'rachel',
 'raph',
 'mikey',
 'mutant',
 'lex',
 'ronan',
 'alfred',
 'mariko',
 'leo',
 'kent',
 'sam',
 'hydra',
 'dr',
 'wakanda',
 'liz',
 'elena',
 'linda',
 'pierce',
 'steve',
 'dent',
 'hal',
 'sacks',
 'banner',
 'castle',
 'dont',
 'jarvis',
 'frank',
 'shit',
 'stone',
 'ninja',
 'okay',
 'youre',
 'turnbull',
 'danny',
 'yondu',
 'hulk',
 'octopus',
 'gwen',
 'ares',
 'elektra',
 'howard',
 'metropolis',
 'alright',
 'king',
 'matt',
 'abby',
 'jo