In [2]:
import json
import os
from collections import Counter

import numpy as np
import pandas as pd

In [3]:
# Load the script JSON. The encoding is given explicitly because open()'s
# default depends on the platform locale, whereas JSON text is UTF-8.
with open('Dead-Poets-Society_script.json', encoding='utf-8') as f:
    story = json.load(f)

In [4]:
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# setup a new hash to store the results in
def script_cleaning(script):
    """Tokenise, filter, and stem the text stored under every key of ``script``.

    Each text is split with ``word_tokenize``. Tokens that are not purely
    alphabetic are discarded, the remainder is lower-cased, English stop
    words are removed, and what is left is Porter-stemmed.

    Parameters
    ----------
    script : dict
        Mapping of document id to the raw text for that id.

    Returns
    -------
    dict
        Mapping of the same ids to their list of stemmed tokens, in the
        order the tokens appeared in the text.
    """
    stemmer = PorterStemmer()
    english_stop_words = set(stopwords.words('english'))

    def _clean(raw_text):
        # Alphabetic tokens only, lower-cased before the stop-word check.
        lowered = (token.lower() for token in word_tokenize(raw_text) if token.isalpha())
        return [stemmer.stem(token) for token in lowered if token not in english_stop_words]

    return {doc_id: _clean(raw_text) for doc_id, raw_text in script.items()}

In [5]:
import numpy as np
def jacardian_distance(document_1_data, document_2_data):
    """Return the Jaccard index of the two documents' vocabularies.

    Despite the function name, the value is a *similarity*, not a distance:
    ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of distinct
    tokens. 1.0 means identical vocabularies, 0.0 means nothing in common.

    Parameters
    ----------
    document_1_data, document_2_data : iterable of hashable
        Token sequences; duplicates are ignored.

    Returns
    -------
    float
        Jaccard index in ``[0.0, 1.0]``. Two empty documents are treated as
        identical and yield 1.0 instead of raising ``ZeroDivisionError``.
    """
    vocabulary_1 = set(document_1_data)
    vocabulary_2 = set(document_2_data)

    union_size = len(vocabulary_1 | vocabulary_2)
    if union_size == 0:
        # Both documents are empty: the ratio is 0/0, so use the usual
        # convention that two empty sets are identical.
        return 1.0

    return len(vocabulary_1 & vocabulary_2) / union_size

def cosine_similarity(document_1_data, document_2_data):
    """Return the cosine similarity of two documents' term-frequency vectors.

    Both documents are projected onto the union of their vocabularies, with
    each component being the number of times that token occurs.

    Parameters
    ----------
    document_1_data, document_2_data : iterable of hashable
        Token sequences; repeated tokens increase that token's weight.

    Returns
    -------
    float
        ``dot(v1, v2) / (|v1| * |v2|)``. If either document is empty its
        vector has zero length and the ratio is undefined; 0.0 is returned
        in that case rather than NaN, so that downstream sums and standard
        deviations stay finite.
    """
    # Count each token once up front instead of calling list.count() for
    # every vocabulary word, which rescans the whole document each time.
    term_counts_1 = Counter(document_1_data)
    term_counts_2 = Counter(document_2_data)

    # Component order is irrelevant as long as both vectors share it.
    vocabulary = list(term_counts_1.keys() | term_counts_2.keys())
    document_1_vector = np.array([term_counts_1[word] for word in vocabulary])
    document_2_vector = np.array([term_counts_2[word] for word in vocabulary])

    norm_product = (np.sqrt(np.dot(document_1_vector, document_1_vector))
                    * np.sqrt(np.dot(document_2_vector, document_2_vector)))
    if norm_product == 0:
        # At least one document has no tokens.
        return 0.0

    return document_1_vector.dot(document_2_vector) / norm_product

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
# create a variable to store your table data... you could use a hash or some other data structure. 
# We just want it to identify which document is being compared to which other document.
def prep_data_structs(processed_article_hash):
    """Build the all-pairs Jaccard and cosine similarity tables.

    Every document is compared with every document, itself included, so
    both tables are square and keyed identically.

    Parameters
    ----------
    processed_article_hash : dict
        Mapping of document id to that document's token list.

    Returns
    -------
    tuple of (dict, dict)
        ``(jaccard_table, cosine_table)``; each is a nested mapping
        ``table[doc_1_id][doc_2_id] -> score``.
    """
    doc_ids = list(processed_article_hash.keys())
    jaccard_table = {}
    cosine_table = {}

    for row_id in doc_ids:
        row_tokens = processed_article_hash[row_id]
        # One full row per metric: this document against every document.
        jaccard_table[row_id] = {
            col_id: jacardian_distance(row_tokens, processed_article_hash[col_id])
            for col_id in doc_ids
        }
        cosine_table[row_id] = {
            col_id: cosine_similarity(row_tokens, processed_article_hash[col_id])
            for col_id in doc_ids
        }

    return jaccard_table, cosine_table
# finally, find some way to present this data back. Either as a straight table or a heatmap.
def create_heatmap(data_dict, title="Similarity Heatmap"):
    """Draw an annotated heatmap of a pairwise similarity table.

    Parameters
    ----------
    data_dict : dict of dict
        Nested mapping ``outer key -> inner key -> score``, such as either
        table returned by ``prep_data_structs``. The outer and inner key
        sets must match, because the columns are re-selected using the
        sorted row labels.
    title : str, optional
        Title placed above the plot. Defaults to the original fixed title;
        pass a metric-specific one to tell several heatmaps apart.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Outer keys become columns and inner keys become rows. Sort the rows
    # once, then put the columns in the same order so that the diagonal
    # holds each document compared with itself.
    table = pd.DataFrame(data_dict).sort_index()
    table = table[table.index]

    row_labels = [str(label) for label in table.index]
    col_labels = [str(label) for label in table.columns]
    scores = table.values

    fig, ax = plt.subplots(figsize=(12, 12))
    ax.imshow(scores)

    # Show one tick per document and label it with the document id.
    ax.set_xticks(np.arange(len(col_labels)))
    ax.set_yticks(np.arange(len(row_labels)))
    ax.set_xticklabels(col_labels)
    ax.set_yticklabels(row_labels)

    # Rotate the x labels so long ids do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write the rounded score into every cell (x is the column, y the row).
    for row_pos in range(len(row_labels)):
        for col_pos in range(len(col_labels)):
            ax.text(col_pos, row_pos, round(scores[row_pos, col_pos], 2),
                    ha="center", va="center", color="w")

    ax.set_title(title)
    fig.tight_layout()
    plt.show()

In [7]:
# The encoding is given explicitly because open()'s default depends on the
# platform locale, whereas JSON text is UTF-8.
with open('Dead-Poets-Society_script.json', encoding='utf-8') as f:
    story = json.load(f)

# Clean and stem the text under each 'dialogues' key, then build the
# all-pairs similarity tables.
processed_data = script_cleaning(story['dialogues'])
data_structs_j, data_structs_c = prep_data_structs(processed_data)
#create_heatmap(data_structs_c)
#create_heatmap(data_structs_j)
# Per key: the key, its summed cosine similarity to every key (itself
# included), the standard deviation of those scores, and its number of
# distinct stems.
for k in data_structs_c.keys():
    cosine_scores = list(data_structs_c[k].values())
    print(k, sum(cosine_scores), np.std(cosine_scores), len(set(processed_data[k])))

NEIL 6.1779615018250436 0.22448133704296158 427
KEATING 5.961120375308899 0.20746671642834685 617
CHARLIE 6.205067926273911 0.21409281467343858 310
KNOX 5.949016213828811 0.20524334396625382 190
TODD 5.239155157677 0.22385699157921035 146
CAMERON 5.396288632530922 0.22601329867719602 203
MEEKS 3.8618271983108734 0.2087160435435516 117
CHRIS 3.8367990249902584 0.20623553278627615 74
PITTS 4.839671300385916 0.20981461016793404 112
MCALLISTER 2.4397637965689043 0.20945734504874425 59
BOYS 2.4568313827951553 0.21470326341100868 44
MR NOLAN 2.7964977952980736 0.2106717526902178 86
CHET 2.4844876727004928 0.2200488480472969 15
MR PERRY 3.6108990281309143 0.21449725424829866 44
HAGER 2.801439454046898 0.21378917566662448 39
GLORIA 2.7330743595791516 0.21949435240040732 15
BUBBA 2.451435290492159 0.21280178977650388 28
BOY 3.022597085274888 0.2110331797514609 16
PUCK 1.9955281236760793 0.21535707967300263 94


In [8]:
# The encoding is given explicitly because open()'s default depends on the
# platform locale, whereas JSON text is UTF-8.
with open('Matrix,-The_script.json', encoding='utf-8') as f:
    story = json.load(f)

# Clean and stem the text under each 'dialogues' key, then build the
# all-pairs similarity tables.
processed_data = script_cleaning(story['dialogues'])
data_structs_j, data_structs_c = prep_data_structs(processed_data)
#create_heatmap(data_structs_c)
#create_heatmap(data_structs_j)
# Per key: the key, its summed cosine similarity to every key (itself
# included), the standard deviation of those scores, and its number of
# distinct stems.
for k in data_structs_c.keys():
    cosine_scores = list(data_structs_c[k].values())
    print(k, sum(cosine_scores), np.std(cosine_scores), len(set(processed_data[k])))

NEO 5.846177656602462 0.24518717217528516 239
MORPHEUS 5.5328876676720595 0.24762542200642695 434
TRINITY 5.691913787052784 0.2494729132247568 187
AGENT SMITH 4.453894308800624 0.23285085474369022 278
TANK 4.892356151799113 0.22841507267809152 174
CYPHER 5.510050522675804 0.24767711194581804 201
GIZMO 3.44746916739152 0.22625130785993552 52
ORACLE 4.534667291478955 0.22797570750137616 72
AGENT JONES 2.4211850361440317 0.23602006758723632 27
APOC 3.2412507046602195 0.24289311912833547 13
MOUSE 3.7765998645012395 0.23103800770284924 22
AGENT BROWN 1.2839238097521912 0.2454413326364293 12
SCREEN 3.381209747659355 0.22769090331528014 56
DOZER 1.8156339648855822 0.23917954904974856 11
CABLE 2.246286296370231 0.23433216507306828 15


In [9]:
# The encoding is given explicitly because open()'s default depends on the
# platform locale, whereas JSON text is UTF-8.
with open('Indiana-Jones-and-the-Temple-of-Doom_script.json', encoding='utf-8') as f:
    story = json.load(f)

# Clean and stem the text under each 'dialogues' key, then build the
# all-pairs similarity tables.
processed_data = script_cleaning(story['dialogues'])
data_structs_j, data_structs_c = prep_data_structs(processed_data)
#create_heatmap(data_structs_c)
#create_heatmap(data_structs_j)
# Per key: the key, its summed cosine similarity to every key (itself
# included), the standard deviation of those scores, and its number of
# distinct stems.
for k in data_structs_c.keys():
    cosine_scores = list(data_structs_c[k].values())
    print(k, sum(cosine_scores), np.std(cosine_scores), len(set(processed_data[k])))

INDIANA 2.925704549160656 0.27527275823115915 593
WILLIE 3.1022257572357503 0.2738923714690337 386
SHORT ROUND 1.9593718006668235 0.3198393470480051 136
CHATTAR LAL 2.3947724434206132 0.27292709430860534 161
LAO 1.984761214370947 0.3007090545166056 43
MOLA RAM 1.9988767739663618 0.2907239358784528 72
MAHARAJAH 2.1265517425237634 0.2830271902890434 68
SHAMAN 2.0527455510641173 0.29671035301941806 49


In [10]:
# The encoding is given explicitly because open()'s default depends on the
# platform locale, whereas JSON text is UTF-8.
with open('Shawshank-Redemption,-The_script.json', encoding='utf-8') as f:
    story = json.load(f)

# Clean and stem the text under each 'dialogues' key, then build the
# all-pairs similarity tables.
processed_data = script_cleaning(story['dialogues'])
data_structs_j, data_structs_c = prep_data_structs(processed_data)
#create_heatmap(data_structs_c)
#create_heatmap(data_structs_j)
# Per key: the key, its summed cosine similarity to every key (itself
# included), the standard deviation of those scores, and its number of
# distinct stems.
for k in data_structs_c.keys():
    cosine_scores = list(data_structs_c[k].values())
    print(k, sum(cosine_scores), np.std(cosine_scores), len(set(processed_data[k])))

RED 5.363662332771118 0.23648591573735847 973
ANDY 5.002264781135497 0.2459950527208025 628
NORTON 4.8041832895401395 0.23836073514155867 356
HEYWOOD 4.542109221321405 0.22424701734870925 186
HADLEY 4.5728359125164735 0.21131617525597685 189
BROOKS 4.119083688838406 0.239573833812448 219
TOMMY 4.418295624929571 0.23251212915044617 157
FLOYD 2.631345905856876 0.23578352158700114 44
BOGS 2.867453952944398 0.23508433692972247 52
VOICE 1.3999211375289493 0.2510827252494794 15
SNOOZE 3.506759902153036 0.22709693656338684 25
GUARD 2.077734591854184 0.24031056093079597 21
JIGGER 3.009562358619279 0.22940361021826666 21
HAIG 2.8051084709796728 0.2312043550296021 21


In [11]:
# The encoding is given explicitly because open()'s default depends on the
# platform locale, whereas JSON text is UTF-8.
with open('Departed,-The_script.json', encoding='utf-8') as f:
    story = json.load(f)

# Clean and stem the text under each 'dialogues' key, then build the
# all-pairs similarity tables.
processed_data = script_cleaning(story['dialogues'])
data_structs_j, data_structs_c = prep_data_structs(processed_data)
#create_heatmap(data_structs_c)
#create_heatmap(data_structs_j)
# Per key: the key, its summed cosine similarity to every key (itself
# included), the standard deviation of those scores, and its number of
# distinct stems.
for k in data_structs_c.keys():
    cosine_scores = list(data_structs_c[k].values())
    print(k, sum(cosine_scores), np.std(cosine_scores), len(set(processed_data[k])))

COLIN 7.489506993077247 0.199803858711933 532
BILLY 8.24340686580522 0.17543250503524876 494
COSTELLO 7.358684980331508 0.1970240005338749 482
MADOLYN 5.350708067284642 0.20904295732514003 222
QUEENAN 5.976914411747699 0.20781494741600878 248
ELLERBY 6.346910127503907 0.20776358431534891 282
DIGNAM 6.112070626623671 0.19197149904863997 185
MISTER FRENCH 6.182231892197827 0.19360183562509184 132
BROWN 4.034620785462869 0.21326583457464882 73
DELAHUNT 5.553701067588109 0.1911385211821531 76
FITZY 5.0411949026083045 0.20229531770830544 59
SEAN 4.493532644143483 0.20262556398523537 90
YOUNG COSTELLO 3.9205965707443156 0.21603274901799305 71
DETECTIVE 1 4.379925439840374 0.20944194173981312 31
BARRIGAN 4.670121747437632 0.20025165515423776 38
BANKROBBER 3.813685596487633 0.21226630973120608 26


In [13]:
# The encoding is given explicitly because open()'s default depends on the
# platform locale, whereas JSON text is UTF-8.
with open('John-Wick_script.json', encoding='utf-8') as f:
    story = json.load(f)

# Clean and stem the text under each 'dialogues' key, then build the
# all-pairs similarity tables.
processed_data = script_cleaning(story['dialogues'])
data_structs_j, data_structs_c = prep_data_structs(processed_data)
#create_heatmap(data_structs_c)
#create_heatmap(data_structs_j)
# Per key: the key, its summed cosine similarity to every key (itself
# included), the standard deviation of those scores, and its number of
# distinct stems.
for k in data_structs_c.keys():
    cosine_scores = list(data_structs_c[k].values())
    print(k, sum(cosine_scores), np.std(cosine_scores), len(set(processed_data[k])))

JOHN 4.713708270525782 0.1976218444307212 227
VIGGO 4.912649858322166 0.1990259766088584 195
IOSEF 3.51780442908033 0.22664439965651456 57
AURELIO 4.4031958092365135 0.20765389486425784 45
MANAGER 3.3048531186716414 0.26057193735152673 58
WAITRESS 3.1140849070892394 0.26478745007431864 21
CAPTAIN 2.950061336806725 0.2191558932377404 32
MARCUS 4.042232293007286 0.22434541459482674 74
HARRY 4.413165122291295 0.24851339037037737 26
VIKTOR 2.812274379151498 0.23416884707174576 28
EDWARDO 3.46725439817502 0.2278626813105147 44
CHARLIE 4.000871888667173 0.2518701945473562 37
JENNY 2.6561951644328383 0.22636821578402522 30
JIMMY 3.8771157805157253 0.24276025696229905 20
BANK MANAGER 1.8860229615871291 0.24158349352241612 5
EDDIE 2.249099916781039 0.23000381885945975 10


In [15]:
# The encoding is given explicitly because open()'s default depends on the
# platform locale, whereas JSON text is UTF-8.
with open('Wolf-of-Wall-Street,-The_script.json', encoding='utf-8') as f:
    story = json.load(f)

# Clean and stem the text under each 'dialogues' key, then build the
# all-pairs similarity tables.
processed_data = script_cleaning(story['dialogues'])
data_structs_j, data_structs_c = prep_data_structs(processed_data)
#create_heatmap(data_structs_c)
#create_heatmap(data_structs_j)
# Per key: the key, its summed cosine similarity to every key (itself
# included), the standard deviation of those scores, and its number of
# distinct stems.
for k in data_structs_c.keys():
    cosine_scores = list(data_structs_c[k].values())
    print(k, sum(cosine_scores), np.std(cosine_scores), len(set(processed_data[k])))

JORDAN 7.737368620566339 0.19367404295659255 1477
DONNIE 6.180322370977525 0.20237925536677015 291
NAOMI 5.402352301658026 0.19813534443600644 207
AGENT DENHAM 4.308255125616418 0.1948103688198134 125
TERESA 4.5798295507525735 0.19601574501432853 55
AUNT EMMA 3.72653910463981 0.19464654407763363 62
MAX 4.09784870799753 0.20692733331507054 80
BRAD 3.604241401915729 0.19975221121151998 53
BO DIETL 4.7062671289473 0.19656291591819847 92
SAUREL 3.3124262195981253 0.19329198618581217 115
MARK HANNA 4.945603129369456 0.19954630645323737 136
JORDAN  (CONT'D) 4.795907067225681 0.1945248018747822 102
SEA OTTER 3.917100678867187 0.19501143118097594 39
STEVE MADDEN 2.506547065147964 0.19904919173324082 52
RUGRAT 2.4691689987839855 0.1996281367782609 28
GENE HACKMAN 1.21884150769059 0.20696076533739766 19
JERRY FOGEL 3.7470869893867906 0.20214439341642976 34
DWAYNE 3.228153079864293 0.19886777640057832 54
CHESTER MING 3.2503499101417836 0.19490574866114882 27
JANET 2.2893351244290714 0.20336032709

In [17]:
# The encoding is given explicitly because open()'s default depends on the
# platform locale, whereas JSON text is UTF-8.
with open('How-to-Train-Your-Dragon-2_script.json', encoding='utf-8') as f:
    story = json.load(f)

# Clean and stem the text under each 'dialogues' key, then build the
# all-pairs similarity tables.
processed_data = script_cleaning(story['dialogues'])
data_structs_j, data_structs_c = prep_data_structs(processed_data)
#create_heatmap(data_structs_c)
#create_heatmap(data_structs_j)
# Per key: the key, its summed cosine similarity to every key (itself
# included), the standard deviation of those scores, and its number of
# distinct stems.
for k in data_structs_c.keys():
    cosine_scores = list(data_structs_c[k].values())
    print(k, sum(cosine_scores), np.std(cosine_scores), len(set(processed_data[k])))

HICCUP 4.981710506036841 0.20795574842536127 457
STOICK 4.483731055516061 0.22250188189607745 226
VALKA 4.67898571402542 0.22368359353188003 232
ASTRID 4.189342420675554 0.2203174168130256 144
ERET 4.344440863489888 0.2276038839406494 166
DRAGO 3.9099432528016553 0.24000232950532957 93
GOBBER 3.2869439779245595 0.23542587153745714 166
RUFFNUT 3.033708305593434 0.23550098630051775 46
FISHLEGS 3.0282447404375157 0.2336490467310706 45
TUFFNUT 3.2436644741119225 0.2281556897532403 49
SNOTLOUT 2.5016563980603 0.24670943021879682 40
