-
Notifications
You must be signed in to change notification settings - Fork 6
/
script_processor_v2.py
125 lines (113 loc) · 5.41 KB
/
script_processor_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import json
import operator
def get_characters(script, threshold_char=5, threshold_space=-1):
with open(os.getcwd() + "/scripts/" + script, "r") as s:
script = s.read()
poss_char = {}
for line in script.split("\n"):
if line.count(" ") > threshold_space:
key = line.strip(" ").replace(line[line.find("("):line.find(")") + 1], "").strip(" ").replace("\t", "").strip(" ")
if key not in poss_char:
poss_char[key] = 1
else:
poss_char[key] += 1
sorted_poss_chars = sorted(poss_char.items(), key=operator.itemgetter(1), reverse=True)
chars = []
for char_tuple in sorted_poss_chars:
if char_tuple[1] > threshold_char:
chars.append(char_tuple)
else:
break
return [char[0] for char in chars if char[0] != "" and "-" not in char[0] and "." not in char[0] and ":" not in char[0] and "?" not in char[0] and len(char[0]) > 1]
#Alternate Approach which works equally well.
"""
script = open('NLP-Movie_Scripts/scripts/'+file,'r')
for line in script:
temp = line.strip()
if 'thor'.upper() in temp:
if (len(temp.split(' ')) <= 2) & (temp.split(' ')[0]=='thor'.upper()):
print(temp)
"""
movies_diag = {}
for movie in os.listdir(os.getcwd() + "/scripts/"):
movies_diag[movie] = {}
if movie[-4:] == ".txt":
with open(os.getcwd() + "/scripts/" + movie, "r") as s:
script = s.read()
#create a string containing character names seperated by commas
characters = ','.join([name for name in get_characters(movie) if len(name.split(' ')) < 5])
# data structure to store the dialogues
dialogues = {}
for ch in characters.split(','):
if 'Mr.' in ch or '-' in ch:
continue
else:
name = ch
dialogue = ''
flag = 0
count = 0
for line in script.split("\n"):
temp = line.strip(" ").replace(line[line.find("("):line.find(")") + 1], "").strip(" ").replace("\t", "").strip(" ")
if temp.isupper(): # If Character Intro
if name.upper() == temp:
flag = 1
count += 1
dialogue += '\n['+str(count)+']'
continue
else: # If it's another character
flag = 0
continue
if flag:
if flag == 1:
if line.strip(): # Accounts for first lines == "" or == " "
if line.lstrip("\t").lstrip(" ")[0] == "(" or line.rstrip(" ")[-1] == ")": # To still get the dialogue
ident_level_first = None # when the first line below a character
flag = 0 # intro is something like "(looking around with a grin)"
else: # which describes how the character is acting but it is not his dialogue
if "\t" in line: # To account for format where there are "\t" instead of " "
ident_level_first = line.count("\t")
else:
ident_level_first = len(line) - len(line.lstrip(" "))
else:
ident_level_first = None
flag = 0
if "\t" in line:
ident_level = line.count("\t")
else:
ident_level = len(line) - len(line.lstrip(" "))
if ident_level_first:
if ident_level == ident_level_first:
dialogue += temp + " "
else:
continue
else:
pass
flag += 1
dialogues[name] = dialogue
movies_diag[movie]['dialogues'] = dialogues
#information available on each movie ---- dict_keys(["dialogues"])
for movie in movies_diag.keys():
with open(os.getcwd() + "/diag_jsons/" + movie.replace('.txt','.json'), 'w') as fp:
json.dump(movies_diag[movie], fp)
# Deletes scripts with weird formats
# Spotted after inspection by doing
#a = []
#for i in os.listdir(os.getcwd() + "/diag_jsons/"):
# with open(os.getcwd() + "/diag_jsons/" + i, "r") as s:
# if len(s.read()) < 2000:
# a.append(i)
#os.remove(os.getcwd() + "/scripts/" + "Road,-The_script.txt")
#os.remove(os.getcwd() + "/scripts/" + "True-Grit_script.txt")
#os.remove(os.getcwd() + "/scripts/" + "Armageddon_script.txt")
#os.remove(os.getcwd() + "/diag_jsons/" + "Road,-The_script.json")
#os.remove(os.getcwd() + "/diag_jsons/" + "True-Grit_script.json")
#os.remove(os.getcwd() + "/diag_jsons/" + "Armageddon_script.json")
# Deletes more problematic scripts... spotted
# when doing Flesch-Kincaid Grade Level
#probs = ['9', 'Alien-Resurrection', 'American-History-X', 'American-Psycho', 'Big',
#'Dogma', 'Ex-Machina', 'Finding-Nemo', 'Gangs-of-New-York', 'Ghostbusters', 'Gravity',
#'Heavenly-Creatures', 'Kids', 'Lock,-Stock-and-Two-Smoking-Barrels', 'Office-Space', 'Titanic']
#for script in probs:
# os.remove(os.getcwd() + "/scripts/" + script + "_script.txt")
# os.remove(os.getcwd() + "/diag_jsons/" + script + "_script.json")