-
Notifications
You must be signed in to change notification settings - Fork 7
/
script_manager.py
79 lines (71 loc) · 2.93 KB
/
script_manager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
'''
This module handles the script files management. It has methods to scrape the scripts off www.imsdb.com and load saved scripts
Author: Prithvijit Chakrabarty (prithvichakra@gmail.com)
'''
from bs4 import BeautifulSoup as bs44
from bs4 import BeautifulSoup as bs
import urllib2
import os
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
import random
DESC_TAG = 'DESCRIPTION'
#Method to download all scripts from all genres hosted at www.imsdb.com
#This is quite crude and specific to parsing the imsdb format
def get_scripts():
save_to = 'C:\\School\\My Work\\Computer\\Datasets\\cornell_movie_dialogs_corpus\\Scripts2\\'
script_list = urllib2.urlopen('http://www.imsdb.com/all%20scripts/')
soup = bs.BeautifulSoup(script_list)
count = 0
lost = []
for link in soup.findAll('a'):
url = link.get('href')
if url.startswith('/Movie Scripts/'):
name = (url.split('/')[2])[:-11].strip()
name = name.replace(':','')
name = name.replace('?','')
file_name = name
name = '-'.join([n.strip() for n in name.split(' ')])
print '--',name
try:
html = urllib2.urlopen('http://www.imsdb.com/scripts/'+name+'.html').read()
sp = bs44(html,"html.parser")
text = sp.get_text()
lines = [''.join([i if ord(i)<128 else ' ' for i in line]) for line in text.splitlines()]
filtered = []
started = False
for line in lines:
if line.upper().strip() == 'ALL SCRIPTS':
started = True
elif line.startswith('Writers : '):
started = False
if started == True:
filtered.append(line)
script = '\n'.join(filtered)
print file_name
with open(save_to+file_name+'.txt','a') as sc_file:
sc_file.write(script)
except urllib2.HTTPError:
count = count+1
lost.append(file_name)
print lost
#Method to load script(s). Returns a list of tuples: (movie name,lines of script)
#Setting count = N will return N random scripts
#Specifiying a name will load only that script
def load_scripts(path,count=None,name=None):
scripts = []
if not name is None:
return [(name,open(path+name).readlines())]
files = os.listdir(path)
if count is None:
count = len(files)
else:
random.shuffle(files)
for sc_file in files[:count]:
scripts.append((sc_file,open(path+sc_file).readlines()))
return scripts
#Auxiliary method to remove all non-ASCII characters in text - cleans unknown characters
def clean(text):
return ''.join([c if ord(c)<128 else '' for c in text])