-
Notifications
You must be signed in to change notification settings - Fork 1
/
casts.py
611 lines (603 loc) · 26.8 KB
/
casts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
# casts.py
# version 0.1.2
# now handles the minority of cast lists organized as html tables rather than html lists.
# minor additions to genderwords.json
# excludes a few more "nonstandard" lines that are not actor/ character pairs.
# problem: rising % of "unknown" gender over time (from 2.5% -> 5.5% 1930-2018, US)
# problem: rising % of mismatched actor and character genders over time (1.5% -> 2.5% US)
# problem: no case distinctions e.g., chick-> female; Chick-> male.
# problem: multiple characters for same actor; divided sometimes by /, sometimes indented lists
# 1. reads a series of text files of cast lists from wikipedia movie .html files and parses each line into 3 segments
# - actor
# - character
# - other description
# 2. assigns genders to actor and character names
# (female, mostly_female, andy, mostly_male, male, unknown)
# counts how often actor and character genders match, or partly match
# eventually will:
# 3. count how often the character's name (first, last, both) appear in plot summary
# eventually even attribute "he","she","her", etc to that character
# 4. associate each character with one or more "occupations"
# codes some ~30,000 "jobtitles" into 2010 3-digit U.S. Census codes
#
# arg= prefix is a filename prefix to output (& input) files
# e.g. python3 casts.py NYT
#
# input:
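# prefix+files.txt (e.g., USfiles.txt)= a file listing, one per line, the filenames of the text files (cast lists) to be read and coded for gender.
# genderwords.json= supplemental/corrected entries, stored as "name": "gender" pairs, that are looked up before gender_guesser is consulted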
#
# to compile casts.py:
# uses python standard packages: sys json
# uses python packages that must be downloaded and installed: gender_guesser BeautifulSoup
# e.g. pip install BeautifulSoup4
# e.g. pip install gender-guesser (see: https://pypi.org/project/gender-guesser/)
#
#
import json
import sys
import gender_guesser.detector as gender
# gender_guesser detector; lookups ignore the case of the name
g = gender.Detector(case_sensitive=False)
# casts.py must be called with one argument, the filename prefix
# (e.g., US, Fr, SK, etc.) shared by its input and output files.
Nargs = len(sys.argv)
if Nargs < 2:
    print("You need to call casts.py with an argument giving the filename prefix")
    print(" e.g., python3 casts.py movies")
    print(" exiting...")
    # exit with a non-zero status so calling shells/scripts can see the failure
    sys.exit(1)
prefix = sys.argv[1]
print("prefix: ", prefix)
# to strip html code from text:
from bs4 import BeautifulSoup
#
# Korean surnames that would otherwise be coded as English given names
# (only consulted when prefix is "SK"):
notgivenname = [
    'Gil', 'Kim', 'Lee', 'Park', 'Choi', 'Jung', 'Kang', 'Cho', 'Jo', 'Yoon',
    'Jang', 'Iim', 'Han', 'Oh', 'Seo', 'Shin', 'Kwon', 'Hwang', 'Ahn', 'Song',
    'Jeon', 'Hong', 'Yu', 'Ko', 'Moon', 'Yang', 'Bae', 'Baek', 'Heo', 'Yoo',
    'Nam', 'Sim', 'Rho', 'Ha', 'Kwak', 'Sung', 'Cha', 'Choo', 'Joo', 'Wu',
    'Koo', 'Min', 'Ku', 'Na', 'Keum', 'Chae', 'Chun', 'Bang', 'Kong', 'Yeon',
    'Yeo', 'On', 'Lm', 'Gi', 'Chu', 'Do', 'Seong', 'Seon', 'Shim', 'Ji',
    'Bok', 'Bong', 'Goo', 'Kil', 'Noh', 'Pyo', 'Ra', 'Ri', 'Ro', 'Uhm',
]
#
# a line is dropped as "nonstandard" when its first word (lower-cased) is one of these:
nonstandard = [
    'cast', '(cast', 'actor', 'additional', 'additionally', 'cameo', 'cameos',
    'casting', 'categories', 'character', 'featuring', 'list', 'main', 'minor',
    'notes', 'opening', 'other', 'others', 'retrieved', 'role', 'starring',
    'supporting', 'unbilled', 'uncredited', 'voice', 'voices', '@media',
    '[note',
]
# ... or when its first two words (case kept) are one of these:
nonstandard2 = [
    'The cast',
    'The film',
    'As appearing',
    'See also',
]
# first words that also change the status given to the roles listed below them:
uncredited = ['unbilled', 'uncredited']
cameos = ['cameo', 'cameos']
minor = [
    'additional', 'additionally', 'minor', 'other', 'others', 'supporting',
]
#
# actorend: markers that separate the actor segment from the character segment.
# key = the marker text, value = how many characters to skip from the start of
# the marker to reach the character name.  " as " is the usual marker; the
# others are dashes, runs of periods and a colon.
# NOTE(review): the "â" entries look like a mis-decoded dash -- kept exactly as found.
actorend = {
    " as ": 4,
    " - ": 3,
    "---": 3,
    "...": 4,
    ":": 1,
    "â": 6,
    "—": 1,
    "–": 1,
}
# charend: markers that separate the character name from any additional
# material on the same line
charend = [
    ",",
    ":",
    ";",
    "–",
    "â",
    " - ",
    "(",
    " who ",
]
# crosstab of actor gender (7 rows) by character gender (7 columns), stored flat:
# cell = 7*row + column
xtab = [0] * 49
####################################
# 3 output files:
#
# Open the four output files; each one is named prefix + a fixed suffix and
# gets its header row as soon as it is opened.
# NOTE(review): the header fields below are separated by single spaces although
# the notes in this file describe the output as tab delimited -- confirm.
#
# prefix+names.xls: name-level summary, one row per name found, written after
# all casts have been processed
namescolumns = ("iname", "name", "gender", "Nfound", "Nactor", "Nchar",
                "Nfemale", "Nmfemale", "Nandy", "Nmmale", "Nmale", "Nunknown",
                "PairScore", "PctFemale", "PctMale")
namesfile = "".join((prefix, "names.xls"))
namesf = open(namesfile, "w")
namesf.write(" ".join(namescolumns) + "\n")
#
# prefix+characters.xls: detailed output, one record per character, written
# while each cast file is processed
charcolumns = ("filename", "status", "AName", "Actor", "AGender", "CName",
               "Character", "ChGender", "Final", "additional")
charfile = "".join((prefix, "characters.xls"))
charf = open(charfile, "w")
charf.write(" ".join(charcolumns) + "\n")
#
# prefix+nonstandard.xls: lines that could not be read as "actor as character"
# (or actor - character ...), kept in a separate file to figure out what is wrong
nonstandardfile = "".join((prefix, "nonstandard.xls"))
nonstandardf = open(nonstandardfile, "w")
nonstandardf.write("filename nonstandard line:\n")
#
# prefix+casts.xls: text-level summary, one line of counts per cast file
# need to check for Nchars=0 or =1; that movie was not processed correctly
textscolumns = ("Nlines", "Nblanks", "Nread", "Nwords", "Nactors", "Nchars",
                "Nmiss", "Nextra", "Nnonstandard", "movie")
textsfile = "".join((prefix, "casts.xls"))
textsf = open(textsfile, "w")
textsf.write(" ".join(textscolumns) + "\n")
####################################
# 2 input files:
#
# genderwords.json: supplemental (corrected) names, titles and words, each with
# an assigned gender (Queen, Princess, King, Prince are already in gender_guesser).
# The file is read as one long string, then parsed into the dict namegender.
# NOTE(review): both input files are read with the platform default encoding,
# as before -- confirm that matches how the files were saved.
with open("genderwords.json") as genderwordsf:
    genderwords = genderwordsf.read()
namegender = json.loads(genderwords)
print ('\nnamegender (names with fixed gender)= ' + str(type(namegender)) + ' Nlines=' + str(len(namegender)) )
#
#
# prefix+files.txt: list of filenames of the Wikipedia cast lists that casts.py
# will read, one filename per line
infilelist = prefix + "files.txt"
with open(infilelist) as filelistf:
    files = filelistf.read()
# files is a long string with all file names; keep one entry per non-blank line.
# (Slicing off the last element, as was done before, lost the final filename
# whenever the list did not end with a newline, and blank lines were kept as
# empty filenames.)
textfiles = [filename.strip() for filename in files.splitlines() if filename.strip()]
# number of files:
Ntextfiles = len(textfiles)
# running total of text lines over all cast files
Sumtextlines = 0
print ('\nList of cast files to be analyzed: ' + infilelist + " " + str(type(textfiles)) + ", #files= " + str(len(textfiles)) )
#
# Parallel lists, one entry per distinct name found (as an actor's or as a
# character's name).  Entry 0 is a placeholder: the empty name, gender "unknown".
foundnames = [""]
gendernames = ["unknown"]
# every tally starts as its own one-element list [0], matching the placeholder
(countnames, countanames, countcnames,
 countmales, countmmales, countmfemales,
 countfemales, countandy, countunknown) = ([0] for _ in range(9))
# total number of characters found over all cast files
Sumcharacters = 0
# ---------------------------------------------------------------------------
# helpers for the per-file loop below.  They use tables defined earlier in this
# file: prefix, notgivenname, namegender, g, foundnames, gendernames and the
# parallel count* lists.
# ---------------------------------------------------------------------------

# gender labels in crosstab order; any other label goes to slot 6 ("miss")
_genderorder = ("female", "mostly_female", "andy", "mostly_male", "male", "unknown")
# the per-name tally list that belongs to each label above (same order)
_gendercounts = (countfemales, countmfemales, countandy, countmmales, countmales, countunknown)
_femalelike = ("female", "mostly_female")
_malelike = ("male", "mostly_male")
_noinfo = ("unknown", "missing")

# name -> position in foundnames, so a name is looked up without rescanning the
# list for every actor and character; setdefault keeps the first position,
# which is what foundnames.index() would return.
_nameindex = {}
for _ifound, _found in enumerate(foundnames):
    _nameindex.setdefault(_found, _ifound)


def _squeeze_spaces(text):
    """Return text with every run of spaces reduced to a single space."""
    while "  " in text:
        text = text.replace("  ", " ")
    return text


def _strip_marker_residue(segment):
    """Drop one trailing period and any leading '-' or '.' left over from a marker."""
    if segment.endswith("."):
        segment = segment[:-1]
    return segment.lstrip("-.")


def _guess_gender(namewords):
    """Return (name, gender) for the first word in namewords with a known gender.

    Words are tried in order.  Skipped as "unknown": Korean surnames in
    notgivenname (only when prefix is "SK"), wiki footnotes (start with "[")
    and possessives (end in 's or s').  Every other word is looked up in
    namegender first and, if absent there, in gender_guesser.  If no word
    yields a gender other than "unknown", the first word is returned with
    gender "unknown".
    """
    gender = "unknown"
    name = ""
    for name in namewords:
        if prefix == "SK" and name in notgivenname:
            gender = "unknown"
        elif name.startswith("["):
            gender = "unknown"
        elif name.endswith(("'s", "s'")):
            gender = "unknown"
        else:
            # quotes around nicknames, e.g. 'Doc': replace them with spaces and
            # trim, otherwise the padded word (" Doc ") never matches a lookup
            name = name.replace("'", " ").strip()
            gender = namegender.get(name.lower())
            if gender is None:
                # not in genderwords.json; last place to look is gender_guesser
                gender = g.get_gender(name)
        if gender != "unknown":
            break
    if gender == "unknown":
        # ? but maybe the second word if the first is just a title or stopword
        name = namewords[0]
    return name, gender


def _register_name(name, gender, as_actor):
    """Count one more occurrence of name and return its index in foundnames.

    A name not seen before is appended to foundnames / gendernames with all
    tallies at zero.  as_actor selects whether countanames or countcnames is
    incremented along with countnames.
    """
    iname = _nameindex.get(name)
    if iname is None:
        iname = len(foundnames)
        foundnames.append(name)
        gendernames.append(gender)
        countnames.append(0)
        countanames.append(0)
        countcnames.append(0)
        for tally in _gendercounts:
            tally.append(0)
        _nameindex[name] = iname
    countnames[iname] += 1
    if as_actor:
        countanames[iname] += 1
    else:
        countcnames[iname] += 1
    return iname


def _gender_slot(gender):
    """Return the crosstab slot (0-5) of a gender label, or 6 for any other value."""
    if gender in _genderorder:
        return _genderorder.index(gender)
    return 6


def _final_gender(agender, cgender):
    """Combine actor and character gender into a best-guess label for the line.

    Returns "" when the actor gender is recognized but the character gender is
    not one of the labels handled for it.
    """
    if agender in _femalelike:
        if cgender in _femalelike or cgender == "andy":
            return "female"
        if cgender in _noinfo:
            return "likely_female"
        if cgender in _malelike:
            return "mismatch"
        return ""
    if agender in _malelike:
        if cgender in _malelike or cgender == "andy":
            return "male"
        if cgender in _noinfo:
            return "likely_male"
        if cgender in _femalelike:
            return "mismatch"
        return ""
    if agender == "andy":
        if cgender in _femalelike:
            return "female"
        if cgender in _noinfo or cgender == "andy":
            return "unknown"
        if cgender in _malelike:
            return "male"
        return ""
    if agender in _noinfo:
        if cgender in _femalelike:
            return "likely_female"
        if cgender in _noinfo or cgender == "andy":
            return "unknown"
        if cgender in _malelike:
            return "likely_male"
        return ""
    return "unknown"


# ---------------------------------------------------------------------------
# main loop: read each cast file, split every line into actor / character /
# additional, code genders, and write one row per line to prefix+characters.xls
# NOTE(review): the nesting below assumes gender coding and the output row apply
# only to lines that are neither blank nor common nonstandard lines -- confirm.
# NOTE(review): output fields are separated by single spaces, as in the header
# rows, although the notes in this file say tab delimited -- confirm.
# ---------------------------------------------------------------------------
for file in textfiles:
    Nactors = 0
    Ncharacters = 0
    Nnochar = 0
    Nadditional = 0
    Nnonstandard = 0
    Nblanklines = 0
    # status records the prominence of the actor / character.
    # wikipages are inconsistent in indicating prominence: some casts ignore
    # secondary or uncredited roles, others label them, others mix them in.
    # non-main roles are indicated in two ways:
    #   a separate line (e.g., Uncredited:) below which all roles get that status (statuslist)
    #   or a word on the actor / character line itself, for that line only (statusline)
    status = "main"
    statustype = "statuslist"
    # write blank line to characters file to separate each movie:
    charf.write("\n")
    # todo: work with different codings, e.g. open(file, encoding="ISO-8859-1")
    with open(file) as castf:
        text0 = castf.read()
    # drop html; first, to accommodate the few movies that use tables instead
    # of lists, turn the end of a table cell into a " : " segment marker:
    text1 = text0.replace('\n</td>\n', '</td> : ')
    text1 = text1.replace('</td>\n', '</td> : ')
    text1 = BeautifulSoup(text1, features="html.parser")
    textstring = text1.get_text()
    Nwords = textstring.count(" ")
    Nlines = textstring.count("\n")
    textstring = textstring.replace('/', ' / ')  # often a marker between segments or multiple roles
    textstring = textstring.replace('[', ' [')   # wikipedia footnotes: make it a separate word
    textstring = textstring.replace(':', ' : ')  # make the colon a separate word
    textstring = textstring.replace('\t', ' ')   # tabs create problems for casts.py
    #
    textlines = textstring.splitlines()
    Ntextlines = len(textlines)
    Sumtextlines = Sumtextlines + Ntextlines
    #
    # process each line in the file:
    iline = 0
    for line in textlines:
        # single spaces only, so that " as " and split(" ") behave
        line = _squeeze_spaces(line.strip())
        words = line.split(" ")
        Nwordline = len(words)
        if statustype == "statusline":
            # status was changed for the previous line only; back to main
            status = "main"
            statustype = "statuslist"
        wordtest = words[0].lower()
        words2 = words[0] + " " + words[1] if Nwordline >= 2 else ""
        lenline = len(line)
        actor = ""
        char = ""
        aname = ""
        additional = ""
        if lenline <= 1:
            Nblanklines = Nblanklines + 1
            continue
        if words2 in nonstandard2:
            Nnonstandard = Nnonstandard + 1
            nonstandardf.write(file + " common nonstandard2 line: " + str(line) + "\n")
            continue
        if wordtest in nonstandard:
            Nnonstandard = Nnonstandard + 1
            nonstandardf.write(file + " common nonstandard line: " + str(line) + "\n")
            # some of these lines are headings that set the status of the roles below
            if wordtest in uncredited:
                status = "uncredited"
                statustype = "statuslist"
            elif wordtest in minor:
                status = "supporting"
                statustype = "statuslist"
            elif wordtest in cameos:
                status = "cameo"
                statustype = "statuslist"
            elif wordtest == "main":
                status = "main"
                statustype = "statuslist"
            continue
        #
        # not a blank line and not a common nonstandard cast line
        iline = iline + 1
        #
        # find the separation marker between actor and character segments;
        # if more than one marker occurs, use the one found first in the line.
        # endactor= position of that marker (lenline if none)
        # skip= number of characters from the marker to the character name
        endactor = lenline
        skip = 0
        for testmark, width in actorend.items():
            endtest = line.find(testmark)  # -1 when testmark is not in the line
            if endtest != -1 and endtest < endactor:
                endactor = endtest
                skip = width
        if endactor == lenline:
            # no actor/character segment marker found
            if Nwordline >= 6:
                # too many words for a bare actor name; can't figure out what
                # this line is, save it to examine later (actor and char stay "")
                nonstandardf.write(file + " longline; endline=" + str(lenline) + " " + line + "\n")
                Nnonstandard = Nnonstandard + 1
            else:
                # probably only an actor listed with no character
                actor = line
                char = "missing"
                Nnochar = Nnochar + 1
        elif endactor > 80:
            # segment marker is too deep into the line; save to nonstandard file
            nonstandardf.write(file + " longact; endactor=" + str(endactor) + " " + line + "\n")
            # counted like every other line written to the nonstandard file
            Nnonstandard = Nnonstandard + 1
        else:
            actor = line[:endactor].strip()
            Nactors = Nactors + 1
            #
            # next, find the marker that ends the character name and starts any
            # additional material.  search only after the actor's name since a
            # few actor segments have a comma ( , Jr. )
            # forward slash usually means multiple characters for the same actor; deal with that later.
            startchar = endactor + skip
            rest = line[startchar:]
            Nwordchar = rest.count(" ")
            endchar = lenline  # position within rest; lenline means no marker found
            for testmark in charend:
                endtest = rest.find(testmark)
                if endtest != -1 and endtest < endchar:
                    endchar = endtest
            #
            # sometimes a period separates the character from additional text,
            # but it could also be a title (Mrs. Jr. Gen.) or initials (J.R.),
            # so periods are not used as markers.
            if endchar == lenline:
                # no comma, colon, etc., probably no additional material
                if Nwordchar >= 8:
                    # too many words describing the character; char stays ""
                    nonstandardf.write(file + " char 8words+: " + str(line) + "\n")
                    Nnonstandard = Nnonstandard + 1
                else:
                    char = rest.strip()
                    Ncharacters = Ncharacters + 1
            else:
                # a marker separates character and additional material
                char = rest[:endchar].strip()
                Ncharacters = Ncharacters + 1
                if line[endchar + startchar] == "(":
                    # unlike other markers, keep the left paren in additional
                    startchar = startchar - 1
                additional = line[1 + endchar + startchar:].strip()
                chkadd = additional.lower()
                Nadditional = Nadditional + 1
                # a status word on the line itself applies to this line only
                if chkadd.find("uncredited") != -1:
                    status = "uncredited"
                    statustype = "statusline"
                if chkadd.find("cameo") != -1:
                    status = "cameo"
                    statustype = "statusline"
        #
        # line is now separated into 3 segments: full actor name, full
        # character name (if any), additional words (if any).
        # fix some problems with markers that extend into char or additional
        char = _strip_marker_residue(char)
        additional = _strip_marker_residue(additional)
        #
        # actor: find first gendered name (e.g., Mary), title (e.g. Mrs.), or word (e.g., mother)
        if actor != "":
            actor = actor.replace('\t', ' ')  # tabs cause problems so replace with a space
            actor = actor.replace('"', ' ')
            actor = actor.strip()
            anames = actor.split(" ")  # this will unfortunately separate some compound names
            if prefix == "SK":
                anames.reverse()
            aname, agender = _guess_gender(anames)
            ianame = _register_name(aname, agender, True)
        else:
            # no actor was parsed from this line: use the placeholder entry 0
            # rather than the gender and index left over from the previous line
            agender = "missing"
            ianame = 0
        #
        # character: find first gendered name | title
        if char == "":
            cgender = "unknown"
            cname = "missing"
            icname = 0
        else:
            char = char.replace('\t', ' ')  # rarely a tab separates words in names
            char = char.replace('"', ' ')
            char = char.strip()
            cnames = char.split(" ")  # this will separate some compound names
            if prefix == "SK":
                cnames.reverse()
            cname, cgender = _guess_gender(cnames)
            icname = _register_name(cname, cgender, False)
        #
        # tally the actor's gender under the character's name, the character's
        # gender under the actor's name, and the pair in the crosstab
        arow = _gender_slot(agender)
        if arow < 6:
            _gendercounts[arow][icname] += 1
        ccol = _gender_slot(cgender)
        if ccol < 6:
            _gendercounts[ccol][ianame] += 1
        xtabcell = 7 * arow + ccol
        xtab[xtabcell] = xtab[xtabcell] + 1
        #
        # assign this actor/character line a best guess gender
        finalgender = _final_gender(agender, cgender)
        charf.write(" ".join([file, status, aname, actor, agender, cname, char, cgender, finalgender, additional]) + "\n")
    # end of loop through all lines in a file
    filecounts = [Nlines, Nblanklines, iline, Nwords, Nactors, Ncharacters, Nnochar, Nadditional, Nnonstandard]
    textsf.write(" ".join(str(count) for count in filecounts) + " " + file + "\n")
    print("Nlines={} Nblanks={} iline={} #words={} #actors={} #characters={} #missing={} #additnl={} #nonstandard lines={} {}".format(
        Nlines, Nblanklines, iline, Nwords, Nactors, Ncharacters, Nnochar, Nadditional, Nnonstandard, file))
    Sumcharacters = Sumcharacters + Ncharacters
# name-level summary: one row per name in foundnames, written to namesf in the
# column order of its header row
# (iname name gender Nfound Nactor Nchar Nfemale Nmfemale Nandy Nmmale Nmale
#  Nunknown PairScore PctFemale PctMale)
print ("# movies(files)=" + str(len(textfiles)))
Sumnames = len(foundnames)
for iname in range(Sumnames):
    Nfound = countnames[iname]
    # pairscore is a summary measure of whether the paired name (e.g. the
    # character's when the found name is the actor's) is female (negative scores=male)
    pairscore = 2*countfemales[iname] + countmfemales[iname] - countmmales[iname] - 2*countmales[iname]
    # pctfemale / pctmale: the female + mostly_female (male + mostly_male)
    # tallies as a % of the times the name was found; can be used to identify
    # errors in male (female) names.
    # pctmale + pctfemale <=100 because of andy, unknown
    if Nfound > 0:
        pctfemale = 100*(countfemales[iname] + countmfemales[iname])/Nfound
        pctmale = 100*(countmales[iname] + countmmales[iname])/Nfound
    else:
        # placeholder entry 0 can have a zero count; avoid dividing by zero
        pctfemale = 0
        pctmale = 0
    fields = [iname, foundnames[iname], gendernames[iname], Nfound,
              countanames[iname], countcnames[iname],
              countfemales[iname], countmfemales[iname], countandy[iname],
              countmmales[iname], countmales[iname], countunknown[iname],
              pairscore,
              # the header has always listed these two columns
              round(pctfemale, 1), round(pctmale, 1)]
    lineout = " ".join(str(field) for field in fields) + "\n"
    namesf.write (lineout)
# last write to the names file: close it so the rows are flushed to disk
namesf.close()
# final report to the screen: totals, then the crosstab of actor gender (rows)
# by character gender (columns) accumulated in xtab
print ("\n# unique names=", str(len(foundnames)))
xls = prefix + "names.xls;"
print ("See", xls, "sort on gender, pairscore to revise unknown and andy")
print ("See", xls, "sort on gender, pctmale or gender, pctfemale to check male&female errors\n")
#
print (str(Sumcharacters), "movie actor/character pairs out of", str(Sumtextlines), "lines in the", str(len(textfiles)), "cast files.")
# crosstabs of actor gender X character gender
coltotal = [0] * 7
print ("\n character gender:")
# column order matches how cells are filled (cell = 7*row + column):
# mostly_male is column 3 and male is column 4, the same order as rowlabel
print ("actor: female mfemale andy mmale male unknown miss rowtotal")
rowlabel = ["female", "mfemale", "andy", "mmale", "male", "unknown", "miss", "rowtotal"]
for rowint in range(7):
    cells = xtab[7*rowint:7*rowint + 7]
    rowtotal = sum(cells)
    for icol, cell in enumerate(cells):
        coltotal[icol] = coltotal[icol] + cell
    print (rowlabel[rowint], *[str(cell) for cell in cells], str(rowtotal), sep=" ")
total = sum(coltotal)
print ("\ntotals", *[str(colsum) for colsum in coltotal], str(total), sep=" ")
#
xls = prefix + "characters.xls"
print ("\nSee", xls, "for each movie character.")
print (" Sort by Cname Chgender Agender Aname (or by Aname Agender Chgender Cname) to identify misclassified names.")
# nothing more is written: close the output files explicitly instead of
# relying on interpreter shutdown (closing an already closed file is harmless)
for outf in (namesf, charf, nonstandardf, textsf):
    outf.close()
print ("\n exiting...")
sys.exit()