# A Notebook for exploring SpaCy in attempting to model The Bechdel Test

Following this [article from DataQuest](https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/) has been the source of inspiration for bringing in SpaCy and TfidfVectorizer for this. TfidfVectorizer, as it weighs the occurrence of words within a document in its calculations (think character names in screenplays), will be incredibly useful in our project.

In [215]:
# Let's bring our mates
import textblob
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import spacy
import re
import itertools
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [146]:
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [147]:
punctuations = string.punctuation

In [148]:
nlp = spacy.load("en_core_web_lg")

### Looking at Named Entity Recognition (NER) Below:

We are going to bring in a gender-guesser function so that we can create a feature that counts the number of women in the film. This would allow for us to see just how many women are in the script (the more women, the higher the chance for passing The Bechdel Test!)

In [97]:
with open('../data/Scripts/2012.TXT', 'r') as movie:
    movie = movie.read()

In [149]:
movie_doc = nlp(movie)

In [103]:
[(ent.text, ent.label_) for ent in movie_doc.ents if ent.label_ == 'PERSON']

[('Harald Kloser\n\n                                                           ',
  'PERSON'),
 ('Mozart', 'PERSON'),
 ('Mozart', 'PERSON'),
 ('FREDERIC WEST', 'PERSON'),
 ('West', 'PERSON'),
 ('SATNAM TSURUTANI', 'PERSON'),
 ('SATNAM\n\n                    8200 feet', 'PERSON'),
 ('West', 'PERSON'),
 ('Satnam', 'PERSON'),
 ('WEST\n\n                    Helmsley', 'PERSON'),
 ('SATNAM\n\n                    ', 'PERSON'),
 ('SATNAM', 'PERSON'),
 ('SATNAM\n\n                    ', 'PERSON'),
 ('Satnam', 'PERSON'),
 ('SATNAM', 'PERSON'),
 ('ADRIAN HELMSLEY', 'PERSON'),
 ('ADRIAN\n\n                    ', 'PERSON'),
 ('Wilson', 'PERSON'),
 ('ADRIAN\n\n                    ', 'PERSON'),
 ('Adrian', 'PERSON'),
 ('West', 'PERSON'),
 ('Satnam', 'PERSON'),
 ('WEST\n\n                    ', 'PERSON'),
 ('Satnam', 'PERSON'),
 ('ADRIAN\n\n                    ', 'PERSON'),
 ('ADRIAN\n\n                    ', 'PERSON'),
 ('ADRIAN\n\n                    ', 'PERSON'),
 ('Adrian', 'PERSON'),
 ('ADRIAN\n

In [150]:
bechdel_df = pd.read_csv('../data/bechdel_df.csv')

In [151]:
bechdel_df.head()

Unnamed: 0.1,Unnamed: 0,rating,title,year,imdbid,script,simple_bechdel
0,0,1,warrior,2011,1291584.0,WARRIOR ...,0
1,1,1,collateral,2004,369339.0,COLLA...,0
2,8,1,gamer,2009,1034032.0,GAMER ...,0
3,9,1,wanted,2008,493464.0,WANTED ...,0
4,10,0,2001: a space odyssey,1968,62622.0,2001: A SPACE ODYSSEY Screenplay by ...,0


In [152]:
bechdel_df.drop(columns=['Unnamed: 0', 'rating', 'year', 'imdbid'], axis=1, inplace=True)

In [153]:
bechdel_df.head()

Unnamed: 0,title,script,simple_bechdel
0,warrior,WARRIOR ...,0
1,collateral,COLLA...,0
2,gamer,GAMER ...,0
3,wanted,WANTED ...,0
4,2001: a space odyssey,2001: A SPACE ODYSSEY Screenplay by ...,0


In [154]:
bechdel_df['simple_bechdel'].value_counts(normalize=True)

1    0.51634
0    0.48366
Name: simple_bechdel, dtype: float64

In [155]:
bechdel_df.shape

(459, 3)

### Feature Creation: Creating a count of female characters in a screenplay

The below function will create a feature that stores the number of female characters in the film for us to use as a feature in calculating Bechdel probability

In [140]:
def read_name_file(filename):
    names = []
    with open(filename, "r") as f:
        for line in f:
            if not line.startswith("#") and line.strip() != "":  # remove comments and empty lines
                names.append(line.strip().lower())
    return names

In [223]:
female_names = "../data/female.txt"
male_names = "../data/male.txt"
male_names = read_name_file(male_names)
female_names = read_name_file(female_names)

In [224]:
def character_list(file):
    #names = []
    #with open(file, 'r') as f:
        #for line in file:
            #re.findall(r'b[A-Z]', f)
    names = ' '.join(re.findall(r'[A-Z]+', file)).lower().split()
    #now let's search through the list and find only human names
    female_characters = []
    male_characters = []
    for i in names:
        if i in female_names:
            female_characters.append(i)
        elif i in male_names:
            male_characters.append(i)
        else:
            pass
    return len(set(female_characters))

In [225]:
character_list(bechdel_df['script'][5])

9

In [134]:
def women_count(df):
    for movie in df:
        bechdel_df['women'] = character_list(movie)
    return bechdel_df

In [156]:
bechdel_df.head()

Unnamed: 0,title,script,simple_bechdel
0,warrior,WARRIOR ...,0
1,collateral,COLLA...,0
2,gamer,GAMER ...,0
3,wanted,WANTED ...,0
4,2001: a space odyssey,2001: A SPACE ODYSSEY Screenplay by ...,0


In [226]:
bechdel_df['women'] = bechdel_df['script'].apply(character_list)

In [227]:
bechdel_df['women'].value_counts()

10    33
9     32
12    32
11    30
8     28
13    27
5     26
14    25
6     23
15    22
4     20
7     20
16    16
19    16
18    15
17    15
21    13
20    11
3      9
2      6
22     6
1      4
24     4
25     4
23     3
26     3
27     3
29     3
32     2
28     2
30     2
34     1
0      1
31     1
41     1
Name: women, dtype: int64

In [212]:
# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()


In [220]:

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [word.lower() for word in mytokens]

    #mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    
    # Removing stop words
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]

    # return preprocessed list of tokens
    return mytokens

In [221]:
spacy_tokenizer('John has a beard and he likes to keep it neat')

TypeError: 'int' object is not callable

In [216]:
# got this code from Amir Semsarzadeh
def poles(text):
    try:
        return TextBlob(text).polarity
    except:
        return None
def subj(text):
    try:
        return TextBlob(text).subjectivity
    except:
        return None
#calls in each seperately for direct use alternatively you could get 
#it in a tuple but getting both values in the column is going to be annoying. 
bechdel_df['script_polarity'] = bechdel_df['script'].apply(poles)
bechdel_df['script_subjectivity'] = bechdel_df['script'].apply(subj)

In [217]:
bechdel_df.head()

Unnamed: 0,title,script,simple_bechdel,women,script_polarity,script_subjectivity
0,warrior,WARRIOR ...,0,15,,
1,collateral,COLLA...,0,12,,
2,gamer,GAMER ...,0,28,,
3,wanted,WANTED ...,0,8,,
4,2001: a space odyssey,2001: A SPACE ODYSSEY Screenplay by ...,0,7,,


In [48]:
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [58]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [59]:
# Let's instantiate TF-IDF here
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [189]:
X = bechdel_df[['script', 'women']]
y = bechdel_df['simple_bechdel']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [61]:
logreg = LogisticRegression()

In [62]:
# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', logreg)])

In [63]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7fd554890880>),
                ('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7fd5dd300280>)),
                ('classifier', LogisticRegression())])

In [65]:
preds = pipe.predict(X_test)

In [68]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, preds))
print("Logistic Regression Precision:", precision_score(y_test, preds))
print("Logistic Regression Recall:", recall_score(y_test, preds))

Logistic Regression Accuracy: 0.4782608695652174
Logistic Regression Precision: 0.5483870967741935
Logistic Regression Recall: 0.5151515151515151
