# Class 8: Parsing - application to gendered language, Named Entities


In [1]:
import spacy
from spacy import displacy
import pandas as pd 

In [2]:
# Load the book-review dataset; the path is relative to the notebook's working directory
df = pd.read_csv("Text_as_data/Data/Class6/book_reviews.csv")

# Preview the first rows to check the columns that were read
df.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text,review_positive_help,review_tot_help
0,0802841899,The Church of Christ: A Biblical Ecclesiology ...,25.97,ARI272XF8TOL4,Christopher J. Bray,74/81,5.0,955411200,Ecclesiological Milestone,With the publication of Everett Ferguson's boo...,74,81
1,0974289108,The Ultimate Guide to Law School Admission: In...,14.95,A1KZ0RDJZQSY4O,sayock,27/29,3.0,1090368000,No &quot;Insider&quot; Secrets,If you are someone who is fairly new to the la...,27,29
2,0809080699,The Repeal of Reticence: A History of America'...,,A18YY5TBNSDW3O,Bartleby,29/29,5.0,899164800,Great treatment of the defeat of reticence by ...,"Using a quiet, restrained writing style that i...",29,29
3,B000NKGYMK,Alaska Sourdough,,,,36/37,5.0,949104000,Real Alaskan Sourdough,Ruth Allman has written an excellent book abou...,36,37
4,B000NKGYMK,Alaska Sourdough,,AC58Z72OB2DDX,Gary W. Marian,29/30,5.0,945734400,True Alaskan cooking,"I have been using this book since 1988, the ei...",29,30


In [3]:
# Full text of the review stored under row label 0
df.loc[0, 'review/text']

'With the publication of Everett Ferguson\'s book on ecclesiology, another milestone has been reached in the scholarly presentation of the distinctive theological perspective of Churches of Christ. The book is divided into six chapters, each roughly sixty to seventy pages in length. Within each chapter, Ferguson neatly and systematically outlines his thoughts and arguments. The first chapter, entitled "The People and the Messiah: History and Eschatology," deals mainly with background issues. It examines the Old Testament teaching on the importance of covenant and the meaning of the phrase "kingdom of God" in its relationship to an and distinction from the church. Ferguson enters into the New Testament and ecclesiology proper via a consideration of Jesus as Messiah, including a careful exegesis of Matt. 16:13-23, where he concludes that the "rock" of Matt. 16:18 is not Peter, but the fact of Jesus\' Messiahship. Ferguson\'s analysis of Matt. 16:13-23 is insightful and carefully articula

### Dependency Parsing

In [4]:
# Load the medium English spaCy pipeline; `nlp` is reused by later cells
nlp = spacy.load('en_core_web_md')

# Parse a single example sentence
text = 'The book is divided into six chapters, each roughly sixty to seventy pages in length.'
doc = nlp(text)

# Draw the dependency tree of the parsed sentence
displacy.render(doc, style="dep")

In [5]:

# One line per token: its text, part-of-speech tag (pos_) and dependency label (dep_)
for token in doc:
    print(f"{token.text}, POS Tag: {token.pos_}, Dependency Tag: {token.dep_}")


The, POS Tag: DET, Dependency Tag: det
book, POS Tag: NOUN, Dependency Tag: nsubjpass
is, POS Tag: AUX, Dependency Tag: auxpass
divided, POS Tag: VERB, Dependency Tag: ROOT
into, POS Tag: ADP, Dependency Tag: prep
six, POS Tag: NUM, Dependency Tag: nummod
chapters, POS Tag: NOUN, Dependency Tag: pobj
,, POS Tag: PUNCT, Dependency Tag: punct
each, POS Tag: DET, Dependency Tag: det
roughly, POS Tag: ADV, Dependency Tag: advmod
sixty, POS Tag: NUM, Dependency Tag: quantmod
to, POS Tag: PART, Dependency Tag: quantmod
seventy, POS Tag: NUM, Dependency Tag: nummod
pages, POS Tag: NOUN, Dependency Tag: appos
in, POS Tag: ADP, Dependency Tag: prep
length, POS Tag: NOUN, Dependency Tag: pobj
., POS Tag: PUNCT, Dependency Tag: punct


The full list of dependency labels, with their meanings, can be found here: https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md

In [6]:
# For every sentence: print the sentence, its syntactic root, and the root's
# direct children together with their dependency labels
for sent in doc.sents:
    print("sentence:", sent)
    print("root:", sent.root)
    print([(w, w.dep_) for w in sent.root.children])   
    print()

sentence: The book is divided into six chapters, each roughly sixty to seventy pages in length.
root: divided
[(book, 'nsubjpass'), (is, 'auxpass'), (into, 'prep'), (., 'punct')]


In [7]:
# Keep a random subset of 30,000 reviews; the fixed random_state makes the draw repeatable.
# Note that this rebinds `df`, so the full dataset is no longer available afterwards.
df = df.sample(n=30000, random_state=12)
# Run the spaCy pipeline on each sampled review and store the parsed result.
# NOTE(review): this is probably the slowest cell of the notebook — confirm
df["processed"] = df['review/text'].apply(lambda x: nlp(x))

def extract_subject_verb_pairs(sent):
    """Collect (subject, head) lemma pairs from a sequence of parsed tokens.

    Parameters
    ----------
    sent : iterable of tokens exposing ``dep_``, ``lemma_`` and ``head``
        (in this notebook, a parsed spaCy document).

    Returns
    -------
    list of (str, str)
        One tuple per token whose dependency label is exactly "nsubj":
        the token's lower-cased lemma and the lower-cased lemma of its head.
        The head is whatever token governs the subject, so it is not
        guaranteed to be a verb.
    """
    pairs = []
    for token in sent:
        # Only active-voice nominal subjects are kept
        if token.dep_ != "nsubj":
            continue
        pairs.append((token.lemma_.lower(), token.head.lemma_.lower()))
    return pairs

# Extract the subject/head pairs of every parsed review into a new column
df["subj-verb-pairs"] = df["processed"].apply(extract_subject_verb_pairs)


In [8]:
# Extracted pairs for the first ten sampled reviews
df["subj-verb-pairs"].head(10)

157418    [(i, own), (book, be), (he, start), (shankar, ...
216111    [(book, be), (that, pass), (character, be), (i...
103276    [(book, be), (i, read), (i, gain), (that, incr...
29987     [(they, expect), (percent, respond), (they, do...
179080    [(i, browse), (fool, disclose), (that, annoys)...
63415     [(past, be), (it, be), (picture, be), (book, b...
121872    [(it, be), (it, be), (it, be), (it, be), (it, ...
172430    [(lounge, imagine), (i, imagine), (they, serve...
182427    [(i, check), (who, love), (it, be), (that, det...
11693     [(i, use), (it, be), (it, feel), (suggestion, ...
Name: subj-verb-pairs, dtype: object

In [9]:
# Raw text of the review with row label 157418, to compare against its extracted pairs
df.loc[157418, 'review/text']

"I own about a dozen or more books on quantum mechanics and Shankar's is by far my favorite. He starts out with an excellent chapter on the mathematical prerequisites of the subject and then develops the theory from the postulates, always having the student in mind. Shankar is obviously a man with high exposition skills (I wonder if he plays the sitar as well as he writes quantum mechanics?).I highly recommend this book over Cohen-Tannoudji (too thick!) or Griffiths (which deceptively looks like a good book until you actually start reading the text and attempting the poor selection of problems)."

In [10]:
# Pairs extracted from the review with row label 157418.
# NOTE(review): the odd subject 'mechanics?).i' presumably comes from the missing
# space after "?)." in the review text, which keeps it as a single token — confirm
df.loc[157418, "subj-verb-pairs"]

[('i', 'own'),
 ('book', 'be'),
 ('he', 'start'),
 ('shankar', 'be'),
 ('i', 'wonder'),
 ('he', 'play'),
 ('he', 'write'),
 ('mechanics?).i', 'recommend'),
 ('which', 'look'),
 ('you', 'start')]

In [11]:
# Gather the pairs of all reviews into one flat list
all_subj_verb_pairs = [
    pair
    for review_pairs in df['subj-verb-pairs']
    for pair in review_pairs
]

# One row per occurrence, then count how often each (Subject, Verb) pair appears
subj_verb_df = pd.DataFrame(all_subj_verb_pairs, columns=['Subject', 'Verb'])
subj_verb_counts = (
    subj_verb_df
    .groupby(['Subject', 'Verb'])
    .size()
    .reset_index(name='Count')
)

# Most frequent pairs first, with a fresh 0..n-1 index
most_common_pairs = (
    subj_verb_counts
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)

# Show the ten most frequent pairs
most_common_pairs.head(10)

Unnamed: 0,Subject,Verb,Count
0,it,be,32954
1,this,be,13333
2,i,be,11753
3,book,be,11194
4,that,be,8654
5,i,read,7307
6,he,be,5671
7,i,think,5465
8,i,have,5391
9,i,find,5023


In [12]:
# All pairs whose subject is the pronoun "he"
most_common_pairs.loc[most_common_pairs['Subject']=="he"]

Unnamed: 0,Subject,Verb,Count
6,he,be,5671
30,he,have,1606
52,he,do,1072
58,he,write,927
67,he,say,834
...,...,...,...
134245,he,unseat,1
134246,he,unravel,1
134247,he,unpopular,1
134248,he,unpleasant,1


In [13]:
# All pairs whose subject is the pronoun "she"
most_common_pairs.loc[most_common_pairs['Subject']=="she"]


Unnamed: 0,Subject,Verb,Count
18,she,be,2530
65,she,have,843
140,she,do,401
142,she,write,397
196,she,say,291
...,...,...,...
61112,she,wrong,1
61113,she,writte,1
61114,she,writhe,1
61120,she,worry,1


In [14]:
# Masculine nouns (plus the pronoun "he") that are replaced by the <MALE> tag below.
# "heroes" replaces the misspelled plural "heros" of the original list.
male_nouns = [
    # singular forms
    "man", "boy", "father", "son", "brother", "husband", "uncle", "nephew", "emperor",
    "king", "prince", "duke", "lord", "knight", "waiter", "actor", "god", "policeman",
    "postman", "hero", "wizard", "steward", "he",
    # plural forms
    "men", "boys", "fathers", "sons", "brothers", "husbands", "uncles", "nephews",
    "emperors", "kings", "princes", "dukes", "lords", "knights", "waiters", "actors",
    "gods", "policemen", "postmen", "heroes", "wizards", "stewards",
]

# Feminine nouns (plus the pronoun "she") that are replaced by the <FEMALE> tag below.
female_nouns = [
    # singular forms
    "woman", "girl", "mother", "daughter", "sister", "wife", "aunt", "niece",
    "empress", "queen", "princess", "duchess", "lady", "dame", "waitress", "actress",
    "goddess", "policewoman", "postwoman", "heroine", "witch", "stewardess", "she",
    # plural forms
    "women", "girls", "mothers", "daughters", "sisters", "wives", "aunts", "nieces",
    "empresses", "queens", "princesses", "duchesses", "ladies", "dames", "waitresses",
    "actresses", "goddesses", "policewomen", "postwomen", "heroines", "witches",
    "stewardesses",
]


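We can compute pointwise mutual information (PMI) ourselves to see which verbs are most strongly associated with male and female subjects: $\mathrm{PMI}(s, v) = \log \frac{P(s, v)}{P(s)\,P(v)}$.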

In [15]:
# Work on a copy so the original counts stay untouched, then swap every gendered
# subject for its tag in a single pass (the male list is checked first).
subj_verb_counts_gender = subj_verb_counts.copy(deep=True)
subj_verb_counts_gender['Subject'] = subj_verb_counts_gender['Subject'].apply(
    lambda subject: "<MALE>" if subject in male_nouns
    else "<FEMALE>" if subject in female_nouns
    else subject
)

In [16]:
# Rows now tagged <MALE>; the same (tag, verb) pair can occur on several rows
# because each original noun still has its own row at this point
subj_verb_counts_gender.loc[subj_verb_counts_gender['Subject']=="<MALE>"]

Unnamed: 0,Subject,Verb,Count
2098,<MALE>,be,6
2099,<MALE>,come,2
2100,<MALE>,deliver,2
2101,<MALE>,develop,1
2102,<MALE>,do,2
...,...,...,...
131590,<MALE>,be,1
131591,<MALE>,flaunt,1
131592,<MALE>,provide,1
131593,<MALE>,rule,1


In [17]:
# Rows now tagged <FEMALE>; the same (tag, verb) pair can occur on several rows
# because each original noun still has its own row at this point
subj_verb_counts_gender.loc[subj_verb_counts_gender['Subject']=="<FEMALE>"]


Unnamed: 0,Subject,Verb,Count
2117,<FEMALE>,be,4
2118,<FEMALE>,launch,1
2119,<FEMALE>,look,1
7812,<FEMALE>,be,4
7813,<FEMALE>,beg,1
...,...,...,...
132054,<FEMALE>,be,18
132055,<FEMALE>,live,1
132056,<FEMALE>,lousy,2
132057,<FEMALE>,need,1


In [18]:
# Tagging mapped many different nouns onto the same label, so identical
# (Subject, Verb) rows are collapsed by adding up their counts.
subj_verb_counts_gender = (
    subj_verb_counts_gender
    .groupby(['Subject', 'Verb'], as_index=False)['Count']
    .sum()
)

# Display the aggregated pairs, most frequent first
subj_verb_counts_gender.sort_values(by='Count', ascending=False)

Unnamed: 0,Subject,Verb,Count
58894,it,be,32954
116359,this,be,13333
53512,i,be,11753
16525,book,be,11194
111718,that,be,8654
...,...,...,...
50864,hiaasen,outdo,1
50863,hiaasen,forge,1
50862,hiaasen,dumb,1
50861,hi,want,1


In [19]:
import numpy as np

import numpy as np

def calculate_pmi(subject, verb, dataframe):
    """Pointwise mutual information between a subject and a verb.

    Parameters
    ----------
    subject : value looked up in the 'Subject' column.
    verb : value looked up in the 'Verb' column.
    dataframe : table with 'Subject', 'Verb' and 'Count' columns.

    Returns
    -------
    The natural-log PMI, log(P(s, v) / (P(s) * P(v))), where each probability
    is a share of the summed 'Count' column.
    If the subject or the verb has a total count of zero, or the pair itself
    has a count of zero, a descriptive message string is returned instead of
    a number. Any ValueError raised along the way is likewise returned as its
    message text rather than propagated.
    """
    try:
        counts = dataframe['Count']
        grand_total = counts.sum()

        # Marginal totals of the subject and of the verb
        subject_rows = dataframe['Subject'] == subject
        subject_total = counts[subject_rows].sum()
        verb_rows = dataframe['Verb'] == verb
        verb_total = counts[verb_rows].sum()

        if subject_total == 0 or verb_total == 0:
            return f"Either subject '{subject}' or verb '{verb}' does not exist in the dataset."

        # Joint total of the exact (subject, verb) combination
        pair_total = counts[subject_rows & verb_rows].sum()
        if pair_total == 0:
            return f"The specific subject-verb pair ('{subject}', '{verb}') never occurs together in the dataset."

        joint_prob = pair_total / grand_total
        subject_prob = subject_total / grand_total
        verb_prob = verb_total / grand_total

        return np.log(joint_prob / (subject_prob * verb_prob))

    except ValueError as err:
        return str(err)



In [20]:
# Example: PMI between the <MALE> subject tag and the verb "fight"
calculate_pmi("<MALE>", "fight", subj_verb_counts_gender)

0.5080249426507637

In [21]:
# Verbs whose total count over all subjects is at least 50
verbs_at_least_50 = (
    subj_verb_counts_gender
    .groupby('Verb')['Count']
    .sum()
    .loc[lambda totals: totals >= 50]
    .index
    .tolist()
)

# PMI of every retained verb with each gender tag, one record per verb
male_pmi_list = [
    {'Verb': verb, 'PMI': calculate_pmi("<MALE>", verb, subj_verb_counts_gender)}
    for verb in verbs_at_least_50
]
female_pmi_list = [
    {'Verb': verb, 'PMI': calculate_pmi("<FEMALE>", verb, subj_verb_counts_gender)}
    for verb in verbs_at_least_50
]

# Turn the records into tables. dropna() only removes missing values; verbs for
# which calculate_pmi returned a message string are still present afterwards.
male_df = pd.DataFrame(male_pmi_list).dropna()
female_df = pd.DataFrame(female_pmi_list).dropna()


In [22]:
# Inspect the male PMI table; verbs without a computable PMI hold a message string instead of a number
male_df

Unnamed: 0,Verb,PMI
0,'s,"The specific subject-verb pair ('<MALE>', ''s'..."
1,abandon,0.866159
2,abound,"The specific subject-verb pair ('<MALE>', 'abo..."
3,absorb,-1.283735
4,abuse,1.229029
...,...,...
738,worth,-2.003315
739,would,-0.572569
740,write,1.088192
741,wrong,-1.136504


In [23]:
# Make 'PMI' numeric: anything that is not a number becomes NaN and that row is dropped
male_df = (
    male_df
    .assign(PMI=pd.to_numeric(male_df['PMI'], errors='coerce'))
    .dropna(subset=['PMI'])
)

# Same clean-up for the female table
female_df = (
    female_df
    .assign(PMI=pd.to_numeric(female_df['PMI'], errors='coerce'))
    .dropna(subset=['PMI'])
)

In [24]:
# Verbs ranked by their PMI with <MALE> subjects, highest first
male_df.sort_values(by=['PMI'], ascending=False)

Unnamed: 0,Verb,PMI
80,bless,2.518474
295,forbid,2.066488
112,cite,1.684599
361,inherit,1.575182
122,command,1.549479
...,...,...
323,happen,-2.508035
356,increase,-2.605490
576,result,-2.709421
377,involve,-3.010955


In [25]:
# Verbs ranked by their PMI with <FEMALE> subjects, highest first
female_df.sort_values(by=['PMI'], ascending=False)

Unnamed: 0,Verb,PMI
418,marry,2.048892
625,sleep,1.772397
135,conduct,1.765883
440,obey,1.741626
370,interview,1.660076
...,...,...
109,check,-1.839712
115,clear,-1.882454
23,affect,-2.103084
669,suspect,-2.229036


### Named Entities

In [26]:
def extract_named_entities(sent):
    """List the named entities of a parsed text.

    Parameters
    ----------
    sent : object exposing an ``ents`` iterable whose items have ``text`` and
        ``label_`` (in this notebook, a parsed spaCy document).

    Returns
    -------
    list of (str, str)
        One (entity text, entity label) tuple per entity, in the order given
        by ``sent.ents``.
    """
    found = []
    for span in sent.ents:
        found.append((span.text, span.label_))
    return found

# Store the (text, label) entity list of every parsed review in a new column
df["named_entities"] = df["processed"].apply(extract_named_entities)


In [27]:
# Entities found in the first ten sampled reviews
df["named_entities"].head(10)


157418    [(about a dozen, CARDINAL), (Shankar, PERSON),...
216111    [(these days, DATE), (Two, CARDINAL), (bush, P...
103276    [(three, CARDINAL), (ages 13, 18, DATE), (22, ...
29987     [(several years ago, DATE), (MIT, ORG), (half,...
179080    [(first, ORDINAL), (first, ORDINAL), (Wednesda...
63415            [(Rome, GPE), (Hadrian, ORG), (Rome, GPE)]
121872    [(The Seven Pillars, WORK_OF_ART), (Lawrence, ...
172430       [(Etta's &amp, ORG), (the Dahlia Lounge, ORG)]
182427                                                   []
11693                                                    []
Name: named_entities, dtype: object

In [28]:
# Gather the entities of all reviews into one flat list of (text, label) tuples
all_entities = [
    entity
    for review_entities in df['named_entities']
    for entity in review_entities
]

# One row per mention, then count how often each (Entity, Label) pair appears
entities_df = pd.DataFrame(all_entities, columns=['Entity', 'Label'])
entity_counts = (
    entities_df
    .groupby(['Entity', 'Label'])
    .size()
    .reset_index(name='Count')
)

# Most frequent entities first, with a fresh 0..n-1 index
most_common_entities = (
    entity_counts
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)

# Show the twenty most frequent entities
most_common_entities.head(20)


Unnamed: 0,Entity,Label,Count
0,first,ORDINAL,9176
1,one,CARDINAL,8621
2,two,CARDINAL,4876
3,American,NORP,2813
4,today,DATE,2438
5,Christian,NORP,2137
6,One,CARDINAL,2096
7,three,CARDINAL,1899
8,English,LANGUAGE,1874
9,second,ORDINAL,1788


In [29]:
# Restrict the entity table to rows labelled PERSON and show the first twenty
most_common_entities.loc[most_common_entities['Label']=="PERSON"].head(20)  # Adjust N to your preference


Unnamed: 0,Entity,Label,Count
12,Jesus,PERSON,1439
28,Lewis,PERSON,717
31,Bush,PERSON,677
36,Kerry,PERSON,613
38,Hitler,PERSON,606
46,Smith,PERSON,493
53,Clinton,PERSON,441
55,Joyce,PERSON,430
63,Darwin,PERSON,372
64,Shakespeare,PERSON,369


### Semantic Role Labeling

Please open the following notebook for more details on Semantic Role Labeling:

[Class 8 - Google Colab](https://colab.research.google.com/drive/1TA7GxMYPbGb2__BEZCMtSw90rvB4oM03?usp=sharing)
