# Loading and downloading packages

In [1]:
%%capture
!pip install gender_guesser
!pip install pyLDAvis

In [4]:
import pandas as pd

import gender_guesser.detector as gender

import pickle
import pandas as pd
import numpy as np
import spacy 
import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.models import Word2Vec
import matplotlib.pyplot as plt

%matplotlib inline
import tensorflow as tf
import tensorflow_hub as hub

from sklearn.manifold import TSNE
#from tensorflow.examples.tutorials.mnist import input_data
#from tensorflow.contrib.tensorboard.plugins import projector

nlp = spacy.load("en_core_web_sm")

  from collections import Iterable


In [2]:
# connecting to my drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Loading the data and some basic checks

In [6]:
path = "/content/drive/MyDrive/Thesis - Detecting Bias/Data Thesis/hansard-speeches-v310.csv"
df = pd.read_csv(path)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
len(df)

2694375

In [28]:
# only keep debates later than 2015
df_sub = df[df.year >= 2015]

In [9]:
len(df_sub)

417154

## Some NaN cleaning

In [10]:
df_sub.isnull().sum()

id                            0
speech                        0
display_as                    0
party                     16439
constituency              16443
mnis_id                   16439
date                          0
time                     108148
colnum                        0
speech_class                  0
major_heading              2068
minor_heading            182539
oral_heading             417148
year                          0
hansard_membership_id    417154
speakerid                399045
person_id                 33878
speakername                   0
url                      337355
dtype: int64

In [30]:
df3 = df_sub[['speakername','speech','party']]
df3.head(3)

Unnamed: 0,speakername,speech,party
2277221,Unknown,The Secretary of State was asked —,
2277222,Jennifer Willott,What progress her Department has made on imple...,Liberal Democrat
2277223,Theresa May,The Government are on track to deliver their c...,Conservative


## Check female vs male speakers

In [32]:
d = gender.Detector()
#df2['gender'] = d(df['speakername'])
df3['first_name'] = df3['speakername'].str.split(' ').str[0]
df3['gender'] = df3['first_name'].map(lambda x: d.get_gender(x))
df3.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,speakername,speech,party,first_name,gender
2277221,Unknown,The Secretary of State was asked —,,Unknown,unknown
2277222,Jennifer Willott,What progress her Department has made on imple...,Liberal Democrat,Jennifer,female
2277223,Theresa May,The Government are on track to deliver their c...,Conservative,Theresa,female
2277224,Jennifer Willott,"It is clear that exit checks, which were scrap...",Liberal Democrat,Jennifer,female
2277225,Theresa May,"As I indicated in my original answer, we are o...",Conservative,Theresa,female


In [33]:
df3['gender'].value_counts()

male             255571
female           101319
unknown           28803
mostly_male       17206
mostly_female     10332
andy               3923
Name: gender, dtype: int64

In [34]:
df3_male = df3[df3['gender']=='male']
df3_female = df3[df3['gender']=='female']

## Some exploring to extract adjectives

In [24]:
male_words = ["nephew", "nephews", "father", "dad", "daddies", "grandfather", "step-son", "step-sons", "men", "son-in-law", "king", "daddy", "son", "groom", "grooms", "gentleman", "gentlemen", "sir", "lad", "grandson", "grand-son", "dads", "prince", "stepfathers", "boyfriend", "males", "grandfathers", "grand-fathers", "husband", "husbands", "boy", "bachelor", "bachelors", "duke", "sirs", "papas", "heir", "uncle", "mr", "lords", "father-inlaw", "sons-in-law", "fiances","fiance", "uncles", "lads", "brother", "grandpa", "grandpas", "heirs", "male",  "grandsons", "grandsons", "boys", "he", "step-father", "bridegroom", "bridegrooms stepfather",  "mr.", "brothers", "man", "sons", "boyfriends", "he’s", "his", "him", "stepson", "stepsons", "guy", "spokesman", "spokesmen", "pa", "dude", "dudes", "paternal", "brotherhood", "countryman", "countrymen", "suitor", "macho", "papa", "strongman", "strongmen", "boyhood", "manhood", "masculine", "macho", "horsemen", "brethren", "chap", "chaps", "schoolboy", "schoolboys", "bloke", "blokes", "patriarch", "patriachy", "fatherhood", "hubby", "hubbies", "fella", "fellas","fraternal", "bro", "masculinity", "pappy", "papi", "pappies", "dada", "bf", "bfs", "knights", "knight", "menfolk", "brotherly", "manly", "pimp", "pimps", "homeboy", "homeboys", "grandnephew", "grandnephew", "grand-nephew", "grand-nephews", "nobleman", "noblemen", "dream boy", "himself", "gramps"]
female_words = ["niece", "mother", "duchesses", "mom", "belle", "belles", "mummies",  "grandmother", "landlady", "landladies", "nuns", "stepdaughter", "women", "daughter-in-law", "daughter", "queens", "brides", "lady", "queen", "matron",  "mummy", "madam", "granddaughter", "grand-daughter", "moms", "princess", "stepmothers", "stepdaughters", "girlfriend", "females", "grand-mothers", "grandmothers", "step-daughter", "nieces", "wife", "mother", "wives", "girl", "duchess", "madams", "mamas", "aunt", "princesses", "fiancee", "mrs", "ladies", "mother-in-law", "bride", "daughters-in-law", "aunts", "sister", "grandma", "grandmas", "female", "granddaughters", "grand-daughters", "girls", "she", "stepmother", "stepmother", "mrs.", "sisters", "mama", "woman", "daughters", "girlfriends", "she’s", "her", "maid", "mum", "maiden", "maidens", "dudette", "maternal", "sisterhood", "housewife", "housewives", "chick", "chicks", "mommy", "babe", "babes", "diva", "divas", "gal", "gals", "sistren", "schoolgirl", "schoolgirls", "matriarch", "matriarchy", "motherhood", "wifey", "sis", "femininity", "granny", "grannies", "mami", "momma", "gf", "gfs", "damsel", "damsels", "vixen", "vixens", "nan", "nanny", "nannies", "auntie", "womenfolk", "sisterly", "motherly", "homegirl", "homegirls", "grand-niece", "grand-nieces", "grandniece", "grandnieces", "dream girl", "madame", "herself", "hers"]


In [25]:
def adj_check(word):
  doc = nlp(word)
  if doc[0].pos_ == 'ADJ':
    return(word)

In [26]:
# found on: https://albertauyeung.github.io/2018/06/03/generating-ngrams.html/
import re

def generate_ngrams(speech, n):
  n_grams = []
  for s in speech:
    # Convert to lowercases
    s = s.lower()
    
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    
    # Break sentence in the token, remove empty tokens
    tokens = [token for token in s.split(" ") if token != ""]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[tokens[i:] for i in range(n)])
    ngr = [" ".join(ngram) for ngram in ngrams]
    for x in ngr:
      n_grams.append(x)

  return(n_grams)

## Female speakers ngrams + adjectives

In [27]:
n_grams_female = generate_ngrams(df3_female.speech, 2)

In [35]:
# m_fs refers to male_femalespeaker. Meaning the male ngrams in the female speeches are retrieved here
ngram_m_fs = list((filter(lambda val: val if (val.split(" ")[1] in male_words) else '', [ngram for ngram in n_grams_female] )))

In [36]:
# f_fs refers to female_femalespeaker. Meaning the female ngrams in the female speeches are retrieved here
ngram_f_fs = list((filter(lambda val: val if (val.split(" ")[1] in female_words) else '', [ngram for ngram in n_grams_female] )))

## Male speaker ngram + adjectives

In [38]:
# create ngrams male debates
n_grams_male = generate_ngrams(df3_male.speech, 2)

In [39]:
# m_ms refers to male_malespeaker. Meaning the male ngrams in the male speeches are retrieved here
ngram_m_ms = list((filter(lambda val: val if (val.split(" ")[1] in male_words) else '', [ngram for ngram in n_grams_male] )))

In [40]:
# f_ms refers to female_malespeaker. Meaning the male ngrams in the male speeches are retrieved here
ngram_f_ms = list((filter(lambda val: val if (val.split(" ")[1] in female_words) else '', [ngram for ngram in n_grams_male] )))

## Finding ngrams with adjectives

In [None]:
# the adj_check only returns a word if it's an adjective
adj_check("beautiful")

'beautiful'

In [None]:
#retrieving all adjectives in the ngrams
adjs_f_fs = list((filter(lambda val: val if adj_check(val.split(" ")[0]) else '', [ngram for ngram in ngram_f_fs] )))
adjs_m_fs = list((filter(lambda val: val if adj_check(val.split(" ")[0]) else '', [ngram for ngram in ngram_m_fs] )))
adjs_f_ms = list((filter(lambda val: val if adj_check(val.split(" ")[0]) else '', [ngram for ngram in ngram_f_ms] )))
adjs_m_ms = list((filter(lambda val: val if adj_check(val.split(" ")[0]) else '', [ngram for ngram in ngram_m_ms] )))

Execution time: 0:59:41.051821 seconds


In [None]:
lists = [adjs_f_fs, adjs_m_fs, adjs_f_ms, adjs_m_ms ]

In [None]:
# the adjectives are all saved to review manually and divide over the categories manually
dfff1 = pd.DataFrame({'col': adjs_f_fs})
dfff1.to_csv('adj1.csv', index=False)
dfff2 = pd.DataFrame({'col': adjs_m_fs})
dfff2.to_csv('adj2.csv', index=False)
dfff3 = pd.DataFrame({'col': adjs_f_ms})
dfff3.to_csv('adj3.csv', index=False)
dfff4 = pd.DataFrame({'col': adjs_m_ms})
dfff4.to_csv('adj4.csv', index=False)

## Preprocess text

In [41]:
%%capture

# tokenise the texts for the Word2Vec training
def tokenize1(texts):
    processed_texts = [text for text in nlp.pipe(texts, 
                                              disable=["ner",
                                                       "parser"])]
    # lower case tokens and remove punctuations
    tokenized_text = [[token.text.lower() for token in processed_text if not token.is_punct] 
                      for processed_text in processed_texts]

    # return tokenized text
    return (tokenized_text)

In [42]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
%%capture

# the tokenizing
female_tokens = tokenize1(df3_female['speech'])
male_tokens = tokenize1(df3_male['speech'])

## Create word embeddings

In [None]:
female_tokens = tokenize1(df3_female.speech)

In [None]:
female_tokens1 = female_tokens[:10000]

In [None]:
# Female model

SIZE = 300 # dimensions of the embeddings
SG = 1 # whether to use skip-gram or CBOW (we use skip-gram)
WINDOW = 10 # the window size
N_WORKERS = 1 # number of workers to use
MIN_COUNT = 1

# setting up the word2vec female_model
female_model = Word2Vec(
                size=SIZE,
                sg=SG,
                window=WINDOW, 
                min_count=MIN_COUNT,
                workers=N_WORKERS)

# the female_model
female_model.build_vocab(female_tokens)

female_model.train(female_tokens,
           total_examples=female_model.corpus_count,
           epochs=female_model.epochs) # grab some coffee while training

(59946770, 81960705)

In [None]:
# save the female model for later use
female_model.save("female_word2vec.model")

In [None]:
len(df3_male.speech)

255571

In [None]:
# the tokenizing is done in 3 steps because of storage issues
male_tokens1 = tokenize1(df3_male.speech[:90000])

In [None]:
male_tokens2 = tokenize1(df3_male.speech[90001:180000])

In [None]:
male_tokens3 = tokenize1(df3_male.speech[180001:-1])

In [None]:
# add all tokens together
male_tokens = male_tokens1 + male_tokens2 + male_tokens3

In [None]:
# male model

# 300 as advised in the lectures
SIZE = 300 # dimensions of the embeddings
SG = 1 # whether to use skip-gram or CBOW (we use skip-gram)
WINDOW = 10 # the window size
N_WORKERS = 1 # number of workers to use
MIN_COUNT = 1

# setting up the word2vec male_model
male_model = Word2Vec(
                size=SIZE,
                sg=SG,
                window=WINDOW, 
                min_count=MIN_COUNT,
                workers=N_WORKERS)

# the male_model
male_model.build_vocab(male_tokens)

male_model.train(male_tokens,
           total_examples=male_model.corpus_count,
           epochs=male_model.epochs) # grab some coffee while training

(139790552, 192715360)

In [None]:
# save the male model for later use
male_model.save("male_word2vec.model")