In [161]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import re

from collections import defaultdict

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

## Dataset Card for Multi-Dimensional Gender Bias Classification

**Dataset Summary**


The Multi-Dimensional Gender Bias Classification dataset is based on a general framework that decomposes gender bias in text along several pragmatic and semantic dimensions: bias from the gender of the person being spoken about, bias from the gender of the person being spoken to, and bias from the gender of the speaker. It contains seven large scale datasets automatically annotated for gender information (there are eight in the original project but the Wikipedia set is not included in the HuggingFace distribution), one crowdsourced evaluation benchmark of utterance-level gender rewrites, a list of gendered names, and a list of gendered words in English.

**New_data config**

- **text:** the text to be classified
- **original:** the text before reformulation
- **labels:** a list of classification labels, with possible values including ABOUT:female, ABOUT:male, PARTNER:female, PARTNER:male, SELF:female.
- **class_type:** a classification label, with possible values including about (0), partner (1), self (2).
- **turker_gender:** a classification label, with possible values including man (0), woman (1), nonbinary (2), prefer not to say (3), no answer (4).
- **episode_done:** a boolean indicating whether the conversation was completed.
- **confidence:** a string indicating the confidence of the annotator in response to the instance label being ABOUT/TO/AS a man or woman. Possible values are certain, pretty sure, and unsure.

In [48]:
# Training set
dataset_new_data_train = load_dataset("md_gender_bias", name='new_data', split='train')
dataset_new_data_train

Reusing dataset md_gender_bias (/Users/vahidsj/.cache/huggingface/datasets/md_gender_bias/new_data/1.0.0/8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05)


Dataset({
    features: ['text', 'original', 'labels', 'class_type', 'turker_gender', 'episode_done', 'confidence'],
    num_rows: 2345
})

In [49]:
df_new_data_train = dataset_new_data_train.to_pandas()
df_new_data_train

Unnamed: 0,text,original,labels,class_type,turker_gender,episode_done,confidence
0,He designed monumental Lovissa War Cemetery in...,She designed monumental Loviisa war cemetery ...,[1],0,4,True,certain
1,She works as a volunteer firefighter.,I work as a volunteer firefighter.,[4],2,4,True,certain
2,She participated in the FIVB volleyball grand ...,She participated in the FIVB volleyball grand ...,[0],0,4,True,certain
3,Correct ! Walter Havighurst was an author of m...,Correct! Walter Havighurst was an author of ma...,[1],0,4,True,certain
4,"Yes, she released only one album with them, ""w...","yes, He released only one album with them, ""Wo...",[0],0,4,True,certain
...,...,...,...,...,...,...,...
2340,I am fine today bro. What about you Jim?,i am fine today bro. what about you,[1],0,1,True,pretty sure
2341,"Cool, well I was born in France among pretty g...","Cool, well I was born in France, then I moved ...",[2],1,1,True,certain
2342,"Yeah man, I have women's intuition you are rig...","Yeah man, you are right enough.",[4],2,1,True,certain
2343,Her name is lenoris and she just finished her ...,"Hello, my name is Lenoris I just finished my m...",[0],0,1,True,certain


In [71]:
def tuplizer(x):
    return tuple(x) if isinstance(x, (np.ndarray, list)) else x

In [74]:
df_new_data_train.labels.apply(tuplizer).unique()

array([(1,), (4,), (0,), (5,), (3,), (2,)], dtype=object)

**Funpedia config:**

- **text:** the text to be classified.
- **gender:** a classification label, with possible values including gender-neutral (0), female (1), male (2), indicating the gender of the person being talked about.
- **persona:** a string describing the persona assigned to the user when talking about the entity.
- **title:** a string naming the entity the text is about.

In [50]:
# Training set
dataset_funpedia_train = load_dataset("md_gender_bias", name='funpedia', split='train')
dataset_funpedia_train

Reusing dataset md_gender_bias (/Users/vahidsj/.cache/huggingface/datasets/md_gender_bias/funpedia/1.0.0/8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05)


Dataset({
    features: ['text', 'title', 'persona', 'gender'],
    num_rows: 23897
})

In [51]:
df_funpedia_train = dataset_funpedia_train.to_pandas()
df_funpedia_train

Unnamed: 0,text,title,persona,gender
0,Max Landis is a comic book writer who wrote Ch...,Max Landis,Humorous,2
1,Félix stevens is a retired cuban sprinter who ...,Félix Stevens,Confident,2
2,Arthur Engelbert is a professor at the Univers...,Arthur Engelbert,Creative,2
3,Danielle Frenkel is a high jumper born in Israel,Danielle Frenkel,Brilliant,1
4,Ernie O'Malley served in the Irish republican ...,Ernie O'Malley,Gentle,2
...,...,...,...,...
23892,If Cristian Rosso can win 2 gold medals in the...,Cristian Rosso,Optimistic,2
23893,With altitudes ranging from a low 700 to a who...,Wayanad district,Creative,0
23894,You can easily say that Cash Generator is the ...,Cash Generator,Energetic,2
23895,Eleanor Mosley (Elinor Mosely) (1700-?) when d...,Eleanor Mosley,Optimistic,1


In [35]:
# Test set
dataset_funpedia_test = load_dataset("md_gender_bias", name='funpedia', split='test')

Reusing dataset md_gender_bias (/Users/vahidsj/.cache/huggingface/datasets/md_gender_bias/funpedia/1.0.0/8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05)


In [52]:
df_funpedia_test = dataset_funpedia_test.to_pandas()
df_funpedia_test

Unnamed: 0,text,title,persona,gender
0,Hmmm yes Horacio Calcaterra is a sportsman tha...,Horacio Calcaterra,"Articulate (Well-spoken, Expressive)",2
1,John A. Fallon kindly accepted his appointment...,John A. Fallon,Humble,2
2,Arthur Knight sadly passed as a result of part...,Arthur Gerald Knight,Peaceful,2
3,I cannot believe you've never seen Pani Pani R...,Pani Pani Re,"Cultured (Refined, Educated)",2
4,Of course his legacy was that of the Taniguchi.,Yosa Buson,"Articulate (Well-spoken, Expressive)",2
...,...,...,...,...
2933,Sarah Phillips is respected throughout the com...,Sarah Phillips (cyclist),Respectful,1
2934,"Ruth Cohn created theme-centered interaction, ...",Ruth Cohn,Sophisticated,1
2935,Sonny Curtis was born on May 9th in 1937 and w...,Sonny Curtis,Open,2
2936,As soon as the first season of prison break wa...,Prison Break (soundtrack),Earnest (Enthusiastic),0


In [53]:
df_funpedia_train.gender.unique()

array([2, 1, 0])

In [63]:
# Female = 1 & Male = 2

df_male_female = df_funpedia_train[df_funpedia_train.gender != 0]

print("Number of Females:", df_male_female[df_male_female.gender == 1].shape[0])
print("Number of Males:", df_male_female[df_male_female.gender == 2].shape[0])

Number of Females: 3594
Number of Males: 19156


## Twitter User Gender Classification

In [77]:
df_twitter_gender_original = pd.read_csv("/Users/vahidsj/Documents/Work/Work_MARTIN/Gender Bias/Datasets/twitterText.csv", encoding='latin1')
df_twitter_gender_original

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,gender,gender:confidence,profile_yn,profile_yn:confidence,created,...,profileimage,retweet_count,sidebar_color,text,tweet_coord,tweet_count,tweet_created,tweet_id,tweet_location,user_timezone
0,815719226,False,finalized,3,10/26/15 23:24,male,1.0000,yes,1.0,12/5/13 1:48,...,https://pbs.twimg.com/profile_images/414342229...,0,FFFFFF,Robbie E Responds To Critics After Win Against...,,110964,10/26/15 12:40,6.587300e+17,main; @Kan1shk3,Chennai
1,815719227,False,finalized,3,10/26/15 23:30,male,1.0000,yes,1.0,10/1/12 13:51,...,https://pbs.twimg.com/profile_images/539604221...,0,C0DEED,ÛÏIt felt like they were my friends and I was...,,7471,10/26/15 12:40,6.587300e+17,,Eastern Time (US & Canada)
2,815719228,False,finalized,3,10/26/15 23:33,male,0.6625,yes,1.0,11/28/14 11:30,...,https://pbs.twimg.com/profile_images/657330418...,1,C0DEED,i absolutely adore when louis starts the songs...,,5617,10/26/15 12:40,6.587300e+17,clcncl,Belgrade
3,815719229,False,finalized,3,10/26/15 23:10,male,1.0000,yes,1.0,6/11/09 22:39,...,https://pbs.twimg.com/profile_images/259703936...,0,C0DEED,Hi @JordanSpieth - Looking at the url - do you...,,1693,10/26/15 12:40,6.587300e+17,"Palo Alto, CA",Pacific Time (US & Canada)
4,815719230,False,finalized,3,10/27/15 1:15,female,1.0000,yes,1.0,4/16/14 13:23,...,https://pbs.twimg.com/profile_images/564094871...,0,0,Watching Neighbours on Sky+ catching up with t...,,31462,10/26/15 12:40,6.587300e+17,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20045,815757572,True,golden,259,,female,1.0000,yes,1.0,8/5/15 21:16,...,https://pbs.twimg.com/profile_images/656793310...,0,C0DEED,"@lookupondeath ...Fine, and I'll drink tea too...",,783,10/26/15 13:20,6.587400e+17,Verona ªÁ,
20046,815757681,True,golden,248,,male,1.0000,yes,1.0,8/15/12 21:17,...,https://pbs.twimg.com/profile_images/639815429...,0,0,Greg Hardy you a good player and all but don't...,,13523,10/26/15 12:40,6.587300e+17,"Kansas City, MO",
20047,815757830,True,golden,264,,male,1.0000,yes,1.0,9/3/12 1:17,...,https://pbs.twimg.com/profile_images/655473271...,0,C0DEED,You can miss people and still never want to se...,,26419,10/26/15 13:20,6.587400e+17,Lagos Nigeria,
20048,815757921,True,golden,250,,female,0.8489,yes,1.0,11/6/12 23:46,...,https://pbs.twimg.com/profile_images/657716093...,0,0,@bitemyapp i had noticed your tendency to pee ...,,56073,10/26/15 12:40,6.587300e+17,Texas Hill Country,


In [78]:
df_twitter_gender_original.columns

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'gender', 'gender:confidence', 'profile_yn',
       'profile_yn:confidence', 'created', 'description', 'fav_number',
       'gender_gold', 'link_color', 'name', 'profile_yn_gold', 'profileimage',
       'retweet_count', 'sidebar_color', 'text', 'tweet_coord', 'tweet_count',
       'tweet_created', 'tweet_id', 'tweet_location', 'user_timezone'],
      dtype='object')

In [104]:
df_twitter_gender = df_twitter_gender_original[['gender', 'gender:confidence', 'text']]
df_twitter_gender

Unnamed: 0,gender,gender:confidence,text
0,male,1.0000,Robbie E Responds To Critics After Win Against...
1,male,1.0000,ÛÏIt felt like they were my friends and I was...
2,male,0.6625,i absolutely adore when louis starts the songs...
3,male,1.0000,Hi @JordanSpieth - Looking at the url - do you...
4,female,1.0000,Watching Neighbours on Sky+ catching up with t...
...,...,...,...
20045,female,1.0000,"@lookupondeath ...Fine, and I'll drink tea too..."
20046,male,1.0000,Greg Hardy you a good player and all but don't...
20047,male,1.0000,You can miss people and still never want to se...
20048,female,0.8489,@bitemyapp i had noticed your tendency to pee ...


In [133]:
def clean_tweet(text):
    # Remove mention, URL and #
    text_cleaned = re.sub(r'(@|https?)\S+|#', '', text)
    # Remove special characters
    text_cleaned = re.sub('[^A-Za-z0-9;,!?.-]+', ' ', text_cleaned)
    # make sure we didn't introduce any double spaces
    text_cleaned = re.sub('\s+',' ',text_cleaned)
    return(text_cleaned)

In [134]:
df_twitter_gender_cleaned = df_twitter_gender.copy()
df_twitter_gender_cleaned['text'] = df_twitter_gender_cleaned['text'].apply(clean_tweet)

In [135]:
df_twitter_gender_cleaned

Unnamed: 0,gender,gender:confidence,text
0,male,1.0000,Robbie E Responds To Critics After Win Against...
1,male,1.0000,It felt like they were my friends and I was l...
2,male,0.6625,i absolutely adore when louis starts the songs...
3,male,1.0000,Hi - Looking at the url - do you use Don t typ...
4,female,1.0000,Watching Neighbours on Sky catching up with th...
...,...,...,...
20045,female,1.0000,"...Fine, and I ll drink tea too. I love you."
20046,male,1.0000,Greg Hardy you a good player and all but don t...
20047,male,1.0000,You can miss people and still never want to se...
20048,female,0.8489,i had noticed your tendency to pee on the car...


In [136]:
df_twitter_gender_cleaned.gender.unique()

array(['male', 'female', 'brand', 'unknown', nan], dtype=object)

In [137]:
# Only male and female
df_twitter_gender_cleaned = df_twitter_gender_cleaned[df_twitter_gender_cleaned['gender'].isin(['male', 'female'])]
df_twitter_gender_cleaned

Unnamed: 0,gender,gender:confidence,text
0,male,1.0000,Robbie E Responds To Critics After Win Against...
1,male,1.0000,It felt like they were my friends and I was l...
2,male,0.6625,i absolutely adore when louis starts the songs...
3,male,1.0000,Hi - Looking at the url - do you use Don t typ...
4,female,1.0000,Watching Neighbours on Sky catching up with th...
...,...,...,...
20045,female,1.0000,"...Fine, and I ll drink tea too. I love you."
20046,male,1.0000,Greg Hardy you a good player and all but don t...
20047,male,1.0000,You can miss people and still never want to se...
20048,female,0.8489,i had noticed your tendency to pee on the car...


In [138]:
df_twitter_gender_cleaned.gender.unique()

array(['male', 'female'], dtype=object)

In [139]:
# Only confidence = 1
df_twitter_gender_cleaned = df_twitter_gender_cleaned[df_twitter_gender_cleaned['gender:confidence']==1]
df_twitter_gender_cleaned

Unnamed: 0,gender,gender:confidence,text
0,male,1.0,Robbie E Responds To Critics After Win Against...
1,male,1.0,It felt like they were my friends and I was l...
3,male,1.0,Hi - Looking at the url - do you use Don t typ...
4,female,1.0,Watching Neighbours on Sky catching up with th...
5,female,1.0,"Ive seen people on the train with lamps, chair..."
...,...,...,...
20044,female,1.0,Need A Ride Home From Practice And its Raining...
20045,female,1.0,"...Fine, and I ll drink tea too. I love you."
20046,male,1.0,Greg Hardy you a good player and all but don t...
20047,male,1.0,You can miss people and still never want to se...


In [141]:
# None Values
df_twitter_gender_cleaned.isnull().values.any()

False

In [152]:
# Training a model using CountVectorizer

count_vectorizer = CountVectorizer()
x = count_vectorizer.fit_transform(df_twitter_gender_cleaned['text'])

encoder = LabelEncoder()
y = encoder.fit_transform(df_twitter_gender_cleaned['gender'])

# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

print('Traing set:', x_train.shape)
print('Test set', x_test.shape)

nb = MultinomialNB()
nb.fit(x_train, y_train)

print(nb.score(x_test, y_test))

Traing set: (8016, 16964)
Test set (2004, 16964)
0.6272455089820359


In [165]:
# Training a model using TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df_twitter_gender_cleaned['text'])

encoder = LabelEncoder()
y = encoder.fit_transform(df_twitter_gender_cleaned['gender'])

# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

print('Traing set:', x_train.shape)
print('Test set', x_test.shape)

nb = MultinomialNB()
nb.fit(x_train, y_train)

print(nb.score(x_test, y_test))

Traing set: (8016, 16964)
Test set (2004, 16964)
0.6327345309381237


In [166]:
# Training a model using TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df_twitter_gender_cleaned['text'])

encoder = LabelEncoder()
y = encoder.fit_transform(df_twitter_gender_cleaned['gender'])

# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

print('Traing set:', x_train.shape)
print('Test set', x_test.shape)

svm = SVC()
svm.fit(x_train, y_train)

print(svm.score(x_test, y_test))

Traing set: (8016, 16964)
Test set (2004, 16964)
0.592814371257485


In [159]:
# Training a model using TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df_twitter_gender_cleaned['text'])

encoder = LabelEncoder()
y = encoder.fit_transform(df_twitter_gender_cleaned['gender'])

# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

print('Traing set:', x_train.shape)
print('Test set', x_test.shape)

mlp = MLPClassifier()
mlp.fit(x_train, y_train)

print(mlp.score(x_test, y_test))

Traing set: (8016, 16964)
Test set (2004, 16964)
0.56187624750499


In [167]:
y_pred = nb.predict(x_test)

cm = confusion_matrix(y_pred, y_test)
print("\nConfusion matrix: \n")
print(cm)


Confusion matrix: 

[[938 279]
 [110 677]]


## Blog Authorship Corpus

In [168]:
df_blog_gender_original = pd.read_csv("/Users/vahidsj/Documents/Work/Work_MARTIN/Gender Bias/Datasets/blogText.csv")
df_blog_gender_original

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
...,...,...,...,...,...,...,...
681279,1713845,male,23,Student,Taurus,"01,July,2004","Dear Susan, I could write some really ..."
681280,1713845,male,23,Student,Taurus,"01,July,2004","Dear Susan, 'I have the second yeast i..."
681281,1713845,male,23,Student,Taurus,"01,July,2004","Dear Susan, Your 'boyfriend' is fuckin..."
681282,1713845,male,23,Student,Taurus,"01,July,2004","Dear Susan: Just to clarify, I am as..."
