# 1. Importing Dataset

In [1]:
import os
import json
import pandas as pd

In [2]:
annotation_folder = "/opt/datasets/mmf/datasets/hateful_memes/defaults/annotations"
detect_folder = "/opt/datasets/mmf/datasets/hateful_memes/defaults/web_detect"

race_info_folder = "/opt/datasets/race_entity_info/hatefulmemes"

In [3]:
train_annotation_df = pd.read_json(os.path.join(detect_folder, "web_detect_train_samples.jsonl"), orient="records", lines=True)
train_labels_df = pd.read_json(os.path.join(annotation_folder, "fine_grained", "train.json"), orient="records", lines=True)

In [4]:
val_annotation_df = pd.read_json(os.path.join(detect_folder, "web_detect_dev.jsonl"), orient="records", lines=True)
val_labels_df = pd.read_json(os.path.join(annotation_folder, "fine_grained", "dev_seen.json"), orient="records", lines=True)

In [5]:
train_annotation_df['index'] = train_annotation_df['index'].astype(int)
val_annotation_df['index'] = val_annotation_df['index'].astype(int)

In [6]:
race_df = pd.read_json(os.path.join(race_info_folder, "box_annos.race.json"))

# 2. Combining Datasets

## 2.1 Cleaning Labels

In [7]:
train_labels_df.head()

Unnamed: 0,id,set_name,img,text,gold_hate,gold_pc,gold_attack,pc,attack
0,42953,train,img/42953.png,its their character not their color that matters,[not_hateful],[pc_empty],[attack_empty],,
1,23058,train,img/23058.png,don't be afraid to love again everyone is not ...,[not_hateful],[pc_empty],[attack_empty],,
2,13894,train,img/13894.png,putting bows on your pet,[not_hateful],[pc_empty],[attack_empty],,
3,37408,train,img/37408.png,i love everything and everybody! except for sq...,[not_hateful],[pc_empty],[attack_empty],,
4,82403,train,img/82403.png,"everybody loves chocolate chip cookies, even h...",[not_hateful],[pc_empty],[attack_empty],,


In [8]:
train_labels_df['gold_hate'] = train_labels_df['gold_hate'].apply(lambda x: x[0])
train_labels_df.drop(columns=['pc', 'attack', 'set_name'], inplace=True)
train_labels_df.set_index('id', inplace=True)

val_labels_df['gold_hate'] = val_labels_df['gold_hate'].apply(lambda x: x[0])
val_labels_df.drop(columns=['pc', 'attack', 'set_name'], inplace=True)
val_labels_df.set_index('id', inplace=True)

In [9]:
train_labels_df.head()

Unnamed: 0_level_0,img,text,gold_hate,gold_pc,gold_attack
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42953,img/42953.png,its their character not their color that matters,not_hateful,[pc_empty],[attack_empty]
23058,img/23058.png,don't be afraid to love again everyone is not ...,not_hateful,[pc_empty],[attack_empty]
13894,img/13894.png,putting bows on your pet,not_hateful,[pc_empty],[attack_empty]
37408,img/37408.png,i love everything and everybody! except for sq...,not_hateful,[pc_empty],[attack_empty]
82403,img/82403.png,"everybody loves chocolate chip cookies, even h...",not_hateful,[pc_empty],[attack_empty]


In [10]:
print(train_labels_df.shape, val_labels_df.shape)

(8500, 5) (500, 5)


In [11]:
train_labels_df = train_labels_df[train_labels_df.index.isin(train_annotation_df['index'].unique())]
val_labels_df = val_labels_df[val_labels_df.index.isin(val_annotation_df['index'].unique())]

In [12]:
print(train_labels_df.shape, val_labels_df.shape)

(500, 5) (500, 5)


## 2.2 Cleaning Annotation

In [13]:
train_annotation_df.head()

Unnamed: 0,index,best_guess_labels,full_matches,partial_matches,entities,visually_similar
0,52743,[moteur turbo],[https://media.istockphoto.com/photos/performa...,[],"[{'score': 0.8115000129000001, 'description': ...",[http://www.fiches-auto.fr/sdoms/shiatsu/uploa...
1,37895,[car],[https://media.gettyimages.com/photos/women-fr...,[],"[{'score': 0.7287999988, 'description': 'Motor...",[https://melbournevipcashforcars.com.au/wp-con...
2,29703,[eyewear],[],[],"[{'score': 0.5329114199, 'description': 'Produ...",[https://l450v.alamy.com/450v/wa9bt2/croquet-i...
3,75294,[alexandria ocasio cortez in action],[https://i.insider.com/5c068d3d0e2bf860a654087...,[],"[{'score': 14.998500824, 'description': 'Alexa...",[https://odaction.com/wp-content/uploads/2019/...
4,60451,[photo caption],[],[],"[{'score': 0.4034947753, 'description': 'Produ...",[https://i.kym-cdn.com/photos/images/original/...


In [14]:
val_annotation_df.head()

Unnamed: 0,index,best_guess_labels,full_matches,partial_matches,entities,visually_similar
0,8291,[transgender kids],[https://i0.wp.com/post.healthline.com/wp-cont...,[],"[{'score': 0.5564999580000001, 'description': ...",[https://i.insider.com/5ddc180079d7570dad3b564...
1,46971,[glasses],[https://media.istockphoto.com/photos/end-of-t...,[],"[{'score': 1.0561499596, 'description': 'Fried...",[https://amc-theatres-res.cloudinary.com/image...
2,3745,[desk],[],[],"[{'score': 0.5328160524000001, 'description': ...",[http://static-15.sinclairstoryline.com/resour...
3,83745,[],[https://www.history.com/.image/ar_4:3%2Cc_fil...,[],"[{'score': 0.7148000002, 'description': ''}, {...",[https://pbs.twimg.com/media/E2PUeD8XoBIprYI.j...
4,80243,[monochrome photography],[https://media.gettyimages.com/photos/this-pic...,[],"[{'score': 0.7729064226, 'description': 'Black...",[https://landmarksdekalbal.org/wp-content/uplo...


In [15]:
train_annotation_df[train_annotation_df['best_guess_labels'].apply(len) > 1]

Unnamed: 0,index,best_guess_labels,full_matches,partial_matches,entities,visually_similar


In [16]:
val_annotation_df[val_annotation_df['partial_matches'].apply(len) > 0]

Unnamed: 0,index,best_guess_labels,full_matches,partial_matches,entities,visually_similar


In [17]:
val_annotation_df[val_annotation_df['best_guess_labels'].apply(len) > 1]

Unnamed: 0,index,best_guess_labels,full_matches,partial_matches,entities,visually_similar


In [18]:
train_annotation_df['entities'] = train_annotation_df['entities'].apply(lambda entities: [x['description'] for x in entities])
train_annotation_df['best_guess_labels'] = train_annotation_df['best_guess_labels'].apply(lambda x: x[0])

val_annotation_df['entities'] = val_annotation_df['entities'].apply(lambda entities: [x['description'] for x in entities])
val_annotation_df['best_guess_labels'] = val_annotation_df['best_guess_labels'].apply(lambda x: x[0])

train_annotation_df.drop(columns=["full_matches", "partial_matches", "visually_similar"], inplace=True)
val_annotation_df.drop(columns=["full_matches", "partial_matches", "visually_similar"], inplace=True)

In [19]:
train_annotation_df.set_index("index", inplace=True)
val_annotation_df.set_index("index", inplace=True)

In [20]:
train_annotation_df.head()

Unnamed: 0_level_0,best_guess_labels,entities
index,Unnamed: 1_level_1,Unnamed: 2_level_1
52743,moteur turbo,"[Car, Engine, Dodge Durango, Engine Oil, Turbo..."
37895,car,"[Motorcycle, , Harley-Davidson, Daytona Beach ..."
29703,eyewear,"[Product design, Brand, Product, Smiley, Line,..."
75294,alexandria ocasio cortez in action,"[Alexandria Ocasio-Cortez, Politics, 2019 Wome..."
60451,photo caption,"[Product, Photo caption]"


## 2.3 Clean Race

In [21]:
race_df.head()

Unnamed: 0,img_name,boxes_and_score
0,01235.png,"[{'ymin': 0.268042504787445, 'xmin': 0.3950905..."
1,01236.png,"[{'ymin': 0.655592083930969, 'xmin': 0.1557954..."
2,01243.png,"[{'ymin': 0.22486409544944702, 'xmin': 0.21732..."
3,01245.png,"[{'ymin': 0.276416897773742, 'xmin': 0.5804585..."
4,01247.png,"[{'ymin': 0.161682158708572, 'xmin': 0.2477303..."


In [22]:
race_df['img_name'] = race_df['img_name'].apply(lambda x: int(x.split('.')[0]))

race_df['gender'] = race_df['boxes_and_score'].apply(lambda row: [x['gender'] for x in row if x['gender'] != None])
race_df['race'] = race_df['boxes_and_score'].apply(lambda row: [x['race'] for x in row if x['race'] != None])

In [23]:
race_df.drop(columns=['boxes_and_score'], inplace=True)
race_df.set_index('img_name', inplace=True)
race_df.head()

Unnamed: 0_level_0,gender,race
img_name,Unnamed: 1_level_1,Unnamed: 2_level_1
1235,[Male],[Middle Eastern]
1236,"[Male, Male, Male, Male]","[Indian, Latino_Hispanic, Latino_Hispanic, Lat..."
1243,[],[]
1245,"[Male, Male]","[White, White]"
1247,[Male],[White]


## 2.4 Combining Dataframes

In [24]:
train_combined_df = pd.merge(train_labels_df, train_annotation_df, left_index=True, right_index=True)
train_combined_df = pd.merge(train_combined_df, race_df, left_index=True, right_index=True)
train_combined_df.sort_index(inplace=True)
train_combined_df.head()

Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race
1395,img/01395.png,i just found out mexican babies come out just ...,hateful,[nationality],[dehumanizing],pregnancy painful labour,"[Childbirth, Pregnancy, Pain, Pain management ...","[Female, Male, Male, Female, Female]","[White, White, White, White, White]"
1487,img/01487.png,when you forget you're retarded,hateful,[disability],[inferiority],sirius,"[Poster, Font]","[Female, Female]","[White, White]"
1497,img/01497.png,now back to the married islamic classic.. to c...,hateful,[religion],[dehumanizing],photo caption,"[, Nangalam, Getty Images, Image, Photo captio...","[Male, Male, Male]","[Indian, White, Indian]"
1579,img/01579.png,"muslim figure: ""we must have pork-free menus o...",hateful,[religion],[exclusion],با حجاب در آمریکا,"[United States, , Islam in the United States, ...","[Female, Male, Female, Male, Male, Male, Female]","[Latino_Hispanic, Black, White, Black, Black, ..."
2158,img/02158.png,i like my women like i like my chickens held i...,hateful,[sex],"[inciting_violence, dehumanizing]",fauna,"[Typhoid fever, Fowl, Birds, Barnevelder, Poli...",[],[]


In [25]:
# Ensure that all records are kept
print(train_labels_df.shape, train_combined_df.shape)

(500, 5) (500, 9)


In [26]:
val_combined_df = pd.merge(val_labels_df, val_annotation_df, left_index=True, right_index=True)
val_combined_df = pd.merge(val_combined_df, race_df, left_index=True, right_index=True)
val_combined_df.sort_index(inplace=True)
val_combined_df.head()

Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race
1268,img/01268.png,father and daughter bonding time is the best l...,not_hateful,[pc_empty],[attack_empty],photo caption,"[Human behavior, Photo caption, Product, Font,...","[Male, Male, Male]","[White, White, White]"
1456,img/01456.png,they see them rollin..... they hating..,hateful,[religion],[mocking],poster,"[Heinrich Himmler, La maldad: raíces antropoló...","[Male, Male, Male, Female, Male, Female, Male,...","[White, White, White, Middle Eastern, White, W..."
1726,img/01726.png,a real man loads the dishwasher every night,hateful,[sex],[dehumanizing],خودکشی مجری رادیو مشهد,"[Health, Human sexual activity, Sleep, Reprodu...",[],[]
1742,img/01742.png,in just one hour from now i'll only have 4 hou...,not_hateful,[pc_empty],[attack_empty],Computer,"[Computer security, Computer, Computer Monitor...",[Male],[East Asian]
1765,img/01765.png,"after a girl dies, what organ in her body stay...",hateful,"[race, sex]",[inciting_violence],poster,"[Stock photography, , Getty Images, Image, Pho...","[Male, Male]","[Black, Black]"


In [27]:
# Ensure that all records are kept
print(val_labels_df.shape, val_combined_df.shape)

(500, 5) (500, 9)


## 2.6 Saving Combined DataFrames

In [28]:
train_combined_df.to_csv(os.path.join("..","..","annotations", "train_sampled_combined.csv"))
val_combined_df.to_csv(os.path.join("..","..","annotations", "dev_sampled_combined.csv"))

## 2.7 Adding Manual Labels

In [29]:
val_labels = ",,female,,black&LGBT,middle east&muslim&LGBT,middle east,,,,,,black,,black,female,black,female,,,,,,,black,,,,,female,,,,white,,,,muslim&LGBT,male,,,LGBT,,,,,,???&???,black,female,LGBT,black,,,,,,,,,middle east (iraqi),,black,hispanic/latino,,,,LGBT,LGBT,female,,female,,,female&intellectual disability,???&???,,,,,islam&middle east,,,,,,,,,,black,,,,,white,,,white,,,,,,,white or black,,black,,middle east&islam,,,,,,,female,,,,,black,male,,,,,,,,white,???&male,,,,LGBT,,,,judaism&middle east,,,,,female&leg impairment,,LGBT,,,white,,black,,,,,black,,LGBT,,,black,,,,,,,,female,black,,,,,,female,,,,,,female,,,,LGBT,,,,,,LGBT,,,,female,,,black,,,,,,,,LGBT,black,,,LGBT,,,,???,,,LGBT,female,,,,,,,???,,,,female,,,,,,female&asia,,,,,,,,black,,hispanic/latino (mexican),,female,black,black&???,,,,,,,,,female,,,black,,,black,,,,,,white,,,,,female,,,,,,,white,hispanic/latino,black,,,,,,,,,chinese&east asian,,,,,,,,,,black,,,,,,,,,black,,,german&judaism (???),,,black,,,,,,,,,,,,,???,,,,,,,LGBT,white&male,,,black,,female,white,,black&islam,,,female&leg impairment,,black,,,,,,,,,black,,black,,female,,,,,LGBT&leg impairment,,,,female,,,,,,female,,,,,,,,,,,,black,black,,,,,???,,,,,female,,,,middle east,,middle east&islam,,???,,,,,,,,,christian&male,,female,,,black,,middle east (iraqi),,,,,,,,,hispanic/latino,black&down syndrome,black,,,female,,,,,,,,,,female,,black&islam,,female,,black&islam,,,,black,,,,,female,,,,black&female,,,,,,,,,,LGBT,female,,white,black,,,,white,,,,,,,,,black,"
val_difficulties = ",,,,context_medium,context_difficult,text_easy,,,,,,context_medium,,context_medium,,context_medium,,,,,,,,text_easy,,,,,,,,,context_medium,,,,,,,,,,,,,,,context_medium,,,context_medium,,,,,,,,,text_easy&context_medium,,context_medium,context_medum,,,,,,,,,,,,,,,,,context_medium,,,,,,,,,,context_medium,,,,,context_medium,,,knowledge_hard,,,,,,,context_medium,,context_medium,,context_medium,,,,,,,,,,,,context_medium,,,,,,,,,context_medium& knowledge_hard,context_medium,,,,,,,,context_medium,,,,,,,,,,context_medium,,context_medium,,,,,context_medium,,,,,context_medium,,,,,,,,,context_medium,,,,,,,,,,,,,,,,,,,,,,,,,,,,,context_medium,,,,,,,,,context_medium,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,context_medium,,knowledge_hard,,,context_medium,text_easy,,,,,,,,,,,,context_medium,,,context_medium,,,,,,context_medium,,,,,,,,,,,,context_medium& knowledge_hard,context_medium& knowledge_hard,text_easy,,,,,,,,,text_easy,,,,,,,,,,context_medium,,,,,,,,,context_medium,,,,,,context_medium,,,,,,,,,,,,,,,,,,,,,,,,context_medium,,,text_easy,,context_medium,,,,,context_medium,,,,,,,,,context_medium,,context_medium,,,,,,,,,,,,,,,,,,,,,,,,,,,,,context_medium,context_medium,,,,,,,,,,,,,,knowledge_hard,,context_medium,,,,,,,,,,,,,,,,context_medium,,knowledge_hard,,,,,,,,,context_medium,context_medium,context_medium,,,,,,,,,,,,,,,context_medium,,,,context_medium,,,,context_medium,,,,,,,,,context_medium,,,,,,,,,,,,,context_medium,context_medium,,,,knowledge_hard,,,,,,,,,context_medium,"

train_labels = "mexico,intellectual disability,muslim,muslim,female,autism,dwarfism,black,muslim,black,illegal immigrant,down syndrome,down syndrome,illegal immigrant,autism or  intellectual disability or down syndrome,india,rheumatoid arithritis (inflammatory arthritis),black,black,LGBT,dwarfism,amyotrophic lateral sclerosis (motor disease),muslim,black,east asia,???,amyotrophic lateral sclerosis (motor disease),mexico,female,jews,muslim,muslim,jews (???),black,white,female,illegal immigrant,christianity,muslim,???,black,down syndrome,LGBT,conjoined twins,LGBT,LGBT,mexico,muslim or  jews,black,muslim,female,french,???,muslim,jews,black,???,LGBT,???,east asia (korea),intellectual disability,catholic christian,middle east (jews),conjoined twins,black,black,LGBT,down syndrome,american indian,romamians,muslim,mexico,muslim,muslim,amyotrophic lateral sclerosis (motor disease),limb impairment,hearing impairment,east asia (chinese),Parkinson's disease,LGBT,female,jews,refugee,intellectual disability,intellectual disability,limb impairment,mexico,mexico,limb impairment,stroke,mexico,LGBT,LGBT,muslim,middle east (arab),middle east (iraqi),muslim,limb impairment,muslim,muslim,israel,white,black,facial impairment,muslim,diabetes,???,white,down syndrome,black,muslim,down syndrome,rheumatoid arithritis,black,LGBT,jews,LGBT,muslim,muslim,muslim,LGBT,white,female,black,limb impairment,iraqi,muslim,intellectual disability,refugee,leprosy,female,muslim,mexico,LGBT,black,mexico,down syndrome,american,female,black,japan,jews (???),iraqi,refugee,LGBT,refugee,LGBT,limb impairment,Mislabeled,LGBT,jews (???),vision impairment,black,mexico,down syndrome,east asia (chinese),africa,american,muslim,muslim,mexico,black,limb impairment,muslim,??? (criminials),LGBT,muslim,black,conjoined twins,black,mexico,american indian,jews,muslim,rheumatoid arithritis,???,muslim,black,america,mexico,???,female,muslim,female,black,dwarfism,muslim,LGBT,cancer (leukemia) (cancer),???,??? (america0,mexico,mexico,muslim,autism,female,mexico,mexico,LGBT,??? (white),LGBT,japan,south korea,jews,muslim,LGBT,black,black,??? (muslim),??? (black),muslim,intellectual disability,black,female,africa,mexico,??? (muslim),muslim,autism,black,female,???,female,LGBT,japan,jews,jews,down syndrome,white,refugee,female,???,depression,mexico,jews,muslim,???,???,muslim,cancer (leukemia) (cancer),jews,refugee,refugee (middle east),stammering,jews,ptosis,???,africa,female,mexico,intellectual disability,illegal immigrant,female,LGBT,asia (chinese),jews,illegal immigrant,down syndrome,??? (israel),muslim,LGBT,female,handicap,???,LGBT,jews,female,female,black,black,LGBT,LGBT,autism,stroke,dyslexic,LGBT,mexico,black,white,dwarfism,black,rheumatoid arithritis,black,LGBT,mexico,muslim,female,muslim,female,limb impairment,LGBT,down syndrome,vision impairment,LGBT,muslim,black,jews,muslim,black,refugee,??? (muslim),??? (europe),black,jews,??? (India/africas),LGBT,jews,female,muslim,LGBT,LGBT,black,jews,LGBT,female,???,female,black,black,on the verge of dying,black,north korean,black,limb impairment,muslim,black,female,germans,female,LGBT,iraqi,LGBT,LGBT,muslim,???,limb impairment,black,epilepsy,??? (black),female,black,???,jews,down syndrome,muslim,female,??? (america),black,illegal immigrant,catholic christian,mexico,american,black,africa,???,LGBT,jews,black,paralysis,jews,epilepsy,down syndrome,muslim,africa,mexico,black,LGBT,canada,muslim,muslim,mexico,down syndrome,american,cancer,LGBT,jews,mexico,autism,iraqi,jews,intellectual disability,LGBT,female,???,intellectual disability,down syndrome,LGBT,muslim,LGBT,jews,refugee,down syndrome,mexico,refugee,???,muslim,??? (europes),paralysis,conjoined twins,black,LGBT,female,LGBT,german,vision impairment,???,epilepsy,middle east (arab),hearing impairment,muslim,black,black,white,intellectual disability,black,LGBT,black,africa,black,black,LGBT,vision impairment,muslim,jews,???,LGBT,muslim,LGBT,LGBT,female,black,white,intellectual disability,???,female,Mislabeled,???,female,iraqi,??? (white police),muslim,LGBT,mexico,LGBT,female,muslim,intellectual disability,american indian,LGBT,black,africa,black,muslim,catholic christian,muslim,LGBT,female,europe,LGBT,hearing impairment,muslim,female,white,???,female,muslim,limb impairment,???,american,female,illegal immigrant,dwarfism,muslim,muslim,german,black,black,intellectual disability,down syndrome,???,jews,europe,asian,LGBT,muslim,down syndrome,LGBT,black,muslim,middle east (african),hearing impairment,mexico,???,black,???,muslim,muslim,black,???,LGBT"
train_difficulties = ",,,,text_easy,,,context_medium,,context_medium,easy,,,hard,hard,,,context_medium& knowledge_hard,text_easy&context_medium,text_easy,,,,easy_text&knowledge_hard,knowledge_hard,,,,text_easy,,,,,knowledge_hard,,context_medium,easy,easy&hard,,,knowledge_hard,easy&hard,context_medium,,text_easy,text_easy,,,text_easy,,text_easy,,,,,text_easy,,text_easy,hard,knowledge_hard,,,text_easy,hard,text_easy,text_easy,text_easy,,easy_text& medium_context,,,,,,,,,easy_text,,text_easy,context_medium,,easy,,hard,,,,hard,,,text_easy,text_easy,,context_medium& knowledge_hard,,,,,,,text_image,text_image,,,,,context_medium,,context_medium&knowledge_hard,,,,context_medium& knowledge_hard,text_easy,,text_easy,,,,context_medium,text_easy,context_medium,text_easy,,,,,easy& hard,,context_medium,,,context_medium,knowledge_hard,,,easy,text_easy,text_easy,,,,easy,context_medium,hard,context_medium,,,context_medium,,,text_easy,hard,,context_medium& knowledge_hard,easy,,,,hard,text_easy,,,context_medium,text_easy&context_medium,,context_medium,,context_medium& knowledge_hard,,,,,,,,context_medium,,,hard,context_medium&knowledge_hard,,context_medium,text_easy,,,text_easy,easy,,hard,,,,,context_medium& knowledge_hard,,,text_easy,,text_easy,,,,,knowledge_hard,text_easy,text_easy,,,,,knowledge_hard,context_medium,,,,,,context_medium,text_easy?,,context_medium,context_medium,hard,,,,text_easy,easy,context_medium,,,,,,hard,,,easy,,easy,hard,,,hard,,,context_medium,,,easy,context_medium,text_easy,,,easy,,hard,,text_easy,text_easy&context_medium,easy,,text_easy,,text_easy,context_medium,text_easy& knowledge_hard,text_easy,text_easy?,text_easy,,,,text_easy,,text_easy,text_easy,,context_medium,,knowledge_hard,context_medium,,,context_medium,,context_medium,,knowledge_hard,,,text_easy,,context_medium,,,knowledge_hard,easy,,,text_easy,,easy&hard,text_easy,,context_medium,,context_medium,text_easy,context_medium,,text_easy,text_easy,,context_medium,text_easy,text_easy,easy,knowledge_hard,,text_easy,,,context_medium,context_medium,,context_medium,text_easy,,text_easy,knowledge_hard,,,,context_medium,,,text_easy,context_medium,,,,,text_easy,???,context_medium,easy,easy,,,context_medium,,,text_easy,,text_easy,,,,easy,,,,context_medium,text_easy,,,,,,easy&hard,hard,text_easy,,,,,,easy,text_easy,context_medium,,,,text_easy,,text_easy,,easy,,easy,easy,,,hard,,,context_medium& knowledge_hard,context_medium,text_easy,text_easy,,,,,text_easy,,,text_easy,context_medium,text_easy,,context_medium& knowledge_hard,context_medium,text_easy,easy,context_medium,text_easy,text_easy,,,,,context_medium,,text_easy,text_easy,text_easy,context_medium,text_easy,,,text_easy,,,text_easy,,,,text_easy,,text_easy,context_medium,,,text_easy,context_medium,context_medium,,text_easy,,easy,,text_easy,context_medium,,text_easy,,,text_easy,text_easy,,text_easy,,,,,text_easy,easy,,,,,context_medium,text_easy,,,,,,context_medium,text_easy,,medium,text_easy,knowledge_hard,,context_medium& knowledge_hard,,hard,,text_easy,,,,context_medium,,text_easy"

In [30]:
train_labels, val_labels = train_labels.split(','), val_labels.split(',')
val_difficulties, train_difficulties = val_difficulties.split(','), train_difficulties.split(',')

print(len(train_labels), len(val_labels))
print(len(train_difficulties), len(val_difficulties))

500 500
500 500


In [31]:
train_combined_df['manual_label'] = train_labels
val_combined_df['manual_label'] = val_labels

train_combined_df['difficulty_level'] = train_difficulties
val_combined_df['difficulty_level'] = val_difficulties

In [32]:
train_combined_df['manual_label'] = train_combined_df['manual_label'].apply(lambda x: x.split('&'))
val_combined_df['manual_label'] = val_combined_df['manual_label'].apply(lambda x: x.split('&'))

In [33]:
train_combined_df.head()

Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level
1395,img/01395.png,i just found out mexican babies come out just ...,hateful,[nationality],[dehumanizing],pregnancy painful labour,"[Childbirth, Pregnancy, Pain, Pain management ...","[Female, Male, Male, Female, Female]","[White, White, White, White, White]",[mexico],
1487,img/01487.png,when you forget you're retarded,hateful,[disability],[inferiority],sirius,"[Poster, Font]","[Female, Female]","[White, White]",[intellectual disability],
1497,img/01497.png,now back to the married islamic classic.. to c...,hateful,[religion],[dehumanizing],photo caption,"[, Nangalam, Getty Images, Image, Photo captio...","[Male, Male, Male]","[Indian, White, Indian]",[muslim],
1579,img/01579.png,"muslim figure: ""we must have pork-free menus o...",hateful,[religion],[exclusion],با حجاب در آمریکا,"[United States, , Islam in the United States, ...","[Female, Male, Female, Male, Male, Male, Female]","[Latino_Hispanic, Black, White, Black, Black, ...",[muslim],
2158,img/02158.png,i like my women like i like my chickens held i...,hateful,[sex],"[inciting_violence, dehumanizing]",fauna,"[Typhoid fever, Fowl, Birds, Barnevelder, Poli...",[],[],[female],text_easy


# 3. Cleaning Text

In [34]:
import contractions

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(sent):
    sent = sent.replace('\"', '')
    sent = contractions.fix(sent)
    sent = sent.lower()

    word_tokens = word_tokenize(sent)
    
    filtered_sentence = [lemmatizer.lemmatize(word) for word in word_tokens]

    filtered_sentence = [
        "".join(c for c in token if c not in ("?", ".", ";", ":", "!"))
        for token in filtered_sentence
    ]
    
    return filtered_sentence

In [35]:
def get_specific_label(row, label):
    pos = row['gold_pc'].index(label)
    return row['manual_label'][pos]

In [36]:
val_combined_df.loc[83045]['best_guess_labels']

'117th congress women'

In [37]:
clean_text(val_combined_df.loc[83045]['best_guess_labels'])

['117th', 'congress', 'woman']

In [38]:
train_combined_df['text_cleaned'] = train_combined_df['text'].apply(clean_text)
train_combined_df['best_guess_labels_cleaned'] = train_combined_df['best_guess_labels'].apply(clean_text)
train_combined_df['entities_cleaned'] = train_combined_df['entities'].apply(lambda x: clean_text(' '.join(x)))

In [39]:
val_combined_df['text_cleaned'] = val_combined_df['text'].apply(clean_text)
val_combined_df['best_guess_labels_cleaned'] = val_combined_df['best_guess_labels'].apply(clean_text)
val_combined_df['entities_cleaned'] = val_combined_df['entities'].apply(lambda x: clean_text(' '.join(x)))

# 4. Check Distribution

In [40]:
hateful_df = val_combined_df[val_combined_df['gold_hate'] == 'hateful']
hateful_df.head()

Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned
1456,img/01456.png,they see them rollin..... they hating..,hateful,[religion],[mocking],poster,"[Heinrich Himmler, La maldad: raíces antropoló...","[Male, Male, Male, Female, Male, Female, Male,...","[White, White, White, Middle Eastern, White, W...",[],,"[they, see, them, rollin, , , they, hating]",[poster],"[heinrich, himmler, la, maldad, , raíces, antr..."
1726,img/01726.png,a real man loads the dishwasher every night,hateful,[sex],[dehumanizing],خودکشی مجری رادیو مشهد,"[Health, Human sexual activity, Sleep, Reprodu...",[],[],[female],,"[a, real, man, load, the, dishwasher, every, n...","[خودکشی, مجری, رادیو, مشهد]","[health, human, sexual, activity, sleep, repro..."
1765,img/01765.png,"after a girl dies, what organ in her body stay...",hateful,"[race, sex]",[inciting_violence],poster,"[Stock photography, , Getty Images, Image, Pho...","[Male, Male]","[Black, Black]","[black, LGBT]",context_medium,"[after, a, girl, dy, ,, what, organ, in, her, ...",[poster],"[stock, photography, getty, image, image, phot..."
1796,img/01796.png,life hack #23 how to get stoned with no weed,hateful,"[race, religion, sex]",[inciting_violence],photograph,"[Photograph, Hug, Photo caption, Product, Frie...","[Female, Female, Female, Female, Female]","[Southeast Asian, Southeast Asian, Southeast A...","[middle east, muslim, LGBT]",context_difficult,"[life, hack, #, 23, how, to, get, stoned, with...",[photograph],"[photograph, hug, photo, caption, product, fri..."
1925,img/01925.png,i am not racist i just don't like brown sugar,hateful,[race],[contempt],Scholarship,[Amelia Lost: The Life and Disappearance of Am...,"[Female, Female, Female]","[Indian, Indian, Indian]",[middle east],text_easy,"[i, am, not, racist, i, just, do, not, like, b...",[scholarship],"[amelia, lost, , the, life, and, disappearance..."


## Sex Distribution

In [41]:
train_sex_df = train_combined_df[train_combined_df['gold_pc'].apply(lambda x: 'sex' in x)]
train_sex_df.shape

(100, 14)

In [42]:
train_sex_df['manual_label'] = train_sex_df['manual_label'].apply(lambda x: x[0])
train_sex_df['manual_label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sex_df['manual_label'] = train_sex_df['manual_label'].apply(lambda x: x[0])


LGBT          59
female        39
Mislabeled     2
Name: manual_label, dtype: int64

In [43]:
val_sex_df = hateful_df[hateful_df['gold_pc'].apply(lambda x: 'sex' in x)]
val_sex_df.shape

(56, 14)

In [44]:
val_sex_df['manual_label'] = val_sex_df.apply(lambda x: get_specific_label(x, 'sex'), axis=1)
val_sex_df['manual_label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_sex_df['manual_label'] = val_sex_df.apply(lambda x: get_specific_label(x, 'sex'), axis=1)


female    33
LGBT      18
male       5
Name: manual_label, dtype: int64

## 4.2 Race Distribution

In [45]:
train_race_df = train_combined_df[train_combined_df['gold_pc'].apply(lambda x: 'race' in x)]
train_race_df.shape

(100, 14)

In [46]:
train_race_df['manual_label'] = train_race_df['manual_label'].apply(lambda x: x[0])
train_race_df['manual_label'] = train_race_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)
train_race_df['manual_label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_race_df['manual_label'] = train_race_df['manual_label'].apply(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_race_df['manual_label'] = train_race_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)


black              62
???                16
white               9
middle east         5
east asia           4
american indian     3
asian               1
Name: manual_label, dtype: int64

In [47]:
val_race_df = hateful_df[hateful_df['gold_pc'].apply(lambda x: 'race' in x)]
val_race_df.shape

(78, 14)

In [48]:
val_race_df['manual_label'] = val_race_df.apply(lambda x: get_specific_label(x, 'race'), axis=1)
val_race_df['manual_label'] = val_race_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)
val_race_df['manual_label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_race_df['manual_label'] = val_race_df.apply(lambda x: get_specific_label(x, 'race'), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_race_df['manual_label'] = val_race_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)


black              42
white              11
middle east         9
???                 8
hispanic/latino     4
white or black      1
german              1
asia                1
east asian          1
Name: manual_label, dtype: int64

## 4.3 Religion Distribution

In [49]:
train_religion_df = train_combined_df[train_combined_df['gold_pc'].apply(lambda x: 'religion' in x)]
train_religion_df.shape

(100, 14)

In [50]:
train_religion_df['manual_label'] = train_religion_df['manual_label'].apply(lambda x: x[0])
train_religion_df['manual_label'] = train_religion_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)
train_religion_df['manual_label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_religion_df['manual_label'] = train_religion_df['manual_label'].apply(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_religion_df['manual_label'] = train_religion_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)


muslim                65
jews                  28
catholic christian     3
???                    2
christianity           1
muslim or  jews        1
Name: manual_label, dtype: int64

## 4.4 Nationality Distribution

In [51]:
train_nationality_df = train_combined_df[train_combined_df['gold_pc'].apply(lambda x: 'nationality' in x)]
train_nationality_df.shape

(100, 14)

In [52]:
train_nationality_df['manual_label'] = train_nationality_df['manual_label'].apply(lambda x: x[0])
train_nationality_df['manual_label'] = train_nationality_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)
train_nationality_df['manual_label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_nationality_df['manual_label'] = train_nationality_df['manual_label'].apply(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_nationality_df['manual_label'] = train_nationality_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)


mexico               29
???                  19
refugee              10
africa                7
illegal immigrant     7
iraqi                 5
american              5
japan                 3
europe                2
german                2
romamians             1
india                 1
north korean          1
asia                  1
french                1
israel                1
muslim                1
canada                1
america               1
south korea           1
germans               1
Name: manual_label, dtype: int64

## 4.5 Disability Distribution

In [53]:
train_disability_df = train_combined_df[train_combined_df['gold_pc'].apply(lambda x: 'disability' in x)]
train_disability_df.shape

(100, 14)

In [54]:
train_disability_df['manual_label'] = train_disability_df['manual_label'].apply(lambda x: x[0])
train_disability_df['manual_label'] = train_disability_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)
train_disability_df['manual_label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_disability_df['manual_label'] = train_disability_df['manual_label'].apply(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_disability_df['manual_label'] = train_disability_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)


down syndrome                                          18
intellectual disability                                13
limb impairment                                        11
???                                                     8
dwarfism                                                5
autism                                                  5
vision impairment                                       4
rheumatoid arithritis                                   4
hearing impairment                                      4
conjoined twins                                         4
amyotrophic lateral sclerosis                           3
cancer                                                  3
epilepsy                                                3
stroke                                                  2
paralysis                                               2
handicap                                                1
diabetes                                                1
stammering    

# 3. Sex Analysis

In [55]:
import contractions
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()

def lemmatize(sent):
    words = [lemmatizer.lemmatize(word) for word in sent.split()]
    return words

In [56]:
categories = (
    ("male", ("he", "man", "men", "boy", "male")),
    ("female", ("she", "woman", "woman", "girl", "feminist", "female", "dishwasher", "dish-washer", "sandwichmaker", "sandwich-maker", "sandwich")),
    ("LGBT", ("lgbt", "gay", 'bisexual', 'queer')) #LGBT
)

special_kws = [
    ("LGBT", "tran")
]

def assign_gender(sent):
    keywords, labels  = [], []
    for t in sent:
        for cat, kws in categories:
            if t in kws:
                keywords.append(t)
                labels.append(cat)
        
        for cat, special_kw in special_kws:
            if special_kw in t:
                keywords.append(t)
                labels.append(cat)
    
    return labels

def predict_gender(combined_cat):
    from collections import Counter
    
    if len(combined_cat) == 0:
        return "Unidentified"

    if "LGBT" in combined_cat:
        return "LGBT"
    
    counter = Counter(combined_cat)
    elements = counter.most_common(2)
    
    if len(elements) == 1 or elements[0][1] > elements[1][1]:
        return elements[0][0]
    
    return "Mixed"
        
        
#     return combined_series.apply(lambda x: "LGBT" if "LGBT" in x else max(set(lst), key=lst.count))

## 3.1 Sex Analysis - Train

In [57]:
train_sex_df = train_combined_df[train_combined_df['gold_pc'].apply(lambda x: 'sex' in x)]
train_sex_df.shape

(100, 14)

In [58]:
train_sex_df['manual_label'] = train_sex_df['manual_label'].apply(lambda x: x[0])
train_sex_df['manual_label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sex_df['manual_label'] = train_sex_df['manual_label'].apply(lambda x: x[0])


LGBT          59
female        39
Mislabeled     2
Name: manual_label, dtype: int64

In [59]:
assign_gender(' '.join(train_sex_df.loc[9352]['entities']))

[]

In [60]:
train_sex_df['text_cat'] = train_sex_df['text_cleaned'].apply(assign_gender)
train_sex_df['entities_cat'] = train_sex_df['entities_cleaned'].apply(assign_gender)
train_sex_df['best_guess_cat'] = train_sex_df['best_guess_labels_cleaned'].apply(assign_gender)

train_sex_df['combined_cat'] = train_sex_df['text_cat'] + train_sex_df['entities_cat'] + train_sex_df['best_guess_cat']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sex_df['text_cat'] = train_sex_df['text_cleaned'].apply(assign_gender)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sex_df['entities_cat'] = train_sex_df['entities_cleaned'].apply(assign_gender)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sex_df['best_guess_cat'] = train_sex_

In [61]:
train_sex_df['pred_label'] = train_sex_df['combined_cat'].apply(predict_gender)
# train_sex_df.drop(columns=['text_cat', 'entities_cat', 'best_guess_cat', 'gender', 'race'], inplace=True)
train_sex_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sex_df['pred_label'] = train_sex_df['combined_cat'].apply(predict_gender)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,text_cat,entities_cat,best_guess_cat,combined_cat,pred_label
2158,img/02158.png,i like my women like i like my chickens held i...,hateful,[sex],"[inciting_violence, dehumanizing]",fauna,"[Typhoid fever, Fowl, Birds, Barnevelder, Poli...",[],[],female,text_easy,"[i, like, my, woman, like, i, like, my, chicke...",[fauna],"[typhoid, fever, fowl, bird, barnevelder, poli...",[female],[],[],[female],female
3794,img/03794.png,tranny jesus died for your eyebrows,hateful,[sex],[slurs],community,"[Festival M., Community, Festival, Recreation,...","[Female, Male, Male, Male, Male]","[Black, Middle Eastern, White, White, Middle E...",LGBT,text_easy,"[tranny, jesus, died, for, your, eyebrow]",[community],"[festival, m, community, festival, recreation,...",[LGBT],[LGBT],[],"[LGBT, LGBT]",LGBT
6482,img/06482.png,i dont always make sandwiches because im not a...,hateful,[sex],[dehumanizing],photo caption,"[, Photo caption, Illustration, Portugal, Heal...","[Male, Male, Male]","[Middle Eastern, Middle Eastern, Middle Eastern]",female,text_easy,"[i, do, not, always, make, sandwich, because, ...","[photo, caption]","[photo, caption, illustration, portugal, healt...","[female, female]",[male],[],"[female, female, male]",female
8275,img/08275.png,the pimp hand strong enough for man but made f...,hateful,[sex],[inciting_violence],friendship,"[Stock photography, , Getty Images, Photo capt...","[Male, Female, Female]","[White, Latino_Hispanic, Latino_Hispanic]",female,context_medium,"[the, pimp, hand, strong, enough, for, man, bu...",[friendship],"[stock, photography, getty, image, photo, capt...",[male],[female],[],"[male, female]",Mixed
9352,img/09352.png,they will soon be extinct,hateful,[sex],"[inciting_violence, dehumanizing]",pride month,"[Gay pride, LGBT community, Queer, Rainbow fla...","[Female, Female, Female, Female, Female]","[White, White, White, White, White]",LGBT,context_medium,"[they, will, soon, be, extinct]","[pride, month]","[gay, pride, lgbt, community, queer, rainbow, ...",[],"[LGBT, LGBT, LGBT, LGBT, LGBT, LGBT]",[],"[LGBT, LGBT, LGBT, LGBT, LGBT, LGBT]",LGBT


In [62]:
print(train_sex_df[train_sex_df['combined_cat'].apply(len) != 0].shape)
print(train_sex_df[train_sex_df['combined_cat'].apply(len) == 0].shape)

(79, 19)
(21, 19)


In [63]:
print(train_sex_df[train_sex_df['pred_label'] == train_sex_df['manual_label']].shape)
print(train_sex_df[train_sex_df['pred_label'] != train_sex_df['manual_label']].shape)

(73, 19)
(27, 19)


In [64]:
mislabel_df = train_sex_df[(train_sex_df['pred_label'] != train_sex_df['manual_label']) & (train_sex_df['pred_label'] != 'Unidentified')]
print(mislabel_df.shape)
mislabel_df

(6, 19)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,text_cat,entities_cat,best_guess_cat,combined_cat,pred_label
8275,img/08275.png,the pimp hand strong enough for man but made f...,hateful,[sex],[inciting_violence],friendship,"[Stock photography, , Getty Images, Photo capt...","[Male, Female, Female]","[White, Latino_Hispanic, Latino_Hispanic]",female,context_medium,"[the, pimp, hand, strong, enough, for, man, bu...",[friendship],"[stock, photography, getty, image, photo, capt...",[male],[female],[],"[male, female]",Mixed
38271,img/38271.png,non-binary genders are real there are more tha...,hateful,[sex],[dehumanizing],grumpy cat,"[Grumpy Cat, Persian cat, Grumpy Cat, Dog, Kit...",[],[],LGBT,text_easy,"[non-binary, gender, are, real, there, are, mo...","[grumpy, cat]","[grumpy, cat, persian, cat, grumpy, cat, dog, ...",[],[female],[],[female],female
52743,img/52743.png,like a hot ls swapped rx-7 girl with a penis,hateful,[sex],[dehumanizing],moteur turbo,"[Car, Engine, Dodge Durango, Engine Oil, Turbo...",[],[],LGBT,text_easy,"[like, a, hot, l, swapped, rx-7, girl, with, a...","[moteur, turbo]","[car, engine, dodge, durango, engine, oil, tur...",[female],[],[],[female],female
58631,img/58631.png,the same people that hate trump think this is ...,hateful,[sex],[dehumanizing],caitlyn jenner net worth,"[Celebrity, Kardashian Family, , Television pr...","[Female, Female]","[White, White]",LGBT,knowledge_hard,"[the, same, people, that, hate, trump, think, ...","[caitlyn, jenner, net, worth]","[celebrity, kardashian, family, television, pr...",[female],[],[],[female],female
90723,img/90723.png,every man needs a decent coffee maker,hateful,[sex],[dehumanizing],Ansmann Aes 1 Zero Watt One,"[Ansmann Aes 1 Zero Watt One, Ansmann időzítő ...","[Female, Female]","[White, White]",female,context_medium,"[every, man, need, a, decent, coffee, maker]","[ansmann, aes, 1, zero, watt, one]","[ansmann, aes, 1, zero, watt, one, ansmann, id...",[male],[],[],[male],male
92148,img/92148.png,"once a year, women go completely crazy for abo...",hateful,[sex],[inferiority],streetball young boy,"[Infant, Parent, Parenting, Creative Care Chil...","[Male, Female, Female]","[Indian, Middle Eastern, Middle Eastern]",female,text_easy,"[once, a, year, ,, woman, go, completely, craz...","[streetball, young, boy]","[infant, parent, parenting, creative, care, ch...",[female],[],[male],"[female, male]",Mixed


In [65]:
unlabel_df = train_sex_df[(train_sex_df['pred_label'] != train_sex_df['manual_label']) & (train_sex_df['pred_label'] == 'Unidentified')]
print(unlabel_df.shape)
unlabel_df.head()

(21, 19)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,text_cat,entities_cat,best_guess_cat,combined_cat,pred_label
10328,img/10328.png,how dikes look when they're in line paying for...,hateful,[sex],[slurs],photo caption,"[Image, Stock photography, Photograph, Getty I...","[Female, Female, Female, Female, Female]","[Latino_Hispanic, White, White, Latino_Hispani...",LGBT,text_easy,"[how, dike, look, when, they, are, in, line, p...","[photo, caption]","[image, stock, photography, photograph, getty,...",[],[],[],[],Unidentified
13620,img/13620.png,when you get shot in the groin by a rubber bul...,hateful,[sex],[inciting_violence],photo caption,"[Photo caption, Font, Dude]","[Male, Male]","[Black, Black]",LGBT,text_easy,"[when, you, get, shot, in, the, groin, by, a, ...","[photo, caption]","[photo, caption, font, dude]",[],[],[],[],Unidentified
17236,img/17236.png,this just in caitlyn jenner claims bruce fondl...,hateful,[sex],[inferiority],poster,"[Poster, Photo caption]","[Female, Male, Male]","[White, White, White]",LGBT,text_easy,"[this, just, in, caitlyn, jenner, claim, bruce...",[poster],"[poster, photo, caption]",[],[],[],[],Unidentified
17326,img/17326.png,what the neighbors see what my father sees,hateful,[sex],[inciting_violence],photograph,"[Stock photography, , Image, , Getty Images, P...","[Female, Female, Female, Female]","[White, White, White, White]",female,context_medium,"[what, the, neighbor, see, what, my, father, see]",[photograph],"[stock, photography, image, getty, image, phot...",[],[],[],[],Unidentified
25847,img/25847.png,they should not be allowed to have kids you ma...,hateful,[sex],"[contempt, exclusion]",mi sono innamorata di una ragazza,"[Friendship, Lesbiaeth, Kiss, Multiracial peop...","[Female, Female, Female]","[White, White, White]",LGBT,context_medium,"[they, should, not, be, allowed, to, have, kid...","[mi, sono, innamorata, di, una, ragazza]","[friendship, lesbiaeth, kiss, multiracial, peo...",[],[],[],[],Unidentified


# 3.2 Sex Analysis - Validation

In [66]:
val_sex_df = hateful_df[hateful_df['gold_pc'].apply(lambda x: 'sex' in x)]
val_sex_df.shape

(56, 14)

In [67]:
val_sex_df.loc[5938]

img                                                              img/05938.png
text                             in the muslim world all gay men are well hung
gold_hate                                                              hateful
gold_pc                                                        [religion, sex]
gold_attack                                  [inciting_violence, dehumanizing]
best_guess_labels                                                photo caption
entities                     [Photograph, Image, Stock photography, iStock,...
gender                                                      [Male, Male, Male]
race                                                     [White, White, White]
manual_label                                                    [muslim, LGBT]
difficulty_level                                                              
text_cleaned                 [in, the, muslim, world, all, gay, men, are, w...
best_guess_labels_cleaned                           

In [68]:
val_sex_df['manual_label'] = val_sex_df.apply(lambda x: get_specific_label(x, 'sex'), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_sex_df['manual_label'] = val_sex_df.apply(lambda x: get_specific_label(x, 'sex'), axis=1)


In [69]:
val_sex_df['manual_label'].value_counts()

female    33
LGBT      18
male       5
Name: manual_label, dtype: int64

In [70]:
val_sex_df.head()

Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned
1726,img/01726.png,a real man loads the dishwasher every night,hateful,[sex],[dehumanizing],خودکشی مجری رادیو مشهد,"[Health, Human sexual activity, Sleep, Reprodu...",[],[],female,,"[a, real, man, load, the, dishwasher, every, n...","[خودکشی, مجری, رادیو, مشهد]","[health, human, sexual, activity, sleep, repro..."
1765,img/01765.png,"after a girl dies, what organ in her body stay...",hateful,"[race, sex]",[inciting_violence],poster,"[Stock photography, , Getty Images, Image, Pho...","[Male, Male]","[Black, Black]",LGBT,context_medium,"[after, a, girl, dy, ,, what, organ, in, her, ...",[poster],"[stock, photography, getty, image, image, phot..."
1796,img/01796.png,life hack #23 how to get stoned with no weed,hateful,"[race, religion, sex]",[inciting_violence],photograph,"[Photograph, Hug, Photo caption, Product, Frie...","[Female, Female, Female, Female, Female]","[Southeast Asian, Southeast Asian, Southeast A...",LGBT,context_difficult,"[life, hack, #, 23, how, to, get, stoned, with...",[photograph],"[photograph, hug, photo, caption, product, fri..."
2634,img/02634.png,and just like that... this sandwich maker doub...,hateful,[sex],[dehumanizing],grass,"[Stock photography, Getty Images, , Photograph...","[Female, Female, Female, Female]","[East Asian, East Asian, East Asian, East Asian]",female,,"[and, just, like, that, , this, sandwich, make...",[grass],"[stock, photography, getty, image, photograph,..."
3197,img/03197.png,i'm no bird expert but i'm guessing it's 4 fem...,hateful,[sex],[dehumanizing],fauna,"[Pacific swallow, Birds, Barn swallow, Stock p...",[],[],female,,"[i, am, no, bird, expert, but, i, am, guessing...",[fauna],"[pacific, swallow, bird, barn, swallow, stock,..."


In [71]:
val_sex_df['text_cat'] = val_sex_df['text_cleaned'].apply(assign_gender)
val_sex_df['entities_cat'] = val_sex_df['entities_cleaned'].apply(assign_gender)
val_sex_df['best_guess_cat'] = val_sex_df['best_guess_labels'].apply(assign_gender)

val_sex_df['combined_cat'] = val_sex_df['text_cat'] + val_sex_df['entities_cat'] + val_sex_df['best_guess_cat']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_sex_df['text_cat'] = val_sex_df['text_cleaned'].apply(assign_gender)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_sex_df['entities_cat'] = val_sex_df['entities_cleaned'].apply(assign_gender)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_sex_df['best_guess_cat'] = val_sex_df['best_gue

In [72]:
val_sex_df['pred_label'] = val_sex_df['combined_cat'].apply(predict_gender)
val_sex_df.drop(columns=['text_cat', 'entities_cat', 'best_guess_cat', 'gender', 'race'], inplace=True)
val_sex_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_sex_df['pred_label'] = val_sex_df['combined_cat'].apply(predict_gender)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,combined_cat,pred_label
1726,img/01726.png,a real man loads the dishwasher every night,hateful,[sex],[dehumanizing],خودکشی مجری رادیو مشهد,"[Health, Human sexual activity, Sleep, Reprodu...",female,,"[a, real, man, load, the, dishwasher, every, n...","[خودکشی, مجری, رادیو, مشهد]","[health, human, sexual, activity, sleep, repro...","[male, female]",Mixed
1765,img/01765.png,"after a girl dies, what organ in her body stay...",hateful,"[race, sex]",[inciting_violence],poster,"[Stock photography, , Getty Images, Image, Pho...",LGBT,context_medium,"[after, a, girl, dy, ,, what, organ, in, her, ...",[poster],"[stock, photography, getty, image, image, phot...",[female],female
1796,img/01796.png,life hack #23 how to get stoned with no weed,hateful,"[race, religion, sex]",[inciting_violence],photograph,"[Photograph, Hug, Photo caption, Product, Frie...",LGBT,context_difficult,"[life, hack, #, 23, how, to, get, stoned, with...",[photograph],"[photograph, hug, photo, caption, product, fri...",[],Unidentified
2634,img/02634.png,and just like that... this sandwich maker doub...,hateful,[sex],[dehumanizing],grass,"[Stock photography, Getty Images, , Photograph...",female,,"[and, just, like, that, , this, sandwich, make...",[grass],"[stock, photography, getty, image, photograph,...",[female],female
3197,img/03197.png,i'm no bird expert but i'm guessing it's 4 fem...,hateful,[sex],[dehumanizing],fauna,"[Pacific swallow, Birds, Barn swallow, Stock p...",female,,"[i, am, no, bird, expert, but, i, am, guessing...",[fauna],"[pacific, swallow, bird, barn, swallow, stock,...","[female, male]",Mixed


In [73]:
print(val_sex_df[val_sex_df['combined_cat'].apply(len) != 0].shape)
print(val_sex_df[val_sex_df['combined_cat'].apply(len) == 0].shape)

(41, 14)
(15, 14)


In [74]:
print(val_sex_df[val_sex_df['pred_label'] == val_sex_df['manual_label']].shape)
print(val_sex_df[val_sex_df['pred_label'] != val_sex_df['manual_label']].shape)

(31, 14)
(25, 14)


In [75]:
mislabel_df = val_sex_df[(val_sex_df['pred_label'] != val_sex_df['manual_label']) & (val_sex_df['pred_label'] != 'Unidentified')]
print(mislabel_df.shape)
mislabel_df

(10, 14)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,combined_cat,pred_label
1726,img/01726.png,a real man loads the dishwasher every night,hateful,[sex],[dehumanizing],خودکشی مجری رادیو مشهد,"[Health, Human sexual activity, Sleep, Reprodu...",female,,"[a, real, man, load, the, dishwasher, every, n...","[خودکشی, مجری, رادیو, مشهد]","[health, human, sexual, activity, sleep, repro...","[male, female]",Mixed
1765,img/01765.png,"after a girl dies, what organ in her body stay...",hateful,"[race, sex]",[inciting_violence],poster,"[Stock photography, , Getty Images, Image, Pho...",LGBT,context_medium,"[after, a, girl, dy, ,, what, organ, in, her, ...",[poster],"[stock, photography, getty, image, image, phot...",[female],female
3197,img/03197.png,i'm no bird expert but i'm guessing it's 4 fem...,hateful,[sex],[dehumanizing],fauna,"[Pacific swallow, Birds, Barn swallow, Stock p...",female,,"[i, am, no, bird, expert, but, i, am, guessing...",[fauna],"[pacific, swallow, bird, barn, swallow, stock,...","[female, male]",Mixed
4769,img/04769.png,if she's pouting like this and you don't immed...,hateful,[sex],[contempt],sitting,"[Product design, , Font, Abdomen, Photo captio...",female,,"[if, she, is, pouting, like, this, and, you, d...",[sitting],"[product, design, font, abdomen, photo, captio...","[female, LGBT]",LGBT
24396,img/24396.png,"so like, why are men even gay? girls have butt...",hateful,[sex],[contempt],beauty,"[Forehead, Face, Lips, Black hair, Photo capti...",male,,"[so, like, ,, why, are, men, even, gay, , girl...",[beauty],"[forehead, face, lip, black, hair, photo, capt...","[male, LGBT, female]",LGBT
27498,img/27498.png,who's gonna make the sandwiches? what do we do...,hateful,[sex],[dehumanizing],photo caption,"[Bride, Photo caption, Meter]",LGBT,,"[who, is, going, to, make, the, sandwich, , wh...","[photo, caption]","[bride, photo, caption, meter]","[female, female]",female
48296,img/48296.png,when you date an asian boy and you trynna get ...,hateful,"[sex, race]",[inferiority],photo caption,"[Mammal, Human behavior, Hug, Human, Happiness...",female,,"[when, you, date, an, asian, boy, and, you, tr...","[photo, caption]","[mammal, human, behavior, hug, human, happines...",[male],male
74906,img/74906.png,are you a boy or a girl i'm non binary,hateful,"[sex, disability]",[inferiority],photograph,"[Photograph, Wheelchair, iStock, Stock photogr...",LGBT,,"[are, you, a, boy, or, a, girl, i, am, non, bi...",[photograph],"[photograph, wheelchair, istock, stock, photog...","[male, female]",Mixed
84510,img/84510.png,bartender: how about a nice 12 yr old? him: bo...,hateful,"[religion, sex]",[dehumanizing],getty images priest,"[Stock photography, stock.xchng, Image, , Gett...",male,,"[bartender, , how, about, a, nice, 12, yr, old...","[getty, image, priest]","[stock, photography, stockxchng, image, getty,...","[male, female]",Mixed
91836,img/91836.png,one of my 5 boyfriends slept with another girl...,hateful,[sex],[inferiority],photo caption,"[Stock photography, Web colors, Color scheme, ...",female,,"[one, of, my, 5, boyfriend, slept, with, anoth...","[photo, caption]","[stock, photography, web, color, color, scheme...","[female, male]",Mixed


In [76]:
unlabel_df = train_sex_df[(train_sex_df['pred_label'] != train_sex_df['manual_label']) & (train_sex_df['pred_label'] == 'Unidentified')]
print(unlabel_df.shape)
unlabel_df

(21, 19)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,text_cat,entities_cat,best_guess_cat,combined_cat,pred_label
10328,img/10328.png,how dikes look when they're in line paying for...,hateful,[sex],[slurs],photo caption,"[Image, Stock photography, Photograph, Getty I...","[Female, Female, Female, Female, Female]","[Latino_Hispanic, White, White, Latino_Hispani...",LGBT,text_easy,"[how, dike, look, when, they, are, in, line, p...","[photo, caption]","[image, stock, photography, photograph, getty,...",[],[],[],[],Unidentified
13620,img/13620.png,when you get shot in the groin by a rubber bul...,hateful,[sex],[inciting_violence],photo caption,"[Photo caption, Font, Dude]","[Male, Male]","[Black, Black]",LGBT,text_easy,"[when, you, get, shot, in, the, groin, by, a, ...","[photo, caption]","[photo, caption, font, dude]",[],[],[],[],Unidentified
17236,img/17236.png,this just in caitlyn jenner claims bruce fondl...,hateful,[sex],[inferiority],poster,"[Poster, Photo caption]","[Female, Male, Male]","[White, White, White]",LGBT,text_easy,"[this, just, in, caitlyn, jenner, claim, bruce...",[poster],"[poster, photo, caption]",[],[],[],[],Unidentified
17326,img/17326.png,what the neighbors see what my father sees,hateful,[sex],[inciting_violence],photograph,"[Stock photography, , Image, , Getty Images, P...","[Female, Female, Female, Female]","[White, White, White, White]",female,context_medium,"[what, the, neighbor, see, what, my, father, see]",[photograph],"[stock, photography, image, getty, image, phot...",[],[],[],[],Unidentified
25847,img/25847.png,they should not be allowed to have kids you ma...,hateful,[sex],"[contempt, exclusion]",mi sono innamorata di una ragazza,"[Friendship, Lesbiaeth, Kiss, Multiracial peop...","[Female, Female, Female]","[White, White, White]",LGBT,context_medium,"[they, should, not, be, allowed, to, have, kid...","[mi, sono, innamorata, di, una, ragazza]","[friendship, lesbiaeth, kiss, multiracial, peo...",[],[],[],[],Unidentified
27489,img/27489.png,how do we destroy the patriarchy? it's simple....,hateful,[sex],[inferiority],smile,"[Stock photography, , Getty Images, Photo capt...",[Female],[White],female,context_medium,"[how, do, we, destroy, the, patriarchy, , it, ...",[smile],"[stock, photography, getty, image, photo, capt...",[],[],[],[],Unidentified
27956,img/27956.png,companies when pride month ends:,hateful,[sex],[dehumanizing],shoulder,"[Stock photography, Anytime Plumbing & Solutio...","[Male, Male, Male]","[White, White, White]",LGBT,context_medium,"[company, when, pride, month, end, ]",[shoulder],"[stock, photography, anytime, plumbing, &, sol...",[],[],[],[],Unidentified
29703,img/29703.png,when each letter is a mental disorder,hateful,[sex],[inferiority],eyewear,"[Product design, Brand, Product, Smiley, Line,...",[],[],LGBT,context_medium,"[when, each, letter, is, a, mental, disorder]",[eyewear],"[product, design, brand, product, smiley, line...",[],[],[],[],Unidentified
30692,img/30692.png,cameraman: i don't understan- psychic kid: tru...,hateful,[sex],[attack_empty],9/11 memorial,"[World Trade Center, 9/11 Memorial & Museum, ,...",[],[],Mislabeled,,"[cameraman, , i, do, not, understan-, psychic,...","[9/11, memorial]","[world, trade, center, 9/11, memorial, &, muse...",[],[],[],[],Unidentified
37628,img/37628.png,when your uber driver arrives but you're proba...,hateful,[sex],[inferiority],driving,"[Car, Driving, , Getty Images, Image, Stock ph...","[Female, Female, Female]","[Southeast Asian, Southeast Asian, Southeast A...",female,context_medium&knowledge_hard,"[when, your, uber, driver, arrives, but, you, ...",[driving],"[car, driving, getty, image, image, stock, pho...",[],[],[],[],Unidentified


# 4. Race Analysis

In [77]:
categories = (
    ("white", ("cracker", )),
    ("black", ("black", "obama", "barack", "gorilla", "mississippi", "nigger", "ape")),
    ("east asia", ("brown", "chinese", "asian")),
    ("middle east", ("israel", "arab", "africa"))
)

special_kws = [
    ("white", "white"),
    ("black", "dark"),
    ("american indian", "red indian"),
    ("american indian", "native american")
]

def assign_race(tokens):
    keywords, labels  = [], []
    for t in tokens:
        for cat, kws in categories:
            if t in kws:
                keywords.append((t, kws))
                labels.append(cat)
    
    sent = " ".join(tokens)
    for cat, special_kw in special_kws:
        if special_kw in sent:
            keywords.append(t)
            labels.append(cat)
    
    return labels

def predict_race(combined_cat):
    from collections import Counter
    
    if len(combined_cat) == 0:
        return "Unidentified"
    
    counter = Counter(combined_cat)
    elements = counter.most_common(2)
    
    if len(elements) == 1 or elements[0][1] > elements[1][1]:
        return elements[0][0]
    
    return "Mixed"

## 4.1 Race Analysis - Train

In [78]:
train_race_df = train_combined_df[train_combined_df['gold_pc'].apply(lambda x: 'race' in x)]
train_race_df['manual_label'] = train_race_df['manual_label'].apply(lambda x: x[0])
train_race_df['manual_label'] = train_race_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)
train_race_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_race_df['manual_label'] = train_race_df['manual_label'].apply(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_race_df['manual_label'] = train_race_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned
2457,img/02457.png,i hate when i see a black out in my neighborho...,hateful,[race],[contempt],kkk cool,"[Ku Klux Klan, Stone Mountain, Loyal White Kni...",[],[],black,context_medium,"[i, hate, when, i, see, a, black, out, in, my,...","[kkk, cool]","[ku, klux, klan, stone, mountain, loyal, white..."
2719,img/02719.png,your black neighbor after you called the cops,hateful,[race],[dehumanizing],bonobo male,"[Bonobo, Primate, Human, New World monkeys, Te...",[],[],black,context_medium,"[your, black, neighbor, after, you, called, th...","[bonobo, male]","[bonobo, primate, human, new, world, monkey, t..."
3128,img/03128.png,lazy day nothing to steal,hateful,[race],[dehumanizing],tribe,"[Omo River, Hamar people, Tribe, , Arbore peop...","[Male, Male, Male]","[Black, Black, Black]",black,context_medium& knowledge_hard,"[lazy, day, nothing, to, steal]",[tribe],"[omo, river, hamar, people, tribe, arbore, peo..."
3591,img/03591.png,joins the democrat party to indulge in deep se...,hateful,[race],[attack_empty],nerd girls,"[Stock photography, Photograph, Getty Images, ...","[Female, Female, Female]","[White, White, White]",black,text_easy&context_medium,"[join, the, democrat, party, to, indulge, in, ...","[nerd, girl]","[stock, photography, photograph, getty, image,..."
4859,img/04859.png,arnold schwarzenigger i'll be black,hateful,[race],[slurs],beard,"[T-Shirt, Beard, Product, Brand, Photo caption...",[Male],[Black],black,easy_text&knowledge_hard,"[arnold, schwarzenigger, i, will, be, black]",[beard],"[t-shirt, beard, product, brand, photo, captio..."


In [79]:
train_race_df['manual_label'].value_counts()

black              62
???                16
white               9
middle east         5
east asia           4
american indian     3
asian               1
Name: manual_label, dtype: int64

In [80]:
train_race_df['text_cat'] = train_race_df['text_cleaned'].apply(assign_race)
train_race_df['entities_cat'] = train_race_df['entities_cleaned'].apply(assign_race)
train_race_df['best_guess_cat'] = train_race_df['best_guess_labels_cleaned'].apply(assign_race)

train_race_df['combined_cat'] = train_race_df['text_cat'] + train_race_df['entities_cat'] + train_race_df['best_guess_cat']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_race_df['text_cat'] = train_race_df['text_cleaned'].apply(assign_race)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_race_df['entities_cat'] = train_race_df['entities_cleaned'].apply(assign_race)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_race_df['best_guess_cat'] = train_rac

In [81]:
train_race_df.head()

Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,text_cat,entities_cat,best_guess_cat,combined_cat
2457,img/02457.png,i hate when i see a black out in my neighborho...,hateful,[race],[contempt],kkk cool,"[Ku Klux Klan, Stone Mountain, Loyal White Kni...",[],[],black,context_medium,"[i, hate, when, i, see, a, black, out, in, my,...","[kkk, cool]","[ku, klux, klan, stone, mountain, loyal, white...",[black],[white],[],"[black, white]"
2719,img/02719.png,your black neighbor after you called the cops,hateful,[race],[dehumanizing],bonobo male,"[Bonobo, Primate, Human, New World monkeys, Te...",[],[],black,context_medium,"[your, black, neighbor, after, you, called, th...","[bonobo, male]","[bonobo, primate, human, new, world, monkey, t...",[black],[],[],[black]
3128,img/03128.png,lazy day nothing to steal,hateful,[race],[dehumanizing],tribe,"[Omo River, Hamar people, Tribe, , Arbore peop...","[Male, Male, Male]","[Black, Black, Black]",black,context_medium& knowledge_hard,"[lazy, day, nothing, to, steal]",[tribe],"[omo, river, hamar, people, tribe, arbore, peo...",[],[],[],[]
3591,img/03591.png,joins the democrat party to indulge in deep se...,hateful,[race],[attack_empty],nerd girls,"[Stock photography, Photograph, Getty Images, ...","[Female, Female, Female]","[White, White, White]",black,text_easy&context_medium,"[join, the, democrat, party, to, indulge, in, ...","[nerd, girl]","[stock, photography, photograph, getty, image,...",[black],[],[],[black]
4859,img/04859.png,arnold schwarzenigger i'll be black,hateful,[race],[slurs],beard,"[T-Shirt, Beard, Product, Brand, Photo caption...",[Male],[Black],black,easy_text&knowledge_hard,"[arnold, schwarzenigger, i, will, be, black]",[beard],"[t-shirt, beard, product, brand, photo, captio...",[black],[],[],[black]


In [82]:
train_race_df['pred_label'] = train_race_df['combined_cat'].apply(predict_gender)
train_race_df.drop(columns=['text_cat', 'entities_cat', 'best_guess_cat'], inplace=True)
train_race_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_race_df['pred_label'] = train_race_df['combined_cat'].apply(predict_gender)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,combined_cat,pred_label
2457,img/02457.png,i hate when i see a black out in my neighborho...,hateful,[race],[contempt],kkk cool,"[Ku Klux Klan, Stone Mountain, Loyal White Kni...",[],[],black,context_medium,"[i, hate, when, i, see, a, black, out, in, my,...","[kkk, cool]","[ku, klux, klan, stone, mountain, loyal, white...","[black, white]",Mixed
2719,img/02719.png,your black neighbor after you called the cops,hateful,[race],[dehumanizing],bonobo male,"[Bonobo, Primate, Human, New World monkeys, Te...",[],[],black,context_medium,"[your, black, neighbor, after, you, called, th...","[bonobo, male]","[bonobo, primate, human, new, world, monkey, t...",[black],black
3128,img/03128.png,lazy day nothing to steal,hateful,[race],[dehumanizing],tribe,"[Omo River, Hamar people, Tribe, , Arbore peop...","[Male, Male, Male]","[Black, Black, Black]",black,context_medium& knowledge_hard,"[lazy, day, nothing, to, steal]",[tribe],"[omo, river, hamar, people, tribe, arbore, peo...",[],Unidentified
3591,img/03591.png,joins the democrat party to indulge in deep se...,hateful,[race],[attack_empty],nerd girls,"[Stock photography, Photograph, Getty Images, ...","[Female, Female, Female]","[White, White, White]",black,text_easy&context_medium,"[join, the, democrat, party, to, indulge, in, ...","[nerd, girl]","[stock, photography, photograph, getty, image,...",[black],black
4859,img/04859.png,arnold schwarzenigger i'll be black,hateful,[race],[slurs],beard,"[T-Shirt, Beard, Product, Brand, Photo caption...",[Male],[Black],black,easy_text&knowledge_hard,"[arnold, schwarzenigger, i, will, be, black]",[beard],"[t-shirt, beard, product, brand, photo, captio...",[black],black


In [83]:
print(train_race_df[(train_race_df['combined_cat'].apply(len) != 0) & (train_race_df['manual_label'] != '???')].shape)
print(train_race_df[(train_race_df['combined_cat'].apply(len) == 0) & (train_race_df['manual_label'] != '???')].shape)

(63, 16)
(21, 16)


In [84]:
print(train_race_df[(train_race_df['pred_label'] == train_race_df['manual_label']) & (train_race_df['manual_label'] != '???')].shape)
print(train_race_df[(train_race_df['pred_label'] != train_race_df['manual_label']) & (train_race_df['manual_label'] != '???')].shape)

(52, 16)
(32, 16)


In [85]:
mislabel_df = train_race_df[(train_race_df['pred_label'] != train_race_df['manual_label']) & (train_race_df['pred_label'] != 'Unidentified') & (train_race_df['manual_label'] != '???')]

print(mislabel_df.shape)
mislabel_df

(11, 16)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,combined_cat,pred_label
2457,img/02457.png,i hate when i see a black out in my neighborho...,hateful,[race],[contempt],kkk cool,"[Ku Klux Klan, Stone Mountain, Loyal White Kni...",[],[],black,context_medium,"[i, hate, when, i, see, a, black, out, in, my,...","[kkk, cool]","[ku, klux, klan, stone, mountain, loyal, white...","[black, white]",Mixed
26735,img/26735.png,you pick cotton to get to it because it's whit...,hateful,[race],[mocking],photo caption,"[Rat, Photo caption, Whiskers, Organism, Meter...",[],[],black,text_easy,"[you, pick, cotton, to, get, to, it, because, ...","[photo, caption]","[rat, photo, caption, whisker, organism, meter...","[black, white]",Mixed
57631,img/57631.png,adding coolant when the farm equipment overheats,hateful,[race],"[dehumanizing, mocking]",human,"[Human, Africa, Human head, Human body, , stoc...","[Male, Male]","[Black, Black]",black,knowledge_hard,"[adding, coolant, when, the, farm, equipment, ...",[human],"[human, africa, human, head, human, body, stoc...",[middle east],middle east
59410,img/59410.png,stop racism black children and white children ...,hateful,[race],[dehumanizing],bambino 1 anno,"[Infant, , Horoscope, Baby Food, Family, Week,...",[Female],[White],black,context_medium,"[stop, racism, black, child, and, white, child...","[bambino, 1, anno]","[infant, horoscope, baby, food, family, week, ...","[black, white]",Mixed
60173,img/60173.png,white people dont shoot eachother in the stree...,hateful,[race],[dehumanizing],School,"[Graduation ceremony, Madeira High School, Sch...","[Female, Male, Female, Male, Male]","[Latino_Hispanic, Black, White, East Asian, Ea...",black,text_easy,"[white, people, do, not, shoot, eachother, in,...",[school],"[graduation, ceremony, madeira, high, school, ...","[black, white]",Mixed
60985,img/60985.png,when you got an itch on your back that you can...,hateful,[race],[mocking],willie lynch,"[The Black Man's Lament; Or, how to Make Sugar...","[Male, Male, Male]","[White, Middle Eastern, Middle Eastern]",black,context_medium,"[when, you, got, an, itch, on, your, back, tha...","[willie, lynch]","[the, black, man, 's, lament, , or, ,, how, to...","[white, black]",Mixed
62931,img/62931.png,i'm sure people who lived in the bible where w...,hateful,[race],[dehumanizing],standing,"[Online quiz, T-Shirt, Human behavior, Centrov...","[Male, Male]","[Black, Black]",black,text_easy,"[i, am, sure, people, who, lived, in, the, bib...",[standing],"[online, quiz, t-shirt, human, behavior, centr...","[black, white]",Mixed
81245,img/81245.png,when you are hanging out with your black frien...,hateful,[race],[slurs],1980's dark alley,"[1990s, 1980s, Stock photography, Photograph, ...",[Male],[White],white,text_easy,"[when, you, are, hanging, out, with, your, bla...","[1980, 's, dark, alley]","[1990s, 1980s, stock, photography, photograph,...","[black, white, black]",black
82163,img/82163.png,please make them white,hateful,[race],[attack_empty],fur,"[Erotica, Literature, Blond, , Fur'M]","[Female, Female, Female, Female, Female, Femal...","[White, White, White, White, White, White, White]",black,text_easy,"[please, make, them, white]",[fur],"[erotica, literature, blond, fur, 'm]",[white],white
96354,img/96354.png,why do asians have eyes like this? isn't it ha...,hateful,[race],[inferiority],dogecoin up,"[Dogecoin, Cryptocurrency, Bitcoin, Shiba Inu ...",[],[],asian,context_medium,"[why, do, asian, have, eye, like, this, , is, ...","[dogecoin, up]","[dogecoin, cryptocurrency, bitcoin, shiba, inu...",[east asia],east asia


In [86]:
unlabel_df = train_race_df[(train_race_df['pred_label'] != train_race_df['manual_label']) & (train_race_df['pred_label'] == 'Unidentified') & (train_race_df['manual_label'] != '???')]

print(unlabel_df.shape)
unlabel_df

(21, 16)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,combined_cat,pred_label
3128,img/03128.png,lazy day nothing to steal,hateful,[race],[dehumanizing],tribe,"[Omo River, Hamar people, Tribe, , Arbore peop...","[Male, Male, Male]","[Black, Black, Black]",black,context_medium& knowledge_hard,"[lazy, day, nothing, to, steal]",[tribe],"[omo, river, hamar, people, tribe, arbore, peo...",[],Unidentified
7236,img/07236.png,"when you realize that ""ginger"" is just the n-w...",hateful,[race],[slurs],femme jeune rousse,"[Stock photography, Portrait, Image, Illustrat...",[Female],[White],black,knowledge_hard,"[when, you, realize, that, ginger, is, just, t...","[femme, jeune, rousse]","[stock, photography, portrait, image, illustra...",[],Unidentified
13875,img/13875.png,i see a cat let's get it for lunch,hateful,[race],[inferiority],official,"[Kim Jong-il, North Korea, Supreme Leader of N...","[Male, Male, Male, Male, Male, Male, Male]","[Southeast Asian, Southeast Asian, Southeast A...",east asia,knowledge_hard,"[i, see, a, cat, let, u, get, it, for, lunch]",[official],"[kim, jong-il, north, korea, supreme, leader, ...",[],Unidentified
20738,img/20738.png,if being racist means i don't want my country ...,hateful,[race],[contempt],firearm,"[Keanu Reeves, Gun, , , John Wick: Chapter 2, ...","[Male, Male]","[White, White]",middle east,,"[if, being, racist, mean, i, do, not, want, my...",[firearm],"[keanu, reef, gun, john, wick, , chapter, 2, m...",[],Unidentified
21674,img/21674.png,hey bro i'm not racist bro but ever noticed po...,hateful,[race],"[dehumanizing, inferiority]",cool dog,"[Siberian Husky, Puppy, Husky, Sled dog, iStoc...",[],[],black,text_image,"[hey, bro, i, am, not, racist, bro, but, ever,...","[cool, dog]","[siberian, husky, puppy, husky, sled, dog, ist...",[],Unidentified
23910,img/23910.png,baby look like he pay his own child support,hateful,[race],[inferiority],head,"[Name, Infant, Youth, , Middle name, Nickname,...","[Male, Male]","[Black, Black]",black,context_medium&knowledge_hard,"[baby, look, like, he, pay, his, own, child, s...",[head],"[name, infant, youth, middle, name, nickname, ...",[],Unidentified
24783,img/24783.png,when you go out to fight for independence but ...,hateful,[race],[mocking],battle,"[Infantry, Battle, Military organization, Troo...",[],[],black,context_medium& knowledge_hard,"[when, you, go, out, to, fight, for, independe...",[battle],"[infantry, battle, military, organization, tro...",[],Unidentified
25984,img/25984.png,whipepo reading the comments like,hateful,[race],[dehumanizing],macaque,"[Macaques, Organism, Fauna, Photo caption, Snout]",[],[],white,text_easy,"[whipepo, reading, the, comment, like]",[macaque],"[macaque, organism, fauna, photo, caption, snout]",[],Unidentified
28735,img/28735.png,morgan freeman morgan not-so-freeman,hateful,[race],[mocking],Slavery,"[Slavery, Abolitionism, History of slavery, Hi...",[Male],[Black],black,knowledge_hard,"[morgan, freeman, morgan, not-so-freeman]",[slavery],"[slavery, abolitionism, history, of, slavery, ...",[],Unidentified
34872,img/34872.png,i bet god is one of us because he didn't even ...,hateful,[race],[inferiority],photo caption,"[Facial hair, Photo caption, Gentleman, Dude, ...",[Male],[Black],black,context_medium,"[i, bet, god, is, one, of, u, because, he, did...","[photo, caption]","[facial, hair, photo, caption, gentleman, dude...",[],Unidentified


## 4.2 Race Analysis - Test

In [87]:
val_race_df = hateful_df[hateful_df['gold_pc'].apply(lambda x: 'race' in x)]
val_race_df.shape

(78, 14)

In [88]:
val_race_df['manual_label'] = val_race_df.apply(lambda x: get_specific_label(x, 'race'), axis=1)
val_race_df['manual_label'] = val_race_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)
val_race_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_race_df['manual_label'] = val_race_df.apply(lambda x: get_specific_label(x, 'race'), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_race_df['manual_label'] = val_race_df['manual_label'].apply(lambda x: x[0: x.index("(")].strip() if "(" in x else x)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned
1765,img/01765.png,"after a girl dies, what organ in her body stay...",hateful,"[race, sex]",[inciting_violence],poster,"[Stock photography, , Getty Images, Image, Pho...","[Male, Male]","[Black, Black]",black,context_medium,"[after, a, girl, dy, ,, what, organ, in, her, ...",[poster],"[stock, photography, getty, image, image, phot..."
1796,img/01796.png,life hack #23 how to get stoned with no weed,hateful,"[race, religion, sex]",[inciting_violence],photograph,"[Photograph, Hug, Photo caption, Product, Frie...","[Female, Female, Female, Female, Female]","[Southeast Asian, Southeast Asian, Southeast A...",middle east,context_difficult,"[life, hack, #, 23, how, to, get, stoned, with...",[photograph],"[photograph, hug, photo, caption, product, fri..."
1925,img/01925.png,i am not racist i just don't like brown sugar,hateful,[race],[contempt],Scholarship,[Amelia Lost: The Life and Disappearance of Am...,"[Female, Female, Female]","[Indian, Indian, Indian]",middle east,text_easy,"[i, am, not, racist, i, just, do, not, like, b...",[scholarship],"[amelia, lost, , the, life, and, disappearance..."
2478,img/02478.png,must..resist...urge to.. kill whitey.,hateful,[race],[inciting_violence],official,"[Marcus Antonius, Public speaking, Public Rela...","[Male, Male, Male, Male]","[White, Latino_Hispanic, Latino_Hispanic, Lati...",black,context_medium,"[mustresist, , urge, to, kill, whitey, ]",[official],"[marcus, antonius, public, speaking, public, r..."
2568,img/02568.png,whatsthe difference between heroin and i? cops...,hateful,[race],"[inciting_violence, dehumanizing]",man middle finger,"[Stock photography, Middle finger, Image, , Ph...",[Male],[Black],black,context_medium,"[whatsthe, difference, between, heroin, and, i...","[man, middle, finger]","[stock, photography, middle, finger, image, ph..."


In [89]:
val_race_df['manual_label'].value_counts()

black              42
white              11
middle east         9
???                 8
hispanic/latino     4
white or black      1
german              1
asia                1
east asian          1
Name: manual_label, dtype: int64

In [90]:
val_race_df['text_cat'] = val_race_df['text_cleaned'].apply(assign_race)
val_race_df['entities_cat'] = val_race_df['entities_cleaned'].apply(assign_race)
val_race_df['best_guess_cat'] = val_race_df['best_guess_labels_cleaned'].apply(assign_race)

val_race_df['combined_cat'] = val_race_df['text_cat'] + val_race_df['entities_cat'] + val_race_df['best_guess_cat']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_race_df['text_cat'] = val_race_df['text_cleaned'].apply(assign_race)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_race_df['entities_cat'] = val_race_df['entities_cleaned'].apply(assign_race)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_race_df['best_guess_cat'] = val_race_df['best_g

In [91]:
val_race_df['pred_label'] = val_race_df['combined_cat'].apply(predict_race)
# val_race_df.drop(columns=['text_cat', 'entities_cat', 'best_guess_cat', 'gender', 'race'], inplace=True)
val_race_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_race_df['pred_label'] = val_race_df['combined_cat'].apply(predict_race)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,text_cat,entities_cat,best_guess_cat,combined_cat,pred_label
1765,img/01765.png,"after a girl dies, what organ in her body stay...",hateful,"[race, sex]",[inciting_violence],poster,"[Stock photography, , Getty Images, Image, Pho...","[Male, Male]","[Black, Black]",black,context_medium,"[after, a, girl, dy, ,, what, organ, in, her, ...",[poster],"[stock, photography, getty, image, image, phot...",[],[],[],[],Unidentified
1796,img/01796.png,life hack #23 how to get stoned with no weed,hateful,"[race, religion, sex]",[inciting_violence],photograph,"[Photograph, Hug, Photo caption, Product, Frie...","[Female, Female, Female, Female, Female]","[Southeast Asian, Southeast Asian, Southeast A...",middle east,context_difficult,"[life, hack, #, 23, how, to, get, stoned, with...",[photograph],"[photograph, hug, photo, caption, product, fri...",[],[],[],[],Unidentified
1925,img/01925.png,i am not racist i just don't like brown sugar,hateful,[race],[contempt],Scholarship,[Amelia Lost: The Life and Disappearance of Am...,"[Female, Female, Female]","[Indian, Indian, Indian]",middle east,text_easy,"[i, am, not, racist, i, just, do, not, like, b...",[scholarship],"[amelia, lost, , the, life, and, disappearance...",[east asia],[],[],[east asia],east asia
2478,img/02478.png,must..resist...urge to.. kill whitey.,hateful,[race],[inciting_violence],official,"[Marcus Antonius, Public speaking, Public Rela...","[Male, Male, Male, Male]","[White, Latino_Hispanic, Latino_Hispanic, Lati...",black,context_medium,"[mustresist, , urge, to, kill, whitey, ]",[official],"[marcus, antonius, public, speaking, public, r...",[white],[],[],[white],white
2568,img/02568.png,whatsthe difference between heroin and i? cops...,hateful,[race],"[inciting_violence, dehumanizing]",man middle finger,"[Stock photography, Middle finger, Image, , Ph...",[Male],[Black],black,context_medium,"[whatsthe, difference, between, heroin, and, i...","[man, middle, finger]","[stock, photography, middle, finger, image, ph...",[],[],[],[],Unidentified


In [92]:
print(val_race_df[(val_race_df['combined_cat'].apply(len) != 0) & (val_race_df['manual_label'] != '???')].shape)
print(val_race_df[(val_race_df['combined_cat'].apply(len) == 0) & (val_race_df['manual_label'] != '???')].shape)

(35, 19)
(35, 19)


In [93]:
print(val_race_df[(val_race_df['pred_label'] == val_race_df['manual_label']) & (val_race_df['manual_label'] != '???')].shape)
print(val_race_df[(val_race_df['pred_label'] != val_race_df['manual_label']) & (val_race_df['manual_label'] != '???')].shape)

(24, 19)
(46, 19)


In [94]:
mislabel_df = val_race_df[(val_race_df['pred_label'] != val_race_df['manual_label']) & (val_race_df['pred_label'] != 'Unidentified') & (val_race_df['manual_label'] != '???')]
print(mislabel_df.shape)
mislabel_df

(11, 19)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,text_cat,entities_cat,best_guess_cat,combined_cat,pred_label
1925,img/01925.png,i am not racist i just don't like brown sugar,hateful,[race],[contempt],Scholarship,[Amelia Lost: The Life and Disappearance of Am...,"[Female, Female, Female]","[Indian, Indian, Indian]",middle east,text_easy,"[i, am, not, racist, i, just, do, not, like, b...",[scholarship],"[amelia, lost, , the, life, and, disappearance...",[east asia],[],[],[east asia],east asia
2478,img/02478.png,must..resist...urge to.. kill whitey.,hateful,[race],[inciting_violence],official,"[Marcus Antonius, Public speaking, Public Rela...","[Male, Male, Male, Male]","[White, Latino_Hispanic, Latino_Hispanic, Lati...",black,context_medium,"[mustresist, , urge, to, kill, whitey, ]",[official],"[marcus, antonius, public, speaking, public, r...",[white],[],[],[white],white
3745,img/03745.png,your order comes to $37.50 and your white priv...,hateful,[race],[inferiority],desk,"[Furniture, Desk, Media, electronic machine, E...","[Female, Female, Female]","[White, White, White]",black,text_easy,"[your, order, come, to, $, 3750, and, your, wh...",[desk],"[furniture, desk, medium, electronic, machine,...",[white],[],[],[white],white
8291,img/08291.png,white people is this a shooting range,hateful,[race],[dehumanizing],transgender kids,"[Jean Piaget, , Family, Infant, Object permane...","[Male, Male, Male]","[Black, Black, Black]",black,context_medium,"[white, people, is, this, a, shooting, range]","[transgender, kid]","[jean, piaget, family, infant, object, permane...",[white],[],[],[white],white
19508,img/19508.png,the definition of utter disgust in plain black...,hateful,[race],[contempt],man,"[Black and white, Thorax, Romance, Photo capti...","[Female, Female]","[Middle Eastern, Middle Eastern]",white or black,context_medium,"[the, definition, of, utter, disgust, in, plai...",[man],"[black, and, white, thorax, romance, photo, ca...","[black, white]","[black, white]",[],"[black, white, black, white]",Mixed
28190,img/28190.png,when finishing a race is your passion,hateful,"[religion, race]",[mocking],rome berlin axis 1936,"[Benito Mussolini, World War II, The dark vall...","[Male, Male]","[White, Middle Eastern]",middle east,context_medium,"[when, finishing, a, race, is, your, passion]","[rome, berlin, axis, 1936]","[benito, mussolini, world, war, ii, the, dark,...",[],[black],[],[black],black
48296,img/48296.png,when you date an asian boy and you trynna get ...,hateful,"[sex, race]",[inferiority],photo caption,"[Mammal, Human behavior, Hug, Human, Happiness...","[Female, Female, Female]","[Black, Black, Black]",asia,,"[when, you, date, an, asian, boy, and, you, tr...","[photo, caption]","[mammal, human, behavior, hug, human, happines...",[east asia],[],[],[east asia],east asia
49826,img/49826.png,white people on cinco de mayo vs. every other day,hateful,[race],[inferiority],art,"[Stock photography, , Getty Images, Image, Pho...","[Male, Male]","[White, White]",hispanic/latino,knowledge_hard,"[white, people, on, cinco, de, mayo, vs, every...",[art],"[stock, photography, getty, image, image, phot...",[white],[],[],[white],white
58672,img/58672.png,did you know that the chinese eat dogs savages...,hateful,"[nationality, race]","[dehumanizing, contempt]",dish,"[Charles, Prince of Wales, Camilla, Duchess of...","[Male, Female, Male, Male, Male, Male, Female]","[White, White, White, Middle Eastern, White, I...",east asian,text_easy,"[did, you, know, that, the, chinese, eat, dog,...",[dish],"[charles, ,, prince, of, wale, camilla, ,, duc...",[east asia],[],[],[east asia],east asia
73601,img/73601.png,when you are white and stole your first bike,hateful,[race],[dehumanizing],hand,"[Hand, Stock photography, Image, stock.xchng, ...",[],[],black,context_medium,"[when, you, are, white, and, stole, your, firs...",[hand],"[hand, stock, photography, image, stockxchng, ...",[white],"[black, white]",[],"[white, black, white]",white


In [95]:
unlabel_df = val_race_df[(val_race_df['pred_label'] != val_race_df['manual_label']) & (val_race_df['pred_label'] == 'Unidentified') & (val_race_df['manual_label'] != '???')]
print(unlabel_df.shape)
unlabel_df

(35, 19)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,text_cat,entities_cat,best_guess_cat,combined_cat,pred_label
1765,img/01765.png,"after a girl dies, what organ in her body stay...",hateful,"[race, sex]",[inciting_violence],poster,"[Stock photography, , Getty Images, Image, Pho...","[Male, Male]","[Black, Black]",black,context_medium,"[after, a, girl, dy, ,, what, organ, in, her, ...",[poster],"[stock, photography, getty, image, image, phot...",[],[],[],[],Unidentified
1796,img/01796.png,life hack #23 how to get stoned with no weed,hateful,"[race, religion, sex]",[inciting_violence],photograph,"[Photograph, Hug, Photo caption, Product, Frie...","[Female, Female, Female, Female, Female]","[Southeast Asian, Southeast Asian, Southeast A...",middle east,context_difficult,"[life, hack, #, 23, how, to, get, stoned, with...",[photograph],"[photograph, hug, photo, caption, product, fri...",[],[],[],[],Unidentified
2568,img/02568.png,whatsthe difference between heroin and i? cops...,hateful,[race],"[inciting_violence, dehumanizing]",man middle finger,"[Stock photography, Middle finger, Image, , Ph...",[Male],[Black],black,context_medium,"[whatsthe, difference, between, heroin, and, i...","[man, middle, finger]","[stock, photography, middle, finger, image, ph...",[],[],[],[],Unidentified
2783,img/02783.png,a picture of my great great great grand grandf...,hateful,[race],"[dehumanizing, mocking]",tree,"[Illustration, , Clip art, , Getty Images, Ama...","[Male, Male, Male]","[Indian, Indian, Indian]",black,context_medium,"[a, picture, of, my, great, great, great, gran...",[tree],"[illustration, clip, art, getty, image, amazon...",[],[],[],[],Unidentified
7429,img/07429.png,i'm about to have my 5th abortion fuck you bat...,hateful,[race],[dehumanizing],pregnant stomach,"[Pregnancy, Abdomen, Health, Stomach, Diastasi...","[Female, Female]","[Latino_Hispanic, Latino_Hispanic]",black,context_medium,"[i, am, about, to, have, my, 5th, abortion, fu...","[pregnant, stomach]","[pregnancy, abdomen, health, stomach, diastasi...",[],[],[],[],Unidentified
10362,img/10362.png,german british danish austrian diversity. appa...,hateful,[race],[inferiority],friendship,"[, Indonesia, , Photograph, Stock photography,...","[Female, Female, Female, Male, Female, Female,...","[East Asian, Indian, Middle Eastern, White, Bl...",middle east,text_easy&context_medium,"[german, british, danish, austrian, diversity,...",[friendship],"[indonesia, photograph, stock, photography, ge...",[],[],[],[],Unidentified
10785,img/10785.png,"time to put the ""panic"" back in hispanic",hateful,[race],[inciting_violence],trump gun,"[Gun, Mass shooting, President of the United S...","[Male, Male]","[White, White]",hispanic/latino,context_medum,"[time, to, put, the, panic, back, in, hispanic]","[trump, gun]","[gun, mass, shooting, president, of, the, unit...",[],[],[],[],Unidentified
16354,img/16354.png,we've taken this country! now fuck off! we're ...,hateful,"[religion, race]","[inciting_violence, contempt]",beard,"[Beard, Facial hair, Poster, Dude, Photo capti...","[Male, Male, Male, Male]","[White, Middle Eastern, Middle Eastern, Black]",middle east,context_medium,"[we, have, taken, this, country, , now, fuck, ...",[beard],"[beard, facial, hair, poster, dude, photo, cap...",[],[],[],[],Unidentified
17682,img/17682.png,give them what they deserve kill them all,hateful,[race],[inciting_violence],player,"[, , Sports equipment, Ball, Activewear, Playe...","[Male, Male]","[Middle Eastern, Middle Eastern]",black,context_medium,"[give, them, what, they, deserve, kill, them, ...",[player],"[sport, equipment, ball, activewear, player, p...",[],[],[],[],Unidentified
18547,img/18547.png,1st day of 4th grade sandy hook elementary school,hateful,[race],[dehumanizing],gun,"[Gun, Gun, , Getty Images, Image, Stock photog...","[Male, Male]","[White, White]",white,knowledge_hard,"[1st, day, of, 4th, grade, sandy, hook, elemen...",[gun],"[gun, gun, getty, image, image, stock, photogr...",[],[],[],[],Unidentified


# 5. Religion Analysis

In [112]:
categories = (
    ("muslim", ("muslim", "islamic", "islam", "moslem", "hijab", "jihad", "burqa", "abdul")),
    ("jews", ("jew", "nazi", "hitler")),
    ("catholic christian", ("catholic", )),
    ("christian", ("undefinedtobeconfirmed",))
)

special_kws = [
    ("catholic christian", "pedo"),
    ("jews", "jew"),
    ("muslim", "moham")
]

def assign_religion(tokens):
    keywords, labels  = [], []
    for t in tokens:
        for cat, kws in categories:
            if t in kws:
                keywords.append((t, kws))
                labels.append(cat)
    
    sent = " ".join(tokens)
    for cat, special_kw in special_kws:
        if special_kw in sent:
            keywords.append(t)
            labels.append(cat)
    
    return labels

def predict_religion(combined_cat):
    from collections import Counter
    
    if len(combined_cat) == 0:
        return "Unidentified"
    
    counter = Counter(combined_cat)
    elements = counter.most_common(2)
    
    if len(elements) == 1 or elements[0][1] > elements[1][1]:
        return elements[0][0]
    
    return "Mixed"

In [113]:
train_religion_df['text_cat'] = train_religion_df['text_cleaned'].apply(assign_religion)
train_religion_df['entities_cat'] = train_religion_df['entities_cleaned'].apply(assign_religion)
train_religion_df['best_guess_cat'] = train_religion_df['best_guess_labels_cleaned'].apply(assign_religion)

train_religion_df['combined_cat'] = train_religion_df['text_cat'] + train_religion_df['entities_cat'] + train_religion_df['best_guess_cat']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_religion_df['text_cat'] = train_religion_df['text_cleaned'].apply(assign_religion)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_religion_df['entities_cat'] = train_religion_df['entities_cleaned'].apply(assign_religion)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_religion_df['

In [114]:
train_race_df.head()

Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,combined_cat,pred_label
2457,img/02457.png,i hate when i see a black out in my neighborho...,hateful,[race],[contempt],kkk cool,"[Ku Klux Klan, Stone Mountain, Loyal White Kni...",[],[],black,context_medium,"[i, hate, when, i, see, a, black, out, in, my,...","[kkk, cool]","[ku, klux, klan, stone, mountain, loyal, white...","[black, white]",Mixed
2719,img/02719.png,your black neighbor after you called the cops,hateful,[race],[dehumanizing],bonobo male,"[Bonobo, Primate, Human, New World monkeys, Te...",[],[],black,context_medium,"[your, black, neighbor, after, you, called, th...","[bonobo, male]","[bonobo, primate, human, new, world, monkey, t...",[black],black
3128,img/03128.png,lazy day nothing to steal,hateful,[race],[dehumanizing],tribe,"[Omo River, Hamar people, Tribe, , Arbore peop...","[Male, Male, Male]","[Black, Black, Black]",black,context_medium& knowledge_hard,"[lazy, day, nothing, to, steal]",[tribe],"[omo, river, hamar, people, tribe, arbore, peo...",[],Unidentified
3591,img/03591.png,joins the democrat party to indulge in deep se...,hateful,[race],[attack_empty],nerd girls,"[Stock photography, Photograph, Getty Images, ...","[Female, Female, Female]","[White, White, White]",black,text_easy&context_medium,"[join, the, democrat, party, to, indulge, in, ...","[nerd, girl]","[stock, photography, photograph, getty, image,...",[black],black
4859,img/04859.png,arnold schwarzenigger i'll be black,hateful,[race],[slurs],beard,"[T-Shirt, Beard, Product, Brand, Photo caption...",[Male],[Black],black,easy_text&knowledge_hard,"[arnold, schwarzenigger, i, will, be, black]",[beard],"[t-shirt, beard, product, brand, photo, captio...",[black],black


In [115]:
train_religion_df['pred_label'] = train_religion_df['combined_cat'].apply(predict_religion)
train_religion_df.drop(columns=['text_cat', 'entities_cat', 'best_guess_cat'], inplace=True)
train_religion_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_religion_df['pred_label'] = train_religion_df['combined_cat'].apply(predict_religion)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,combined_cat,pred_label
1497,img/01497.png,now back to the married islamic classic.. to c...,hateful,[religion],[dehumanizing],photo caption,"[, Nangalam, Getty Images, Image, Photo captio...","[Male, Male, Male]","[Indian, White, Indian]",muslim,,"[now, back, to, the, married, islamic, classic...","[photo, caption]","[nangalam, getty, image, image, photo, caption...",[muslim],muslim
1579,img/01579.png,"muslim figure: ""we must have pork-free menus o...",hateful,[religion],[exclusion],با حجاب در آمریکا,"[United States, , Islam in the United States, ...","[Female, Male, Female, Male, Male, Male, Female]","[Latino_Hispanic, Black, White, Black, Black, ...",muslim,,"[muslim, figure, , we, must, have, pork-free, ...","[با, حجاب, در, آمریکا]","[united, state, islam, in, the, united, state,...","[muslim, muslim, muslim, muslim, muslim]",muslim
2653,img/02653.png,england where moslems can rape your children a...,hateful,[religion],[dehumanizing],christmas in london,"[London, Christmas Day, , King's College Chape...",[],[],muslim,,"[england, where, moslem, can, rape, your, chil...","[christmas, in, london]","[london, christmas, day, king, 's, college, ch...",[muslim],muslim
4361,img/04361.png,islam turning every single country it touches ...,hateful,[religion],[contempt],summit camp,"[Graphic design, Black and white, Poster, Font...",[],[],muslim,,"[islam, turning, every, single, country, it, t...","[summit, camp]","[graphic, design, black, and, white, poster, f...",[muslim],muslim
6541,img/06541.png,"hey i just met you and this is crazy, but here...",hateful,[religion],[mocking],gentleman,"[Black and white, Monochrome, M, Photo caption...","[Male, Male, Male]","[White, White, White]",jews,,"[hey, i, just, met, you, and, this, is, crazy,...",[gentleman],"[black, and, white, monochrome, m, photo, capt...",[],Unidentified


In [116]:
print(train_religion_df[(train_religion_df['combined_cat'].apply(len) != 0) & (train_religion_df['manual_label'] != '???')].shape)
print(train_religion_df[(train_religion_df['combined_cat'].apply(len) == 0) & (train_religion_df['manual_label'] != '???')].shape)

(73, 16)
(25, 16)


In [117]:
print(train_religion_df[(train_religion_df['pred_label'] == train_religion_df['manual_label']) & (train_religion_df['manual_label'] != '???')].shape)
print(train_religion_df[(train_religion_df['pred_label'] != train_religion_df['manual_label']) & (train_religion_df['manual_label'] != '???')].shape)

(70, 16)
(28, 16)


In [118]:
mislabel_df = train_religion_df[(train_religion_df['pred_label'] != train_religion_df['manual_label']) & (train_religion_df['pred_label'] != 'Unidentified') & (train_religion_df['manual_label'] != '???')]

print(mislabel_df.shape)
mislabel_df

(3, 16)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,combined_cat,pred_label
10492,img/10492.png,"""judgement day will not come until the muslims...",hateful,[religion],[inciting_violence],glasses,"[Glasses, Photo caption, Stock photography, Ph...",[Male],[White],muslim or jews,,"[judgement, day, will, not, come, until, the, ...",[glass],"[glass, photo, caption, stock, photography, ph...","[muslim, jews, jews]",jews
34751,img/34751.png,muslims: screaming about their religion being ...,hateful,[religion],[inciting_violence],shoulder,"[Forehead, Hair coloring, Photo caption, Font,...","[Female, Female, Female]","[Latino_Hispanic, Latino_Hispanic, Latino_Hisp...",muslim,,"[muslim, , screaming, about, their, religion, ...",[shoulder],"[forehead, hair, coloring, photo, caption, fon...","[muslim, jews, jews]",jews
38246,img/38246.png,islamic extremists try to exterminate judaism ...,hateful,[religion],[mocking],heath ledger joker,"[Heath Ledger, The Dark Knight, Joker, The Jok...","[Female, Female]","[Southeast Asian, Southeast Asian]",muslim,,"[islamic, extremist, try, to, exterminate, jud...","[heath, ledger, joker]","[heath, ledger, the, dark, knight, joker, the,...","[muslim, jews]",Mixed


In [119]:
unlabel_df = train_religion_df[(train_religion_df['pred_label'] != train_religion_df['manual_label']) & (train_religion_df['pred_label'] == 'Unidentified') & (train_religion_df['manual_label'] != '???')]

print(unlabel_df.shape)
unlabel_df

(25, 16)


Unnamed: 0,img,text,gold_hate,gold_pc,gold_attack,best_guess_labels,entities,gender,race,manual_label,difficulty_level,text_cleaned,best_guess_labels_cleaned,entities_cleaned,combined_cat,pred_label
6541,img/06541.png,"hey i just met you and this is crazy, but here...",hateful,[religion],[mocking],gentleman,"[Black and white, Monochrome, M, Photo caption...","[Male, Male, Male]","[White, White, White]",jews,,"[hey, i, just, met, you, and, this, is, crazy,...",[gentleman],"[black, and, white, monochrome, m, photo, capt...",[],Unidentified
8534,img/08534.png,if you don't believe in god you're going to su...,hateful,[religion],[inferiority],estranged parents,"[Parent, Family estrangement, Daughter, , Fami...","[Female, Male, Male, Male]","[East Asian, White, White, White]",christianity,easy&hard,"[if, you, do, not, believe, in, god, you, are,...","[estranged, parent]","[parent, family, estrangement, daughter, famil...",[],Unidentified
13249,img/13249.png,"here's a little song i call ""the talmudists ar...",hateful,[religion],[dehumanizing],kermit the frog,"[Kermit the Frog, Miss Piggy, The Muppets, Rai...",[],[],jews,,"[here, is, a, little, song, i, call, the, talm...","[kermit, the, frog]","[kermit, the, frog, miss, piggy, the, muppets,...",[],Unidentified
14087,img/14087.png,so i bent him over like this and i made sure h...,hateful,[religion],[dehumanizing],person,[Photo caption],[Male],[White],catholic christian,,"[so, i, bent, him, over, like, this, and, i, m...",[person],"[photo, caption]",[],Unidentified
15479,img/15479.png,when you're in heaven and you got your 72 virg...,hateful,[religion],[dehumanizing],photo caption,"[Facial hair, Photo caption, Product, Dude, Ha...","[Male, Male, Male, Male, Male, Male]","[Latino_Hispanic, White, Latino_Hispanic, Whit...",muslim,,"[when, you, are, in, heaven, and, you, got, yo...","[photo, caption]","[facial, hair, photo, caption, product, dude, ...",[],Unidentified
23054,img/23054.png,911 was personally difficult for me i lost 19 ...,hateful,[religion],[dehumanizing],girl,"[Photo caption, Student]","[Female, Female, Female, Female]","[Middle Eastern, Indian, Indian, Indian]",muslim,,"[911, wa, personally, difficult, for, me, i, l...",[girl],"[photo, caption, student]",[],Unidentified
25489,img/25489.png,brother... a day without a blast is a day wasted,hateful,[religion],[dehumanizing],photo caption,"[Moustache M, Photo caption, Product]",[Male],[Middle Eastern],muslim,,"[brother, , a, day, without, a, blast, is, a, ...","[photo, caption]","[moustache, m, photo, caption, product]",[],Unidentified
35684,img/35684.png,when you tell him he can't marry a 6 years old...,hateful,[religion],[dehumanizing],photo caption,"[Human behavior, Beard, Photo caption, Human, ...",[Male],[Middle Eastern],muslim,,"[when, you, tell, him, he, can, not, marry, a,...","[photo, caption]","[human, behavior, beard, photo, caption, human...",[],Unidentified
41092,img/41092.png,ana frank at 14 years old ana frank at 15 year...,hateful,[religion],[mocking],Soap,"[Bar Soap, Soap, Selah Vie Soapery, Oil, Essen...","[Female, Female]","[White, White]",jews,,"[ana, frank, at, 14, year, old, ana, frank, at...",[soap],"[bar, soap, soap, selah, vie, soapery, oil, es...",[],Unidentified
41270,img/41270.png,adult diapers for complete shitheads,hateful,[religion],[inferiority],photo caption,"[Photo caption, Headgear, Font, Capital asset ...","[Female, Female, Female]","[Black, Black, Black]",muslim,,"[adult, diaper, for, complete, shithead]","[photo, caption]","[photo, caption, headgear, font, capital, asse...",[],Unidentified
