In [1]:
import time
import os
import pandas as pd
import re
import numpy as np
import nltk
import string
from nltk import word_tokenize
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import csv
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns',50)

In [2]:
os.getcwd()

'/Users/shreyavinaynayak/DSCI531_Project'

In [3]:
# taken from Appendix of Paper: Evidence That Gendered Wording in Job Advertisements Exists and Sustains Gender Inequality
text = """
Masculine words Feminine words
Active Affectionate
Adventurous Child
Aggress Cheer
Ambitio Commit
Analy Communal
Assert Compassion
Athlet Connect
Autonom Considerate
Boast Cooperat
Challeng Depend
Compet Emotiona
Confident Empath
Courag Feminine
Decide Flatterable
Decisive Gentle
Decision Honest
Determin Interpersonal
Dominant Interdependen
Domina Interpersona
Force Kind
Greedy Kinship
Headstrong Loyal
Hierarch Modesty
Hostil Nag
Implusive Nurtur
Independen Pleasant
Individual Polite
Intellect Quiet
Lead Respon
Logic Sensitiv
Masculine Submissive
Objective Support
Opinion Sympath
Outspoken Tender
Persist Together
Principle Trust
Reckless Understand
Stubborn Warm
Superior Whin
Self-confiden Yield
Self-sufficien
Self-relian
"""

In [4]:
male_words = []
female_words = []
for t in text.split('\n')[2:]:
    if t!= '':
        if len(t.split(' '))>1:
            m_w,f_w = t.split(' ')
            male_words.append(m_w.replace('\x01',''))
            female_words.append(f_w.replace('\x01',''))
        else:
            m_w = t.split(' ')[0].replace('\x01','')
            male_words.append(m_w) 

In [5]:
male_words[:10]

['Active',
 'Adventurous',
 'Aggress',
 'Ambitio',
 'Analy',
 'Assert',
 'Athlet',
 'Autonom',
 'Boast',
 'Challeng']

In [6]:
female_words[:10]

['Affectionate',
 'Child',
 'Cheer',
 'Commit',
 'Communal',
 'Compassion',
 'Connect',
 'Considerate',
 'Cooperat',
 'Depend']

In [7]:
# read data
df = pd.read_csv("Train_rev1.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244768 entries, 0 to 244767
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Id                  244768 non-null  int64 
 1   Title               244767 non-null  object
 2   FullDescription     244768 non-null  object
 3   LocationRaw         244768 non-null  object
 4   LocationNormalized  244768 non-null  object
 5   ContractType        65442 non-null   object
 6   ContractTime        180863 non-null  object
 7   Company             212338 non-null  object
 8   Category            244768 non-null  object
 9   SalaryRaw           244768 non-null  object
 10  SalaryNormalized    244768 non-null  int64 
 11  SourceName          244767 non-null  object
dtypes: int64(2), object(10)
memory usage: 22.4+ MB


In [8]:
# text pre processing
df["FullDescription"] = df["FullDescription"].apply(lambda s: ' '.join(re.sub("(w+://S+)", " ", s).split()))
df["FullDescription"] = df["FullDescription"].apply(lambda s: ' '.join(re.sub("[.,!?:;-='...@#_/*]", " ", s).split()))
df["FullDescription"].replace('[0-9]+', '', regex=True, inplace=True)

# stopwords
nltk.download('stopwords')
stop = set(stopwords.words('english'))
# stop words removal function
def rem_en(input_txt):
    words = input_txt.lower().split()
    noise_free_words = [word for word in words if word not in stop] 
    noise_free_text = " ".join(noise_free_words) 
    return noise_free_text
df["FullDescription"] = df["FullDescription"].apply(lambda s: rem_en(s))

# remove punctuations
def rem_pu(input_txt):
    words = input_txt.lower().split()
    noise_free_words = [word for word in words if word not in punctuation] 
    noise_free_text = " ".join(noise_free_words) 
    return noise_free_text
df["FullDescription"] = df["FullDescription"].apply(lambda s: rem_pu(s))

# tokenize the job descriptions
from nltk.tokenize import RegexpTokenizer
tokeniser = RegexpTokenizer(r'\w+')
df["FullDescription_token"] = df["FullDescription"].apply(lambda x: tokeniser.tokenize(x))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shreyavinaynayak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# function to get word count from job descriptions
def word_count_token(male_words, female_words, text_token):
    
    # initialize dictionaries to store word count
    male_words_all_count = {}
    female_words_all_count = {}
    
    # male words count
    for i in range(len(male_words)):
        male_word = male_words[i]
        male_count = 0
        male_all_count = 0
        if (re.match(male_word, text_token)):
            male_count_match=1
        else:
            male_count_match=0
        male_all_count=male_count_match+len(re.findall(' '+male_word, text_token))
        male_words_all_count[male_word] = male_all_count
    
    # female words count
    for j in range(len(female_words)):
        female_word = female_words[j]
        female_count = 0
        female_all_count = 0
        if (re.match(female_word, text_token)):
            female_count_match=1
        else:
            female_count_match=0
        female_all_count=female_count_match+len(re.findall(' '+female_word, text_token))
        female_words_all_count[female_word] = female_all_count
        
    return male_words_all_count, female_words_all_count

In [37]:
processed_male_words = [' '.join(tokeniser.tokenize(rem_pu(rem_en(x)))) for x in male_words]
processed_female_words = [' '.join(tokeniser.tokenize(rem_pu(rem_en(x)))) for x in female_words]


In [38]:

# update main data frame
df['all_word_counts'] = df["FullDescription"].apply(lambda x: word_count_token(processed_male_words, processed_female_words, x))

# subset dimensions
df1 = df[['LocationNormalized','Category','SalaryNormalized', 'all_word_counts']]

# get all separate counts
df1['male_words_all_count'] = df1["all_word_counts"].apply(lambda x: x[0])
df1['female_words_all_count'] = df1["all_word_counts"].apply(lambda x: x[1])

# dimensions
df_dim = df1[['LocationNormalized','Category', 'SalaryNormalized']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['male_words_all_count'] = df1["all_word_counts"].apply(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['female_words_all_count'] = df1["all_word_counts"].apply(lambda x: x[1])


In [41]:
# make all data frames
# dataframe 1
male_words_all_count_list = list(df1['male_words_all_count'])
df_male_words_all_count = pd.DataFrame(male_words_all_count_list)
dfs = [df_dim, df_male_words_all_count]
df_male_words_all_count = pd.concat(dfs, axis = 1)
df_mwac = df_male_words_all_count.melt(id_vars=['LocationNormalized','Category','SalaryNormalized'], var_name = 'words', value_name = 'count').reset_index(drop=True)
df_mwac['gender'] = 'male'
df_mwac['count_type'] = 'total'
# data frame 2
female_words_all_count_list = list(df1['female_words_all_count'])
df_female_words_all_count = pd.DataFrame(female_words_all_count_list)
dfs = [df_dim, df_female_words_all_count]
df_female_words_all_count = pd.concat(dfs, axis = 1)
df_fwac = df_female_words_all_count.melt(id_vars=['LocationNormalized','Category','SalaryNormalized'], var_name = 'words', value_name = 'count').reset_index(drop=True)
df_fwac['gender'] = 'female'
df_fwac['count_type'] = 'total'

# get total counts
df_female_words_all_count['total_female_words'] = df_female_words_all_count.loc[:, processed_female_words].sum(1)
df_male_words_all_count['total_male_words'] = df_male_words_all_count.loc[:, processed_male_words].sum(1)

# make data frames for unique counts
# data frame 3
df_mwc = df_mwac.copy()
df_mwc.loc[df_mwc['count'] > 1, 'count'] = 1
df_mwc.loc[df_mwc['count_type']=='total','count_type'] = 'unique'
# dataframe 4
df_fwc = df_fwac.copy()
df_fwc.loc[df_fwc['count'] > 1, 'count'] = 1
df_fwc.loc[df_fwc['count_type']=='total','count_type'] = 'unique'

# combine all data
df_all = [df_mwc, df_mwac, df_fwc, df_fwac]
df_final = pd.concat(df_all, axis = 0)
# remove rows where count is 0
df_final = df_final[df_final['count'] > 0]

# group by dimensions and aggreagte counts
df_final_group = df_final.groupby(['LocationNormalized', 'Category',
                                   'words','gender','count_type'])['count'].sum().reset_index()

In [44]:
# save df_final_group
compression_opts = dict(method='zip',
                         archive_name='df_final_group.csv')
df_final_group.to_csv('out.zip', index=False,
           compression=compression_opts)

In [45]:
# make data frame for comparison
df_compare = df_male_words_all_count[['LocationNormalized', 'Category', 'SalaryNormalized', 'total_male_words']]
df_compare['total_female_words'] = df_female_words_all_count['total_female_words']

# ratio calculation
def compare(x):
    if (x['total_male_words']==0 and x['total_female_words']==0):
        ratio = -1
    elif (x['total_male_words']>0 and x['total_female_words']==0):
        ratio = 10
    elif (x['total_male_words']==0 and x['total_female_words']>0):
        ratio = 0
    else:
        ratio = x['total_male_words'] / x['total_female_words']
    return ratio

df_compare['ratio'] = df_compare.apply(compare, axis = 1)

# label maker
def label(x):
    if abs(x) == 1:
        label = 'neutral'
    elif x > 1:
        label = 'male'
    else:
        label = 'female'
    return label

df_compare['label'] = df_compare['ratio'].apply(lambda x: label(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_compare['total_female_words'] = df_female_words_all_count['total_female_words']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_compare['ratio'] = df_compare.apply(compare, axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_compare['label'] = df_compare['ratio'].apply(lambda x: label(

In [46]:
# save df compare
compression_opts1 = dict(method='zip',
                         archive_name='df_compare.csv')
df_compare.to_csv('out1.zip', index=True,
           compression=compression_opts1)