 A reusable function that takes any dataset with a text column and a gender column and returns the top words associated with each gender:

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

In [5]:
# Directory holding the per-profession CSV files. It is defined once so the
# notebook can be pointed at a different copy of the data by editing a single
# line instead of three.
DATA_DIR = r"C:\Shilpi\BFH\MSE_Courses\Semester4\P2_BiasMitigation\New_again\P2_Code_New\csv_DS"

# Raw f-strings keep the Windows backslashes literal (e.g. "\n" in
# "\nurse_ds.csv" must not be read as a newline).
df_physician = pd.read_csv(rf"{DATA_DIR}\physician_ds.csv")
df_nurse = pd.read_csv(rf"{DATA_DIR}\nurse_ds.csv")
df_surgeon = pd.read_csv(rf"{DATA_DIR}\surgeon_ds.csv")

In [7]:
# Peek at the last five rows of the nurse dataset to check its columns.
df_nurse.tail(n=5)

Unnamed: 0,hard_text,profession,gender
18945,She graduated with honors in 2009. Having more...,nurse,Female
18946,"Twenty-four years ago, she was born three mont...",nurse,Female
18947,"One day, Pam walked into the hospital break ro...",nurse,Female
18948,She graduated with honors in 2013. Having more...,nurse,Female
18949,He graduated with honors in 2009. Having more ...,nurse,Male


In [None]:

def top_gender_words(df, text_col='text', target_col='gender', top_n=20):
    """
    Returns top words associated with each gender in a dataset.

    A logistic regression is fitted on binary bag-of-words features
    (word present / absent) to predict ``target_col`` from ``text_col``.
    Words are then ranked by their coefficient: the largest coefficients
    on the male side and the largest on the female side are returned.

    Parameters:
    - df: pandas DataFrame
    - text_col: name of the text column
    - target_col: name of the gender column; must hold exactly two
      distinct labels
    - top_n: number of top words to return per gender (default 20)

    Returns:
    - top_words: dictionary with 'male' and 'female' lists of top words,
      each ordered from strongest to weakest association. Each list has
      at most ``top_n`` entries (fewer if the vocabulary is smaller).

    Raises:
    - ValueError: if ``top_n`` is negative, or if ``target_col`` does not
      contain exactly two distinct labels.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    X = df[text_col]
    y = df[target_col]

    # coef_[0] is only a "label A vs. label B" contrast for a binary target;
    # with more labels it would silently describe one-vs-rest for one class.
    n_labels = y.nunique()
    if n_labels != 2:
        raise ValueError(
            f"'{target_col}' must contain exactly two distinct labels, "
            f"found {n_labels}"
        )

    # Vectorize text (binary=True: record word presence, not word counts)
    vectorizer = CountVectorizer(binary=True)
    X_vec = vectorizer.fit_transform(X)

    # Train logistic regression
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_vec, y)

    # Get feature names and coefficients
    features = np.array(vectorizer.get_feature_names_out())
    coef = clf.coef_[0]

    # In the binary case coef_[0] scores clf.classes_[1], i.e. the label that
    # sorts last. For 'Female'/'Male' that is 'Male', so positive means male.
    # If the label that sorts last is a female label instead (for example
    # 'man' < 'woman'), flip the sign so positive still means male.
    positive_label = str(clf.classes_[1]).strip().lower()
    if positive_label in ('female', 'f', 'woman', 'women'):
        coef = -coef

    # Sort once (ascending). Slicing with an explicit count avoids the
    # `[-0:]` pitfall, where top_n=0 would select every feature.
    order = np.argsort(coef)
    k = min(top_n, len(order))
    top_male = features[order[::-1][:k]]    # highest positive first
    top_female = features[order[:k]]        # most negative first

    return {'male': top_male.tolist(), 'female': top_female.tolist()}


In [19]:
# Map each profession label to its loaded DataFrame (insertion order is the
# order in which the professions are processed and printed later).
datasets = dict(
    Physician=df_physician,
    Nurse=df_nurse,
    Surgeon=df_surgeon,
)

In [20]:
def top_words_df(df, text_col='hard_text', target_col='gender', top_n=20):
    """
    Return the top gender-associated words of `df` as a two-column DataFrame.

    Delegates to `top_gender_words` and lays its result out side by side:
    column 'male' holds the male list, column 'female' the female list.

    Parameters:
    - df: pandas DataFrame
    - text_col: name of the text column
    - target_col: name of the gender column
    - top_n: number of top words per gender
    """
    ranked = top_gender_words(
        df,
        text_col=text_col,
        target_col=target_col,
        top_n=top_n,
    )
    columns = {label: ranked[label] for label in ('male', 'female')}
    return pd.DataFrame(columns)


In [21]:
# Build one top-words table per profession, keyed by profession name.
top_words_by_profession = {
    prof_name: top_words_df(df_prof)
    for prof_name, df_prof in datasets.items()
}


In [22]:
# Show each profession's table under a header line (blank line before each).
for prof_name in top_words_by_profession:
    df_words = top_words_by_profession[prof_name]
    print(f"\nTop gender words for {prof_name}:", df_words, sep="\n")


Top gender words for Physician:
           male       female
0            mr          she
1            he           ms
2           his          her
3           him          mrs
4   orthopaedic        women
5       surgery   gynecology
6           the  specializes
7            of          can
8    california          and
9      research           is
10           at        phone
11           to      reached
12      michael          via
13          its       listed
14        james     registry
15      himself   affiliated
16         john   obstetrics
17           on   physicians
18      twitter         this
19      faculty    assistant

Top gender words for Nurse:
            male     female
0             he        she
1             mr        her
2            his         ms
3            him        mrs
4             to         of
5           john      women
6        richard   patients
7             by        all
8        michael         as
9          david        ann
10     institute   in

In [23]:
# Write each profession's table to its own CSV in the working directory,
# without the row index.
for prof_name in top_words_by_profession:
    df_words = top_words_by_profession[prof_name]
    csv_name = f"top_gender_words_{prof_name}.csv"
    df_words.to_csv(csv_name, index=False)
