In [12]:
#pip install gender-guesser
#python -m spacy download da_core_news_sm

import pandas as pd
import numpy as np

import nltk
import spacy

import gender_guesser.detector as gd

### Create term list


In [5]:
# Read the review data set and preview its first six rows
reviews_path = '/work/SofieNørboMosegaard#5741/NLP/NLP-exam/data/reviews.csv'
df = pd.read_csv(reviews_path)
df.head(6)

Unnamed: 0,Title,URL,Author,Date,Rating,Review,Sentiment,ID,Title_cleaned,Review_cleaned
0,Modig,https://www.ekkofilm.dk/anmeldelser/modig/,Jakob Stegelmann,29/08/2012,6,"Modiger en Pixar-film, der ikke blot handler o...",pos,0,modig,modiger en pixarfilm der ikke blot handler om ...
1,On the Road,https://www.ekkofilm.dk/anmeldelser/road/,Lars Movin,03/10/2012,3,Filmatiseringen af Jack Kerouacs beat-romanOn ...,neg,1,on the road,filmatiseringen af jack kerouacs beatromanon t...
2,Hestehviskeren Buck,https://www.ekkofilm.dk/anmeldelser/hestehvisk...,nne-Sophie Thostrup,26/09/2012,4,"Det er smukt og stemningsfuldt, når den amerik...",pos,2,hestehviskeren buck,det er smukt og stemningsfuldt når den amerika...
3,Marie Krøyer,https://www.ekkofilm.dk/anmeldelser/marie-kroyer/,Klaus Rifbjerg,26/09/2012,3,"I hver anden butik på Skt. Laurentiivej, som e...",neg,3,marie krøyer,i hver anden butik på skt laurentiivej som er ...
4,Forbrydelsen III,https://www.ekkofilm.dk/anmeldelser/forbrydels...,Bo Tao Michaëlis,23/09/2012,4,Et maltrakteret lig bliver fundet ude i Københ...,pos,4,forbrydelsen iii,et maltrakteret lig bliver fundet ude i københ...
5,Kapringen,https://www.ekkofilm.dk/anmeldelser/kapringen/,Jesper Bo Petersen,19/09/2012,5,"”Jeg elsker dig,” nærmest sprutter skibskokken...",pos,5,kapringen,jeg elsker dig nærmest sprutter skibskokken mi...


### Extract gender-specific terms

In [40]:
# Part-of-Speech (POS) tagging for adjectives and nouns

def extract_POS(text):
    """
    Extract the adjectives and nouns from a text.

    The text is parsed with the notebook-level spaCy pipeline `nlp`, which
    must have been loaded before this function is called.

    Parameters
    ----------
    text : str
        Text to tag. Any non-string value (e.g. the NaN pandas produces for
        an empty CSV cell) is treated as a text without words.

    Returns
    -------
    tuple of (list of str, list of str)
        `(adjectives, nouns)`: the text of every token tagged "ADJ" and
        "NOUN" respectively, in document order and with duplicates kept.
    """
    # A missing review is not text; without this guard a single NaN would
    # abort the whole `.apply` call
    if not isinstance(text, str):
        return [], []

    adjectives, nouns = [], []
    # One pass over the parsed document fills both lists
    for token in nlp(text):
        if token.pos_ == "ADJ":
            adjectives.append(token.text)
        elif token.pos_ == "NOUN":
            nouns.append(token.text)
    return adjectives, nouns

# Tag every cleaned review; each result is an (adjectives, nouns) pair.
# NOTE: extract_POS uses the spaCy pipeline `nlp`, so the model has to be loaded before this cell runs.
POS_results = df['Review_cleaned'].apply(extract_POS)

# Flatten the per-review lists into one long list per word class
all_adjectives = [word for review_adjectives, _ in POS_results for word in review_adjectives]
all_nouns = [word for _, review_nouns in POS_results for word in review_nouns]

# Reduce each list to its sorted, distinct words
unique_adjectives = np.unique(all_adjectives).tolist()
unique_nouns = np.unique(all_nouns).tolist()

In [None]:
# Report how many distinct words of each class were found
n_unique_adjectives = len(unique_adjectives)  # 22,631
n_unique_nouns = len(unique_nouns)  # 98,288
print(f'Length of the list with unique adjectives: {n_unique_adjectives}')
print(f'Length of the list with unique nouns: {n_unique_nouns}')

In [290]:
# Wrap each word list in a single-column DataFrame
unique_adjectives_df = pd.DataFrame({"adjectives": unique_adjectives})
unique_nouns_df = pd.DataFrame({"nouns": unique_nouns})

# Write both tables to .csv files, leaving out the index column
unique_adjectives_df.to_csv(
    "/work/SofieNørboMosegaard#5741/NLP/exam_testing/data/unique_adjectives.csv",
    index=False,
)
unique_nouns_df.to_csv(
    "/work/SofieNørboMosegaard#5741/NLP/exam_testing/data/unique_nouns.csv",
    index=False,
)

### Extract gender-specific names

In [None]:
# Named Entity Recognition (NER) to extract names

def load_spacy():
    """
    Load and return the Danish spaCy pipeline 'da_core_news_sm'.

    The model has to be installed beforehand (see the download command in the
    first cell of the notebook).
    """
    return spacy.load("da_core_news_sm")

# For each review, extract the entities recognised as person names (entity label "PER")
def extract_names(text):
    """
    Extract the person names mentioned in a text.

    The text is parsed with the notebook-level spaCy pipeline `nlp`, and the
    entities labelled "PER" are kept.

    Parameters
    ----------
    text : str
        Text to search. Any non-string value (e.g. the NaN pandas produces
        for an empty CSV cell) is treated as a text without names.

    Returns
    -------
    list of str
        The text of every "PER" entity, in document order and with
        duplicates kept.
    """
    # A missing review is not text; without this guard a single NaN would
    # abort the whole `.apply` call
    if not isinstance(text, str):
        return []

    return [ent.text for ent in nlp(text).ents if ent.label_ == "PER"]

# Load model
nlp = load_spacy()

# Collect every distinct name found across all reviews.
# Note: the names are extracted from the non-cleaned text column, so they keep their original capitalisation.
# A set comprehension replaces summing the per-review lists: `.sum()` re-copies the growing
# list for every review and fails when there are no reviews at all. Sorting makes the order
# (and therefore everything derived from it) identical on every run.
unique_names = sorted({name for names in df['Review'].apply(extract_names) for name in names})

print(len(unique_names)) # 26,373
print(unique_names[:20]) # preview only; the full list is too long to display

In [21]:
# gender-guesser package: https://pypi.org/project/gender-guesser/

# Initialize the gender detector (`gd` is gender_guesser.detector, imported in the first cell).
# Its get_gender() method is used further down to classify the extracted names by first name.
gender_detector = gd.Detector()

In [160]:
# Initialize empty lists for male, female and unresolved names
male_names = []
female_names = []
unknown_names = [] # the unknown names will be classified manually

# Classify each name from the list as female/male/unknown using
# gender_detector.get_gender(). The name is split on whitespace and only the
# first element (i.e. the first name) is looked up. The full name is then
# appended to the list matching the detected gender. Every answer other than
# exactly 'male' or 'female' (gender-guesser can also return 'mostly_male',
# 'mostly_female', 'andy' and 'unknown') goes to `unknown_names`.
for name in unique_names:
    name_parts = name.split()
    # An entity made up of whitespace only has no first name to look up
    gender = gender_detector.get_gender(name_parts[0]) if name_parts else 'unknown'

    if gender == 'male':
        male_names.append(name)
    elif gender == 'female':
        female_names.append(name)
    else:
        unknown_names.append(name)

In [176]:
# Report how many names ended up in each gender list
n_male_names = len(male_names)  # 9,817
n_female_names = len(female_names)  # 4,414
print(f'Length of the list with male names: {n_male_names}')
print(f'Length of the list with female names: {n_female_names}')

Length of the list with male names: 9817
Length of the list with female names: 4414


In [193]:
# Pattern matching every character that is neither a word character nor whitespace
punc = r'[^\w\s]'


def _normalize_names(names):
    """
    Build a single-column ("Name") DataFrame of normalised names.

    Each name is lower-cased and stripped of punctuation to align with the
    cleaned review text. Names that become identical after normalisation are
    kept only once (first occurrence wins).
    """
    normalized = (
        pd.Series(names, dtype="object")
        .str.lower()
        .str.replace(punc, '', regex=True)
        .drop_duplicates()
    )
    return normalized.to_frame(name="Name").reset_index(drop=True)


# Create dfs for male and female names
male_df = _normalize_names(male_names)
female_df = _normalize_names(female_names)

# Save to .csv files
male_df.to_csv("/work/SofieNørboMosegaard#5741/NLP/exam_testing/data/male_names.csv", index = False)
female_df.to_csv("/work/SofieNørboMosegaard#5741/NLP/exam_testing/data/female_names.csv", index = False)