In [2]:
#python -m spacy download da_core_news_sm

import pandas as pd
import numpy as np

import nltk
import spacy

import requests
import json

### Create term list


In [8]:
# Read the review corpus from disk and preview its first six rows
df = pd.read_csv('data/reviews.csv')
df.head(6)

Unnamed: 0,Title,URL,Author,Date,Rating,Review,Sentiment,ID,Review_cleaned
0,Modig,https://www.ekkofilm.dk/anmeldelser/modig/,Jakob Stegelmann,29/08/2012,6,"Modiger en Pixar-film, der ikke blot handler o...",pos,0,modiger en pixarfilm der ikke blot handler om ...
1,On the Road,https://www.ekkofilm.dk/anmeldelser/road/,Lars Movin,03/10/2012,3,Filmatiseringen af Jack Kerouacs beat-romanOn ...,neg,1,filmatiseringen af jack kerouacs beatromanon t...
2,Hestehviskeren Buck,https://www.ekkofilm.dk/anmeldelser/hestehvisk...,nne-Sophie Thostrup,26/09/2012,4,"Det er smukt og stemningsfuldt, når den amerik...",neu,2,det er smukt og stemningsfuldt når den amerika...
3,Marie Krøyer,https://www.ekkofilm.dk/anmeldelser/marie-kroyer/,Klaus Rifbjerg,26/09/2012,3,"I hver anden butik på Skt. Laurentiivej, som e...",neg,3,i hver anden butik på skt laurentiivej som er ...
4,Forbrydelsen III,https://www.ekkofilm.dk/anmeldelser/forbrydels...,Bo Tao Michaëlis,23/09/2012,4,Et maltrakteret lig bliver fundet ude i Københ...,neu,4,et maltrakteret lig bliver fundet ude i københ...
5,Kapringen,https://www.ekkofilm.dk/anmeldelser/kapringen/,Jesper Bo Petersen,19/09/2012,5,"”Jeg elsker dig,” nærmest sprutter skibskokken...",pos,5,jeg elsker dig nærmest sprutter skibskokken mi...


### Extract gender-specific terms

In [40]:
# Part-of-Speech (POS) tagging for adjectives and nouns

def extract_POS(text):
    """
    Run `text` through the module-level `nlp` pipeline and collect the
    surface forms of its adjectives and nouns.

    Returns a tuple `(adjectives, nouns)`: two lists of token texts, each in
    the order the tokens occur in the parsed document.
    """
    adjectives, nouns = [], []
    # Route each token to the list matching its coarse POS tag; other tags are ignored
    bucket_by_tag = {"ADJ": adjectives, "NOUN": nouns}
    for token in nlp(text):
        bucket = bucket_by_tag.get(token.pos_)
        if bucket is not None:
            bucket.append(token.text)
    return adjectives, nouns

# Load the Danish spaCy pipeline before tagging: extract_POS reads the
# module-level `nlp`, so it must exist for this cell to run on a fresh kernel
nlp = spacy.load("da_core_news_sm")

# Apply function to extract POS tags (one (adjectives, nouns) tuple per review)
POS_results = df['Review_cleaned'].apply(extract_POS)

# Flatten the per-review lists into one list per part of speech
all_adjectives = [word for adjectives, _ in POS_results for word in adjectives]
all_nouns = [word for _, nouns in POS_results for word in nouns]

# Get unique POS words, sorted; a plain set avoids building a fixed-width
# NumPy string array sized by the longest token
unique_adjectives = sorted(set(all_adjectives))
unique_nouns = sorted(set(all_nouns))

In [None]:
# Report the vocabulary sizes, one line per part of speech
print(
    f'Length of the list with unique adjectives: {len(unique_adjectives)}',  # 22,631
    f'Length of the list with unique nouns: {len(unique_nouns)}',  # 98,288
    sep="\n",
)

In [290]:
# Wrap each vocabulary list in a single-column DataFrame
unique_adjectives_df, unique_nouns_df = (
    pd.DataFrame(words, columns = [column_name])
    for words, column_name in (
        (unique_adjectives, "adjectives"),
        (unique_nouns, "nouns"),
    )
)

# Persist both vocabularies as .csv files, leaving out the row index
unique_adjectives_df.to_csv("data/unique_adjectives.csv", index = False)
unique_nouns_df.to_csv("data/unique_nouns.csv", index = False)

### Extract gender-specific names

In [None]:
# Named Entity Recognition (NER) to extract names

def load_spacy():
    """
    Load and return the Danish spaCy pipeline 'da_core_news_sm'.

    The model package must already be installed
    (`python -m spacy download da_core_news_sm`).
    """
    return spacy.load("da_core_news_sm")

# For one review text, collect the entities the model labels "PER"
def extract_names(text):
    """
    Parse `text` with the module-level `nlp` pipeline and return the texts of
    all named entities whose label is "PER", in document order.
    """
    person_entities = filter(lambda ent: ent.label_ == "PER", nlp(text).ents)
    return [ent.text for ent in person_entities]

# Load model
nlp = load_spacy()

# Collect every distinct name found across all reviews.
# obs, the names are extracted from the non-cleaned text column, so they keep
# their original casing and punctuation.
# A set comprehension replaces Series.sum(), which concatenates the per-review
# lists pairwise (quadratic) and does not return a list when the column is empty.
# Sorting makes the order, and therefore any batches built from it, reproducible.
unique_names = sorted({
    name
    for review_names in df['Review'].apply(extract_names)
    for name in review_names
})

print(len(unique_names)) # 26,373
# Preview only: printing the full list floods the cell output
print(unique_names[:20])

In [None]:
# Genderize endpoint and credentials
# ("add_api_key" is a placeholder; substitute a real key before running)
API_URL = "https://api.genderize.io"
API_KEY = "add_api_key"

# Number of names sent per API request (API limit: 10)
batch_size = 10

# Minimum probability for a predicted gender to be accepted
confidence_threshold = 0.75

# Accumulators filled by the classification loop, one independent list per outcome
male_names, female_names, unknown_names = [], [], []

# Fetch gender data for a batch of names
def fetch_gender(names_batch, timeout = 10):
    """
    Send one batch of names to the Genderize API and return the decoded reply.

    names_batch: list of names, passed as the repeated `name[]` query parameter.
    timeout: seconds to wait for the server; without it a stalled connection
        would block the notebook indefinitely. When it expires, `requests`
        raises an exception that the caller has to handle.

    Returns the parsed JSON body on HTTP 200. For any other status the status
    code and response text are printed and an empty list is returned.
    """
    params = {"apikey": API_KEY, "name[]": names_batch}
    response = requests.get(API_URL, params = params, timeout = timeout)
    if response.status_code == 200:
        return response.json()
    print(f"Error: {response.status_code} - {response.text}")
    return []

# Process names in batches and classify gender.
# Iterates over `unique_names` (the de-duplicated NER output). Names from a batch
# that fails (fetch_gender returned [] or an exception was raised) end up in
# none of the three lists; check the printed errors before trusting the counts.
for i in range(0, len(unique_names), batch_size):

    batch = unique_names[i:i + batch_size]
    print(f"Processing batch: {batch}")

    try:
        data = fetch_gender(batch)
        for entry in data:
            gender = entry.get("gender")
            probability = entry.get("probability")

            # A missing or null probability counts as "not confident"
            is_confident = probability is not None and probability >= confidence_threshold

            if is_confident and gender == "male":
                male_names.append(entry["name"])

            elif is_confident and gender == "female":
                female_names.append(entry["name"])

            else:
                # Low confidence, missing fields, or a gender other than male/female:
                # record the name as unknown so that it is not silently dropped
                unknown_names.append(entry["name"])

    except Exception as e:
        print(f"Error processing batch {batch}: {e}")


In [54]:
# Report how many names fell into each gender bucket, one line per bucket
print(
    f'Length of the list with male names: {len(male_names)}',  # 14,791
    f'Length of the list with female names: {len(female_names)}',  # 5,838
    f'Length of the list with unknown names: {len(unknown_names)}',  # 5,744
    sep="\n",
)

Length of the list with male names: 14791
Length of the list with female names: 5838
Length of the list with unknown names: 5744


In [38]:
# Pattern matching any character that is neither a word character nor whitespace
punc = r'[^\w\s]'

# Build one single-column frame each for the male and female names, lowercased
# and with punctuation removed to align with the cleaned df
# (the unknown names are not exported)
male_df, female_df = (
    pd.DataFrame(gendered_names, columns = ["Name"]).assign(
        Name = lambda frame: frame["Name"].str.lower().replace(punc, '', regex = True)
    )
    for gendered_names in (male_names, female_names)
)

# Save to .csv files
male_df.to_csv("data/male_names.csv", index = False)
female_df.to_csv("data/female_names.csv", index = False)