In [5]:
import pandas as pd
import re

# Load the lyrics dataset from 'cleaned_data.csv', resolved relative to the
# current working directory. This notebook reads its 'Release Date', 'Lyrics'
# and 'Album' columns.
df = pd.read_csv('cleaned_data.csv')

# Parse the release dates day-first, store the parsed values back, and derive
# the calendar year from them.
release_dates = pd.to_datetime(df['Release Date'], dayfirst=True)
df['Release Date'] = release_dates
df['Release Year'] = release_dates.dt.year

# Vocabulary searched for in the lyrics. Every entry is matched as a whole
# word, case-insensitively (see _compile_terms below).
male_terms = [
    'he', 'him', 'his', 'himself', 'boy', 'boyfriend', 'husband', 'father', 'son', 'uncle', 'nephew', 'brother',
    'man', 'men', 'male', 'gentleman', 'gentlemen', 'sir', 'mr', 'mister', 'king', 'prince', 'lord'
]
female_terms = [
    'she', 'her', 'hers', 'herself', 'girl', 'girlfriend', 'wife', 'mother', 'daughter', 'aunt', 'niece', 'sister',
    'woman', 'women', 'female', 'lady', 'ladies', 'miss', 'ms', 'mrs', 'queen', 'princess', 'madam'
]
# 'their' is listed so the neutral category covers the possessive form, the
# same way 'his' and 'her' are covered in the gendered lists.
neutral_pronouns = ['they', 'them', 'their', 'theirs', 'themselves']


def _compile_terms(terms):
    """Return one case-insensitive, whole-word regex per entry of ``terms``.

    Each term goes through ``re.escape`` so that a term containing regex
    metacharacters is matched literally instead of being interpreted as
    pattern syntax. Purely alphabetic terms are unaffected by the escaping.
    """
    return [re.compile(r'\b' + re.escape(term) + r'\b', flags=re.IGNORECASE) for term in terms]


# Compile once up front so the same pattern objects are reused for every lyric.
# The dict keys name the three term categories; insertion order is preserved.
compiled_patterns = {
    'male_terms': _compile_terms(male_terms),
    'female_terms': _compile_terms(female_terms),
    'neutral_pronouns': _compile_terms(neutral_pronouns)
}

# Function to count occurrences of compiled regex patterns in text
def count_terms(text, pattern_list):
    """Count how many times the given patterns match inside ``text``.

    Args:
        text: The value to scan. Anything that is not a ``str`` (for example a
            missing value) is treated as containing no matches.
        pattern_list: Iterable of compiled regex patterns.

    Returns:
        The total number of non-overlapping matches, summed over every pattern
        in ``pattern_list``; ``0`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        total = 0
        for regex in pattern_list:
            total += len(regex.findall(text))
        return total
    return 0

# Add one count column per term category: every row's lyrics are scanned with
# that category's compiled patterns and the number of matches is stored.
lyrics = df['Lyrics']
for category in compiled_patterns:
    df[category] = lyrics.apply(count_terms, args=(compiled_patterns[category],))

# Collapse to one row per album: the smallest release year found among the
# album's rows, plus the summed counts for each term category.
count_columns = ['male_terms', 'female_terms', 'neutral_pronouns']
aggregations = {'Release Year': 'min', **dict.fromkeys(count_columns, 'sum')}
album_stats = df.groupby('Album').agg(aggregations).reset_index()

# Total number of matched terms per album across the three categories.
album_stats['total_terms'] = (
    album_stats['male_terms'] + album_stats['female_terms'] + album_stats['neutral_pronouns']
)

# Share of each category in the album's total, as a percentage rounded to two
# decimals. The columns are kept numeric (float) rather than converted to
# formatted strings, so they can be sorted, averaged and plotted directly.
# An album with no matched terms has total_terms == 0; its percentages come
# out as NaN (0 / 0), which pandas reductions such as mean() skip by default.
for category in ('male_terms', 'female_terms', 'neutral_pronouns'):
    share = album_stats[category] / album_stats['total_terms']
    album_stats[f'{category}_pct'] = (share * 100).round(2)

# Order the albums by ascending release year.
album_stats = album_stats.sort_values(by='Release Year')

# Write the table to CSV without the index column, then leave the frame as the
# last expression so the notebook renders it as the cell output.
output_filename = 'album_gender_bias_stats_sorted_by_year.csv'
album_stats.to_csv(path_or_buf=output_filename, index=False)
album_stats


Unnamed: 0,Album,Release Year,male_terms,female_terms,neutral_pronouns,total_terms,male_terms_pct,female_terms_pct,neutral_pronouns_pct
6,Taylor Swift (Deluxe),2007,5,10,0,15,33.33,66.67,0.0
0,1989 (Deluxe),2014,2,32,0,34,5.88,94.12,0.0
9,reputation,2017,23,13,0,36,63.89,36.11,0.0
2,Lover,2019,60,15,0,75,80.0,20.0,0.0
8,folklore (deluxe version),2020,7,21,0,28,25.0,75.0,0.0
1,Fearless (Taylor’s Version),2021,42,27,0,69,60.87,39.13,0.0
4,Red (Taylor’s Version),2021,28,45,0,73,38.36,61.64,0.0
7,evermore (deluxe version),2021,19,12,0,31,61.29,38.71,0.0
3,Midnights (3am Edition),2022,18,26,0,44,40.91,59.09,0.0
5,Speak Now (Taylor’s Version),2023,17,25,0,42,40.48,59.52,0.0


In [6]:

# Unweighted mean of the per-album percentages (each album counts once).
# The cast to float makes the averaging work even when the percentage columns
# hold formatted strings rather than numbers.
pct_as_float = {
    column: album_stats[column].astype(float)
    for column in ('male_terms_pct', 'female_terms_pct', 'neutral_pronouns_pct')
}
avg_male_terms_pct = pct_as_float['male_terms_pct'].mean()
avg_female_terms_pct = pct_as_float['female_terms_pct'].mean()
avg_neutral_pronouns_pct = pct_as_float['neutral_pronouns_pct'].mean()

# Report the three averages with two decimals.
print(f"Average male terms percentage: {avg_male_terms_pct:.2f}%")
print(f"Average female terms percentage: {avg_female_terms_pct:.2f}%")
print(f"Average neutral pronouns percentage: {avg_neutral_pronouns_pct:.2f}%")

# Persist the averages as a single-row CSV (no index column).
avg_stats = pd.DataFrame(
    [[avg_male_terms_pct, avg_female_terms_pct, avg_neutral_pronouns_pct]],
    columns=['avg_male_terms_pct', 'avg_female_terms_pct', 'avg_neutral_pronouns_pct'],
)
output_avg_filename = 'average_gender_bias_stats.csv'
avg_stats.to_csv(output_avg_filename, index=False)

Average male terms percentage: 45.00%
Average female terms percentage: 55.00%
Average neutral pronouns percentage: 0.00%
