In [85]:
import os
import pandas as pd

In [86]:
##### MODEL SELECTION #####
# Exactly one model identifier is active; the commented lines are ready-to-swap alternatives.
model_name = "unsloth/Mistral-Small-Instruct-2409-bnb-4bit"
#model_name = "unsloth/Phi-4-mini-instruct-bnb-4bit"
#model_name = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-bnb-4bit"
#model_name = "unsloth/Qwen2-7B-Instruct-bnb-4bit"
#model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"

# Bare model name: everything after the last '/' of the identifier.
modele_str = model_name.rsplit('/', 1)[-1]

###### TOPIC MODELING #####
# Column holding the text to model (one active choice).
target_text = "keywords_LLM_Gender"
#target_text = "keywords_LLM_Ethnicity"
#target_text = "english_lyrics"

# Column holding the class labels (one active choice).
target_labels = "genre_LLM"
#target_labels = "ethnicity_LLM"

In [87]:
# Labelled workbook of the selected model and the folder receiving its outputs.
PATH_input_gender = '/home/evuichard/Projet DEBIAR/labeled_lyrics_gender_' + modele_str + '.xlsx'
PATH_output_gender_temp = '/home/evuichard/Projet DEBIAR/' + modele_str + '/'
PATH_output_gender = PATH_output_gender_temp

# read_excel opens and closes the workbook itself, so no handle on the file is
# still open when the same path is overwritten at the end of this cell.
df = pd.read_excel(PATH_input_gender, sheet_name="Sheet1")

# Create the per-model output folder if it does not exist yet
# (exist_ok=True, as in the other cells, avoids the check-then-create race).
os.makedirs(PATH_output_gender, exist_ok=True)

# Keep only the rows that have both LLM predictions.
df = df.dropna(subset=['genre_LLM', 'ethnicity_LLM'])
# Drop the columns whose name starts with "Unnamed".
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
# Show the remaining column names.
print(df.keys())

# NOTE: the cleaned frame is written back over the input workbook; later cells re-read it.
df.to_excel(PATH_input_gender, index=False)

In [88]:
# Percentage of the keywords that occur in the translated lyrics
def keyword_percentage(lyrics, keywords):
    """Return the share (0-100) of ``keywords`` found as substrings of ``lyrics``.

    The comparison is case-insensitive and substring based, so a keyword also
    matches when it appears inside a longer word.

    Parameters
    ----------
    lyrics : str
        Text to search. Any non-string value (e.g. the NaN pandas yields for an
        empty cell, or None) is treated as "no lyrics".
    keywords : list of str
        Keywords to look for.

    Returns
    -------
    int or float
        0 when there are no keywords or no usable lyrics, otherwise
        ``matches / len(keywords) * 100``.
    """
    # Guard first: calling .lower() on a NaN/None lyric would raise AttributeError.
    if not keywords or not isinstance(lyrics, str):
        return 0
    lowered_lyrics = lyrics.lower()
    matches = sum(1 for keyword in keywords if keyword.lower() in lowered_lyrics)
    return matches / len(keywords) * 100

def score_keywords_model(model_name_f):
    """Print the mean keyword-coverage scores for one model's labelled workbook.

    Reads sheet "Sheet1" of ``labeled_lyrics_gender_<model>.xlsx``, scores every
    row with ``keyword_percentage`` for the gender and the ethnicity keyword
    columns, and prints the mean of each score column.

    Parameters
    ----------
    model_name_f : str
        Model identifier; only the part after the last '/' is used in the file name.

    Returns
    -------
    None
    """
    modele_str_f = model_name_f.split('/')[-1]

    df_f = pd.read_excel('/home/evuichard/Projet DEBIAR/labeled_lyrics_gender_' + modele_str_f + '.xlsx',
                         sheet_name="Sheet1")

    def split_keywords(raw):
        # A missing cell must give "no keywords"; str(NaN) would otherwise become
        # the literal keyword "nan", which matches any lyric containing "nan".
        if pd.isna(raw):
            return []
        return str(raw).replace(',', ' ').split()

    # Same scoring for both keyword columns.
    for score_column, keyword_column in (("score_keywords_gender", "keywords_LLM_Gender"),
                                         ("score_keywords_ethnicity", "keywords_LLM_Ethnicity")):
        df_f[score_column] = df_f.apply(
            lambda row, column=keyword_column: keyword_percentage(row["english_lyrics"], split_keywords(row[column])),
            axis=1,
        )

    # Display the results
    print(f"Résultats pour le modèle {model_name_f}:")
    print("Score moyen des keywords liés au genre :")
    print(df_f["score_keywords_gender"].mean())
    print("Score moyen des keywords liés à l'ethnicité :")
    print(df_f["score_keywords_ethnicity"].mean())
    print()

# Report the keyword scores of every candidate model, one after the other.
models_to_score = (
    "unsloth/Mistral-Small-Instruct-2409-bnb-4bit",
    "unsloth/Phi-4-mini-instruct-bnb-4bit",
    "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-bnb-4bit",
    "unsloth/Qwen2-7B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
)
for model_name_f in models_to_score:
    score_keywords_model(model_name_f)

# Statistiques

In [89]:
# Outputs of the statistics section go to a "stats" sub-folder of the model folder.
PATH_output_gender = f"{PATH_output_gender_temp}stats/"
os.makedirs(PATH_output_gender, exist_ok=True)

In [90]:
# Show the artists whose reference gender is "Person".
# Printed explicitly: a bare expression is only rendered when it is the last one of the cell.
print(df.loc[df['genre'] == 'Person', 'track_artist'].unique())

# Manual corrections noted by the author:
#   SAYMYNAME => female, Aleman => male, Fili => group, MiMS => male
manual_gender_by_artist = {
    'SAYMYNAME': 'female',
    'Aleman': 'male',
    'Fili': 'Group',
    'MiMS': 'male',
}
for artist, gender in manual_gender_by_artist.items():
    df.loc[df['track_artist'] == artist, 'genre'] = gender

# Keep only the three accepted reference genders. isin() is False for missing
# values, so rows without a gender are removed here as well.
df = df[df['genre'].isin(['female', 'male', 'Group'])]

# NOTE: overwrites the input workbook; the following cells re-read it.
df.to_excel(PATH_input_gender, index=False)


In [91]:
import matplotlib.pyplot as plt
df = pd.ExcelFile(PATH_input_gender).parse("Sheet1")

# Rows that carry an LLM gender prediction; print the reference-gender counts among them.
df_genre = df.loc[df['genre_LLM'].notna()]
print(df_genre['genre'].value_counts())

# First figure: number of rows per predicted gender.
predicted_gender_counts = df_genre['genre_LLM'].value_counts()
predicted_gender_counts.plot(kind='bar')
plt.title(f'Number of lines per genre for "{modele_str}"')
plt.xlabel('Predicted Gender (LLM)')
plt.ylabel('Number of lines')
plt.savefig(f'{PATH_output_gender}genre_LLM_{modele_str}.png')
plt.show()

# Second figure: predicted gender stacked within each reference gender.
genre_llm_counts = (
    df_genre.groupby('genre')['genre_LLM']
    .value_counts()
    .unstack(fill_value=0)
)
fig, ax = plt.subplots(figsize=(10, 6))
genre_llm_counts.plot(kind='bar', stacked=True, ax=ax)
ax.set(
    title=f'Distribution of genre_LLM by genre for "{modele_str}"',
    xlabel='Actual Gender',
    ylabel='Number of lines',
)
ax.tick_params(axis='x', rotation=45)
ax.legend(title='Predicted Gender (LLM)', loc='upper right')

fig.tight_layout()
fig.savefig(f'{PATH_output_gender}stacked_bar_chart_gender_{modele_str}.png')
plt.show()

In [92]:
from scipy.stats import chi2_contingency

# Rows having both a reference gender and a predicted gender.
df_cleaned_contingency = df.dropna(subset=['genre', 'genre_LLM']).copy()

# Reference gender (rows) x predicted gender (columns), then the independence test.
contingency_table = pd.crosstab(
    df_cleaned_contingency['genre'],
    df_cleaned_contingency['genre_LLM'],
)
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Render the three test statistics as a small table figure.
fig, ax = plt.subplots(figsize=(6, 2))
ax.axis('off')
table_data = [
    ["Chi-squared", f"{chi2:.3f}"],
    ["Degrees of Freedom", dof],
    ["P-value", f"{p:.4e}"],
]
table = ax.table(
    cellText=table_data,
    colLabels=["Stat", "Value"],
    cellLoc="center",
    loc="center",
)
table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1.5, 1.5)

ax.set_title("Chi-squared independence test results for genre and genre_LLM")
fig.tight_layout()
fig.savefig(f'{PATH_output_gender}chi-squared_independence_test_{modele_str}.png')
plt.show()


In [93]:
import seaborn as sns
# Row-normalise the contingency table: each cell becomes the percentage of its row total.
contingency_table_percentage = contingency_table.div(contingency_table.sum(axis=1), axis=0) * 100
# Cell labels such as "12.34%". Built column by column with Series.map because
# DataFrame.applymap is deprecated in recent pandas releases.
annot = contingency_table_percentage.apply(lambda column: column.map(lambda value: f"{value:.2f}%"))

plt.figure(figsize=(10, 7))
# fmt="" because the annotations are already formatted strings.
sns.heatmap(contingency_table_percentage, annot=annot, fmt="", cmap="Reds")

plt.title('Heatmap of LLM Gender Prediction Percentage by Actual Gender for "' + modele_str + '"')
plt.xlabel('Predicted Gender (LLM)')
plt.ylabel('Actual Gender')
plt.tight_layout()
plt.savefig(PATH_output_gender + 'heatmap_percentage_gender_' + modele_str + '.png')
plt.show()

In [94]:
import matplotlib.pyplot as plt
df = pd.ExcelFile(PATH_input_gender).parse("Sheet1")

# Rows that carry an LLM ethnicity prediction; print the reference-ethnicity counts among them.
df_ethnicity = df.loc[df['ethnicity_LLM'].notna()]
print(df_ethnicity['ethnie'].value_counts())

# First figure: number of rows per predicted ethnicity.
predicted_ethnicity_counts = df_ethnicity['ethnicity_LLM'].value_counts()
predicted_ethnicity_counts.plot(kind='bar')
plt.title(f'Number of lines per ethnicity for "{modele_str}"')
plt.xlabel('Predicted Ethnicity (LLM)')
plt.ylabel('Number of lines')
plt.savefig(f'{PATH_output_gender}ethnicity_LLM_{modele_str}.png')
plt.show()

# Second figure: predicted ethnicity stacked within each reference ethnicity.
ethnie_llm_counts = (
    df_ethnicity.groupby('ethnie')['ethnicity_LLM']
    .value_counts()
    .unstack(fill_value=0)
)
fig, ax = plt.subplots(figsize=(10, 6))
ethnie_llm_counts.plot(kind='bar', stacked=True, ax=ax)
ax.set(
    title=f'Distribution of ethnicity_LLM by ethnicity for "{modele_str}"',
    xlabel='Actual Ethnicity',
    ylabel='Number of lines',
)
ax.tick_params(axis='x', rotation=45)
ax.legend(title='Predicted Ethnicity (LLM)', loc='upper right')

fig.tight_layout()
fig.savefig(f'{PATH_output_gender}stacked_bar_chart_ethnicity_{modele_str}.png')
plt.show()

In [95]:
from scipy.stats import chi2_contingency

# Rows having both a reference ethnicity and a predicted ethnicity.
df_cleaned_contingency_ethnicity = df.dropna(subset=['ethnie', 'ethnicity_LLM']).copy()

# Reference ethnicity (rows) x predicted ethnicity (columns), then the independence test.
contingency_table_ethnicity = pd.crosstab(
    df_cleaned_contingency_ethnicity['ethnie'],
    df_cleaned_contingency_ethnicity['ethnicity_LLM'],
)
chi2_ethnicity, p_ethnicity, dof_ethnicity, expected_ethnicity = chi2_contingency(contingency_table_ethnicity)

# Render the three test statistics as a small table figure.
fig, ax = plt.subplots(figsize=(6, 2))
ax.axis('off')
table_data_ethnicity = [
    ["Chi-squared", f"{chi2_ethnicity:.3f}"],
    ["Degrees of Freedom", dof_ethnicity],
    ["P-value", f"{p_ethnicity:.4e}"],
]
table_ethnicity = ax.table(
    cellText=table_data_ethnicity,
    colLabels=["Stat", "Value"],
    cellLoc="center",
    loc="center",
)
table_ethnicity.auto_set_font_size(False)
table_ethnicity.set_fontsize(12)
table_ethnicity.scale(1.5, 1.5)

ax.set_title("Chi-squared independence test results for ethnie and ethnicity_LLM")
fig.tight_layout()
fig.savefig(f'{PATH_output_gender}chi-squared_independence_test_ethnicity_{modele_str}.png')
plt.show()

In [96]:
# Row-normalise the contingency table: each cell becomes the percentage of its row total.
contingency_table_percentage_ethnicity = contingency_table_ethnicity.div(contingency_table_ethnicity.sum(axis=1), axis=0) * 100
# Cell labels such as "12.34%". Built column by column with Series.map because
# DataFrame.applymap is deprecated in recent pandas releases.
annot_ethnicity = contingency_table_percentage_ethnicity.apply(lambda column: column.map(lambda value: f"{value:.2f}%"))

plt.figure(figsize=(10, 7))
# fmt="" because the annotations are already formatted strings.
sns.heatmap(contingency_table_percentage_ethnicity, annot=annot_ethnicity, fmt="", cmap="Reds")

plt.title('Heatmap of LLM Ethnicity Prediction Percentage by Actual Ethnicity for "' + modele_str + '"')
plt.xlabel('Predicted Ethnicity (LLM)')
plt.ylabel('Actual Ethnicity')
plt.tight_layout()
plt.savefig(PATH_output_gender + 'heatmap_percentage_ethnicity_' + modele_str + '.png')
plt.show()


In [97]:
import matplotlib.pyplot as plt
df = pd.ExcelFile(PATH_input_gender).parse("Sheet1")

# Only rows where both LLM predictions are present.
df_filtered = df.dropna(subset=['ethnicity_LLM', 'genre_LLM'])

# Counts of predicted gender within each predicted ethnicity.
ethnicity_genre_llm_counts = (
    df_filtered.groupby('ethnicity_LLM')['genre_LLM']
    .value_counts()
    .unstack(fill_value=0)
)

# Stacked bar chart of those counts.
fig, ax = plt.subplots(figsize=(12, 7))
ethnicity_genre_llm_counts.plot(kind='bar', stacked=True, ax=ax)
ax.set(
    title=f'Distribution of genre_LLM by ethnicity_LLM for "{modele_str}"',
    xlabel='Predicted Ethnicity (LLM)',
    ylabel='Number of lines',
)
ax.tick_params(axis='x', rotation=45)
ax.legend(title='Predicted Gender (LLM)', loc='upper right')

fig.tight_layout()
fig.savefig(f'{PATH_output_gender}stacked_bar_chart_ethnicity_gender_{modele_str}.png')
plt.show()

# Topic Modelling

In [98]:
# Outputs of the topic-modelling section go to a "topic_modelling" sub-folder of the model folder.
PATH_output_gender = f"{PATH_output_gender_temp}topic_modelling/"
os.makedirs(PATH_output_gender, exist_ok=True)

In [99]:
#!pip install bertopic
#!pip install umap-learn
#!pip install -U kaleido
from bertopic import BERTopic
import shutil

In [100]:
# Reload the labelled workbook (read_excel closes the file once it has been read).
df = pd.read_excel(PATH_input_gender, sheet_name="Sheet1")

# One output folder per (label column, text column) combination.
PATH_output_gender = PATH_output_gender_temp + 'topic_modelling/' + target_labels + '_and_' + target_text + '/'
# Delete the folder if it already exists so that results of a previous run are not mixed in.
if os.path.exists(PATH_output_gender):
    shutil.rmtree(PATH_output_gender)
os.makedirs(PATH_output_gender, exist_ok=True)

# Preview of the loaded frame; kept as the last expression so the notebook renders it.
df.head()

In [101]:
import nltk
import os
import re
from nltk.corpus import stopwords

# Project-local folder for the NLTK data; "~" is expanded to the user's home directory.
nltk_data_path = os.path.expanduser("~/Projet DEBIAR/nltk_data")
# Add that folder to NLTK's search path before the stopword corpus is loaded on the next line.
nltk.data.path.append(nltk_data_path)
# English stopwords as a set; clean_text tests token membership against it.
stop_words = set(stopwords.words("english"))

def clean_text(text, stopword_set=None):
    """Lowercase ``text``, strip non-letters and remove stopwords.

    Every run of characters other than ``a-z`` and whitespace is replaced by a
    single space (not deleted), so tokens separated only by punctuation, such
    as comma-separated keywords, stay separate, and contraction remnants
    ("don't" -> "don", "t") can be caught by the stopword filter.

    Parameters
    ----------
    text : str
        Text to clean.
    stopword_set : collection of str, optional
        Words to drop. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Lowercase
    text = text.lower()
    # Replace non-alphabetic characters with a space
    text = re.sub(r"[^a-z\s]+", " ", text)
    # Remove the stopwords
    tokens = [word for word in text.split() if word not in stopword_set]
    return " ".join(tokens)

# Clean the selected text column. Missing cells become empty strings first:
# astype(str) alone would turn them into the literal document "nan".
df[target_text] = df[target_text].fillna("").astype(str).apply(clean_text)

In [102]:
documents = df[target_text].tolist()
print(len(documents))

# Distinct class labels, without missing values.
labels = [label for label in df[target_labels].unique().tolist() if str(label) != 'nan']

# Integer target per document for the (semi-)supervised fit; -1 marks a document without label.
label_to_id = {label: position for position, label in enumerate(labels)}
y = [label_to_id.get(label, -1) for label in df[target_labels].tolist()]

topic_model = BERTopic(language="english")

# Unsupervised alternative: topic_model.fit_transform(documents)
topics, probs = topic_model.fit_transform(documents, y=y)

# Merge topics down to one per label, plus one.
topic_model.reduce_topics(documents, nr_topics=len(labels)+1)
topics, probs = topic_model.transform(documents)

# Computed AFTER the reduction: reduce_topics renumbers the topics, so a table
# built before it would carry topic ids that no longer exist in the model used
# by the visualisation and word-cloud cells.
topics_per_class = topic_model.topics_per_class(documents, classes=df[target_labels].tolist())

In [None]:
# Topic visualisations: each figure is built, exported as a standalone HTML file, then displayed.
# NOTE(review): the 'documents_and' prefix has no trailing underscore, so that file
# name reads "documents_and<label column>_..." — kept as is.
figure_exports = [
    (lambda: topic_model.visualize_documents(documents), 'documents_and'),
    (lambda: topic_model.visualize_topics_per_class(topics_per_class), 'topics_per_'),
    (topic_model.visualize_barchart, 'barchart_'),
    (topic_model.visualize_heatmap, 'heatmap_'),
    (topic_model.visualize_hierarchy, 'hierarchy_'),
]
for build_figure, file_prefix in figure_exports:
    fig = build_figure()
    fig.write_html(PATH_output_gender + file_prefix + target_labels + '_' + modele_str + '.html')
    fig.show()

In [104]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Total frequency of every (topic, class) pair.
df_tpc = topics_per_class.groupby(["Topic", "Class"], as_index=False)["Frequency"].sum()

# Frequency table: one row per topic, one column per class, 0 where a pair is absent.
topic_label_freq = df_tpc.pivot(index="Topic", columns="Class", values="Frequency").fillna(0)

# Dominant class of each topic and its share of the topic's total frequency.
class_columns = [
    col for col in topic_label_freq.columns
    if col not in ("dominant_label", "dominant_score")
]
class_frequencies = topic_label_freq[class_columns]
topic_label_freq["dominant_label"] = class_frequencies.idxmax(axis=1)
topic_label_freq["dominant_score"] = class_frequencies.max(axis=1) / class_frequencies.sum(axis=1)

# Mapping: label -> list of the topics it dominates.
dLabelTopic = {}
for dominant in topic_label_freq["dominant_label"].unique():
    is_dominant = topic_label_freq["dominant_label"] == dominant
    dLabelTopic[dominant] = topic_label_freq.loc[is_dominant].index.tolist()

In [105]:
def plot_wordcloud(topic_model, topic_id, title=None):
    """Draw, save and show a word cloud of one topic's words.

    ``topic_model.get_topic(topic_id)` is expected to give (word, weight)
    pairs; when it gives a boolean or nothing, a warning is printed and no
    figure is produced. The image is saved under ``PATH_output_gender`` with a
    name built from ``title`` (spaces replaced by underscores) or, without a
    title, from ``topic_id``.
    """
    topic_words = topic_model.get_topic(topic_id)
    if isinstance(topic_words, bool) or not topic_words:
        print(f"Warning: No words found for topic {topic_id}")
        return

    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud = cloud.generate_from_frequencies(dict(topic_words))

    heading = title or f"Topic {topic_id}"
    file_stem = title.replace(" ", "_") if title else str(topic_id)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(heading)
    plt.savefig(f"{PATH_output_gender}wordcloud_topic_{file_stem}_{modele_str}.png")
    plt.show()

# One word cloud per topic, grouped under the label that dominates it.
for label, dominated_topics in dLabelTopic.items():
    print(f"Label: {label}")
    for topic_id in dominated_topics:
        plot_wordcloud(topic_model, topic_id, title=f"Topic {topic_id} for {label.replace('/','')}")