In [None]:
from openai import OpenAI
import pandas as pd
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------
# Config
# ------------------------
client = OpenAI(api_key="")  # ⚠️ Remplace par ta clé
df = pd.read_csv("comments_labeled.csv")

In [None]:
for col in ["sentiment_label", "purchase_intent", "themes", "tone", "intent_reason", "keywords", "clean_comment"]:
    df[col] = ""

# ------------------------
# Fonction GPT par batch
# ------------------------
def analyze_comments_batch(batch_comments):
    prompt = f"""
    Tu es un expert en analyse de sentiment marketing.
    Analyse les commentaires YouTube suivants, renvoie uniquement un JSON de la forme :
    [
        {{
            "sentiment_label": "positif" | "neutre" | "negatif",
            "purchase_intent": 0 | 1 | 2,
            "themes": ["price", "quality", "taste", "brand", "portion", "service", "promotion", "desire", "disappointment"],
            "tone": "humorous" | "enthusiastic" | "angry" | "sarcastic" | "neutral",
            "intent_reason": "phrase courte expliquant pourquoi ce score d'achat",
            "keywords": ["mot1", "mot2", "mot3"],
            "clean_comment": "commentaire nettoyé"
        }}
    ]

    Les commentaires :
    {batch_comments}
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        raw = response.choices[0].message.content.strip()
        start = raw.find("[")
        end = raw.rfind("]") + 1
        json_text = raw[start:end]
        return json.loads(json_text)

    except Exception as e:
        print(f"⚠️ Erreur : {e}")
        return [{
            "sentiment_label": "neutre",
            "purchase_intent": 0,
            "themes": [],
            "tone": "neutral",
            "intent_reason": "",
            "keywords": [],
            "clean_comment": comment
        } for comment in batch_comments]

# ------------------------
# Traitement parallèle
# ------------------------
batch_size = 50  # Grosse taille pour réduire le nombre de requêtes
batches = [df["comment"].iloc[i:i+batch_size].tolist() for i in range(0, len(df), batch_size)]

results = [None] * len(batches)

with ThreadPoolExecutor(max_workers=5) as executor:  # 5 threads simultanés
    futures = {executor.submit(analyze_comments_batch, batch): idx for idx, batch in enumerate(batches)}
    for future in tqdm(as_completed(futures), total=len(futures)):
        idx = futures[future]
        results[idx] = future.result()

# ------------------------
# Remplir le DataFrame
# ------------------------
for i, batch_result in enumerate(results):
    start_idx = i * batch_size
    for j, data in enumerate(batch_result):
        for key, value in data.items():
            if isinstance(value, list):
                value = ", ".join(map(str, value))
            df.loc[start_idx + j, key] = value

# ------------------------



 22%|██▏       | 50/232 [16:27<39:26, 13.00s/it]

⚠️ Erreur : Expecting ',' delimiter: line 143 column 26 (char 4719)


 36%|███▌      | 84/232 [27:19<1:06:24, 26.92s/it]

⚠️ Erreur : Expecting ',' delimiter: line 14 column 21 (char 368)


 55%|█████▍    | 127/232 [42:03<31:01, 17.73s/it]

⚠️ Erreur : Expecting ',' delimiter: line 332 column 32 (char 15376)


 61%|██████    | 141/232 [46:12<21:28, 14.16s/it]

⚠️ Erreur : Expecting value: line 216 column 26 (char 7149)


 64%|██████▍   | 148/232 [48:32<24:24, 17.43s/it]

⚠️ Erreur : Expecting value: line 414 column 26 (char 14199)


 66%|██████▌   | 152/232 [50:39<27:16, 20.45s/it]

⚠️ Erreur : Expecting ',' delimiter: line 1679 column 21 (char 66972)


 66%|██████▋   | 154/232 [52:00<36:51, 28.36s/it]

⚠️ Erreur : Expecting ',' delimiter: line 1652 column 28 (char 66715)


 89%|████████▉ | 207/232 [1:11:46<07:24, 17.76s/it]

⚠️ Erreur : Expecting ',' delimiter: line 8 column 32 (char 226)


100%|██████████| 232/232 [1:19:33<00:00, 20.58s/it]


In [19]:
# Sauvegarde
# ------------------------
df.to_excel("comments_analyzed.xlsx", index=False)



In [None]:
# Purchase intent par video
intent_by_video = df.groupby(["video_id", "purchase_intent"]).size().reset_index(name="Number_of_comments")

# Calcul du pourcentage par video_id (avec transform)
intent_by_video["Percentage"] = intent_by_video.groupby("video_id")["Number_of_comments"].transform(lambda x: x / x.sum() * 100)

intent_by_video.rename(columns={"purchase_intent": "Purchase_intent_score"}, inplace=True)


# Top thèmes par video
themes_exploded = df[["video_id", "themes"]].dropna()
themes_exploded = themes_exploded.assign(theme=themes_exploded["themes"].str.split(", ")).explode("theme")

themes_by_video = themes_exploded.groupby(["video_id", "theme"]).size().reset_index(name="Number_of_mentions")

# Correction ici aussi
themes_by_video["Percentage"] = themes_by_video.groupby("video_id")["Number_of_mentions"].transform(lambda x: x / x.sum() * 100)

themes_by_video.rename(columns={"theme": "Most_mentioned_theme"}, inplace=True)


In [None]:
themes_by_video

Unnamed: 0,video_id,Most_mentioned_theme,Number_of_mentions,Percentage
0,9cPxh2DikIA,,7252,60.650665
1,9cPxh2DikIA,advertisement,8,0.066906
2,9cPxh2DikIA,advertising,4,0.033453
3,9cPxh2DikIA,anxiety,1,0.008363
4,9cPxh2DikIA,brand,1275,10.663210
...,...,...,...,...
75,guS6wULNixE,humor,25,27.777778
76,guS6wULNixE,price,2,2.222222
77,guS6wULNixE,quality,5,5.555556
78,guS6wULNixE,service,3,3.333333


In [None]:
intent_by_video

Unnamed: 0,video_id,Purchase_intent_score,Number_of_comments,Percentage
0,9cPxh2DikIA,0.0,5774,52.846421
1,9cPxh2DikIA,1.0,2284,20.904265
2,9cPxh2DikIA,2.0,413,3.779974
3,9cPxh2DikIA,,2455,22.469339
4,B0O_LeuKTJU,0.0,112,35.33123
5,B0O_LeuKTJU,1.0,135,42.586751
6,B0O_LeuKTJU,2.0,2,0.630915
7,B0O_LeuKTJU,,68,21.451104
8,IK0i48ffjnY,0.0,49,38.888889
9,IK0i48ffjnY,1.0,34,26.984127


In [None]:
# Sauvegarde du résumé Purchase Intent par vidéo
intent_by_video.to_excel("purchase_intent_summary_by_video.xlsx", index=False)

# Sauvegarde du résumé Top Thèmes par vidéo
themes_by_video.to_excel("top_themes_summary_by_video.xlsx", index=False)
