<a href="https://colab.research.google.com/github/OdysseusPolymetis/enssib_class/blob/main/5_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install stanza

In [None]:
import os
import stanza
from lxml import etree as ET
import lxml.html
import string
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
from transformers import pipeline

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the French sentiment classifier on that same device. Passing `device`
# directly keeps a single CUDA check instead of repeating it inline.
sentiment_pipe = pipeline(
    "text-classification",
    model="ac0hik/Sentiment_Analysis_French",
    device=device,
)

In [None]:
!wget https://raw.githubusercontent.com/ebalzac/FC/refs/heads/gh-pages/balzac-22-FC-pere-goriot.xml

In [None]:
def strip_ns_prefix(tree):
    """Rename every namespaced element of ``tree`` to its local name, in place.

    The XPath query selects the context node and all of its descendant
    elements whose namespace URI is non-empty; the tag of each selected
    element is replaced by the local part of its qualified name.

    Args:
        tree: Object exposing ``xpath()``; the caller in this notebook passes
            the result of ``ET.parse``.

    Returns:
        The same ``tree`` object, after modification.
    """
    namespaced_nodes = tree.xpath("descendant-or-self::*[namespace-uri()!='']")
    for node in namespaced_nodes:
        qualified_name = ET.QName(node)
        node.tag = qualified_name.localname
    return tree

In [None]:
# Relative path: wget saves the file into the current working directory. That
# directory is /content on Colab, but the notebook now also runs elsewhere.
filepath_of_text = "balzac-22-FC-pere-goriot.xml"

In [None]:
# French Stanza pipeline restricted to the 'tokenize' and 'mwt' processors;
# the parsing cell below reads its `.sentences` to split paragraphs.
nlp_stanza = stanza.Pipeline(
    processors='tokenize,mwt',
    lang='fr',
)

In [None]:
# Parse the XML file (blank text removed, entities left unresolved) and strip
# namespace prefixes so elements can be addressed by their bare tag name.
parser = ET.XMLParser(
    encoding='utf8',
    resolve_entities=False,
    remove_blank_text=True,
)
tree = strip_ns_prefix(ET.parse(filepath_of_text, parser))

# All <p> elements found below the root.
ps = tree.xpath(".//p")

# paragraphs[k] is the list of sentence strings Stanza finds in the text of
# the k-th <p>; text inside nested elements is included via itertext().
paragraphs = []
for p_element in ps:
  paragraph_text = "".join(p_element.itertext())
  stanza_doc = nlp_stanza(paragraph_text)
  paragraphs.append([sent.text for sent in stanza_doc.sentences])

In [None]:
# Number of <p> paragraphs collected from the document.
len(paragraphs)

In [None]:
# Sentence counts that delimit "long" and "short" paragraphs, and the weight
# of the confidence-based adjustment applied to them.
LONG_PARAGRAPH_THRESHOLD = 4    # long: strictly more sentences than this
SHORT_PARAGRAPH_THRESHOLD = 2   # short: at most this many sentences
CONFIDENCE_WEIGHT = 0.2


def _sentence_score(label, confidence):
    """Turn one classifier prediction into a signed sentiment score.

    Args:
        label: Predicted class name. It is compared case-insensitively with
            'negative' and 'positive'; any other value counts as neutral.
        confidence: Score the classifier attached to that label.

    Returns:
        ``(confidence - 0.5) * 2`` for a positive label, the opposite of that
        value for a negative label, and 0 for any other label.
    """
    polarity = str(label).lower()
    magnitude = (confidence - 0.5) * 2
    if polarity == 'negative':
        return -magnitude
    if polarity == 'positive':
        return magnitude
    return 0


def _paragraph_score(sentence_scores, confidence_scores):
    """Combine per-sentence results into a single score for one paragraph.

    Args:
        sentence_scores: Signed scores, one per sentence.
        confidence_scores: Classifier confidences, one per sentence.

    Returns:
        The mean sentence score, rescaled according to the number of sentences
        and the mean confidence. A paragraph without sentences scores 0.0.
    """
    if not sentence_scores:
        # np.mean([]) is NaN, and a single NaN would blank out every moving
        # average window that contains it. Treat the paragraph as neutral so
        # that the list stays index-aligned with `paragraphs`.
        return 0.0

    avg_score = np.mean(sentence_scores)
    avg_confidence = np.mean(confidence_scores)
    n_sentences = len(sentence_scores)

    if n_sentences > LONG_PARAGRAPH_THRESHOLD:
        # Long paragraph: factor in [0.9, 1.1] for a confidence in [0, 1].
        return avg_score * (1 + CONFIDENCE_WEIGHT * (avg_confidence - 0.5))
    if n_sentences <= SHORT_PARAGRAPH_THRESHOLD:
        # Short paragraph: factor in [0.8, 1.0], i.e. the score is damped.
        return avg_score * (1 - CONFIDENCE_WEIGHT * (1 - avg_confidence))
    # Medium-sized paragraph: the mean is kept unchanged.
    return avg_score


# paragraph_sentiments[k] is the adjusted score of paragraphs[k].
paragraph_sentiments = []
for paragraph in paragraphs:
    sentence_scores = []
    confidence_scores = []

    for sentence in paragraph:
        # truncation=True asks the tokenizer to shorten an over-long sentence
        # to the model's maximum input length instead of passing it whole.
        sentiment = sentiment_pipe(sentence, truncation=True)[0]
        confidence_scores.append(sentiment['score'])
        sentence_scores.append(_sentence_score(sentiment['label'], sentiment['score']))

    paragraph_sentiments.append(_paragraph_score(sentence_scores, confidence_scores))

In [None]:
# Spot-check a few paragraphs: print each one's text followed by its score.
indices_to_print = [30, 240, 930]
for index in indices_to_print:
    # Indices past the end of the document are skipped without any message.
    if not index < len(paragraphs):
        continue

    joined_sentences = " ".join(paragraphs[index])

    print(f"\nParagraphe {index}:")
    print(joined_sentences)
    print(f"Score moyen de sentiment: {paragraph_sentiments[index]}")

In [None]:
window_size = 50

# Moving average over `window_size` paragraphs. The window is clamped to the
# number of paragraphs: with fewer points than the window, np.convolve in
# 'valid' mode returns an array whose length no longer matches the x positions
# used for the trend line, and an empty input raises an error.
scores = np.asarray(paragraph_sentiments, dtype=float)
effective_window = max(1, min(window_size, len(scores)))
if len(scores) > 0:
    smoothed_sentiments = np.convolve(scores, np.ones(effective_window), 'valid') / effective_window
else:
    smoothed_sentiments = np.array([])

# Each averaged value is drawn at the index of the last paragraph of its window.
trend_positions = range(effective_window - 1, len(scores))

plt.figure(figsize=(12, 6))

# Raw per-paragraph scores in the background, smoothed trend on top.
plt.scatter(range(len(scores)), scores, alpha=0.3, color='lightgray', label='Paragraph Sentiment Scores')

plt.plot(trend_positions, smoothed_sentiments, color='blue', label='Smoothed Sentiment Trend')

plt.ylim(-1, 1)

plt.xlabel('Paragraph Number')
plt.ylabel('Average Sentiment Score')
plt.title('Evolution of Average Sentiment per Paragraph')
plt.legend()

# Label every 10th point with its paragraph index so it can be looked up later.
for i, sentiment in enumerate(paragraph_sentiments):
    if i % 10 == 0:
        plt.annotate(str(i), (i, sentiment), textcoords="offset points", xytext=(0, 5), ha='center')

plt.show()

In [None]:
def display_paragraphs(indices):
  """Print the requested paragraphs, one sentence per line.

  Reads the notebook-level ``paragraphs`` list. For each index, a
  ``Paragraph <i>:`` header is printed followed by the paragraph's sentences;
  an index outside ``0 <= i < len(paragraphs)`` produces an out-of-bounds
  message instead.

  Args:
      indices: Iterable of paragraph indices to show.
  """
  for i in indices:
    # Guard clause: report invalid indices and move on to the next one.
    if not (0 <= i < len(paragraphs)):
      print(f"Index {i} is out of bounds.")
      continue
    print(f"Paragraph {i}:")
    for line in paragraphs[i]:
      print(line)

# Print paragraph 570, one sentence per line.
display_paragraphs([570])

In [None]:
# Print paragraph 1370, one sentence per line.
display_paragraphs([1370])

In [None]:
# Print paragraph 320, one sentence per line.
display_paragraphs([320])

In [None]:
# Print paragraph 230, one sentence per line.
display_paragraphs([230])

In [None]:
# Print paragraph 840, one sentence per line.
display_paragraphs([840])

In [None]:
# Print paragraph 1250, one sentence per line.
display_paragraphs([1250])

In [None]:
# Print paragraph 1260, one sentence per line.
display_paragraphs([1260])

In [None]:
# Print paragraph 1557, one sentence per line.
display_paragraphs([1557])