In [2]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Fetch the NLTK resources this notebook relies on: the "punkt" tokenizer models
# (used by word_tokenize), the stop-word lists (stopwords.words) and the VADER
# lexicon (SentimentIntensityAnalyzer).
# NOTE(review): recent NLTK releases may additionally require the "punkt_tab"
# resource for word_tokenize — confirm against the installed version.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# Analyse de sentiments

In [3]:
# Read the wide review table and reshape it to long format: every column of the
# parquet file is melted, so each row ends up holding the source column name in
# `titre` and one cell value in `avis`.
# NOTE(review): presumably one column per film and one cell per review — confirm
# against the parquet schema.
df = pd.read_parquet(
    "../data/part-00000-f8a4960b-1ec0-462e-8964-6eba169347e8-c000.snappy.parquet"
).pipe(
    lambda wide: wide.melt(value_vars=wide.columns, var_name='titre', value_name='avis')
)

In [4]:
# Single Porter stemmer instance shared by every call to stem_word.
ps = PorterStemmer()


def stem_word(word):
    """Return the Porter stem of ``word`` using the shared ``ps`` stemmer."""
    stemmed = ps.stem(word)
    return stemmed

def stem_column(column):
    """Stem every token of every text held in ``column``.

    Each value is tokenized with ``nltk.word_tokenize``, every token is passed
    through ``stem_word``, and the stems are joined back with single spaces.

    Args:
        column: object exposing ``.apply`` (used here with a pandas Series of
            strings).

    Returns:
        Whatever ``column.apply`` returns: the element-wise stemmed texts.
    """
    def _stem_text(text):
        stems = []
        for token in nltk.word_tokenize(text):
            stems.append(stem_word(token))
        return ' '.join(stems)

    return column.apply(_stem_text)

# Built once when the cell runs. The original rebuilt this set from the NLTK
# corpus inside the function, i.e. once per review, which is loop-invariant work.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))


def remove_stop_words(text):
    """Return ``text`` with English stop words removed.

    The text is tokenized with ``word_tokenize``; tokens whose lowercase form is
    in NLTK's English stop-word list are dropped and the remaining tokens are
    joined with single spaces (original casing of kept tokens is preserved).

    Args:
        text: the string to filter.

    Returns:
        The filtered text as a single space-separated string.
    """
    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in _ENGLISH_STOP_WORDS
    ]
    return ' '.join(kept_tokens)

In [5]:
# Two-stage cleaning of the raw reviews: strip English stop words first, then
# stem what is left. Both intermediate results are kept as columns.
df["avis_stopwords"] = df["avis"].map(remove_stop_words)
df["avis_preprocess"] = df["avis_stopwords"].pipe(stem_column)

In [6]:
# Hugging Face checkpoint used for the transformer-based sentiment labels.
MODEL_NAME = "wang3820/movies"

TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
# from_tf=True: the model weights are loaded from the checkpoint's TensorFlow
# file (tf_model.h5) and converted.
MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, from_tf=True)

# Ready-to-call classifier: sentiment_task(text) -> [{"label": ..., "score": ...}]
sentiment_task = pipeline(task="sentiment-analysis", tokenizer=TOKENIZER, model=MODEL)

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/640 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/268M [00:00<?, ?B/s]

All TF 2.0 model weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


In [7]:
def get_sentiment(text, max_length=512):
    """Return the label predicted by the Hugging Face pipeline for ``text``.

    The input is truncated by the tokenizer to ``max_length`` *tokens*. The
    previous implementation sliced the string to 512 *characters*, which throws
    away most of a long review even though the model accepts far more text.

    Note: a later cell of this notebook defines another ``get_sentiment``
    (VADER-based) that shadows this one once it has run.

    Args:
        text: the review to classify.
        max_length: maximum number of tokens passed to the model (default 512).

    Returns:
        The ``"label"`` field of the first pipeline result.
    """
    result = sentiment_task(text, truncation=True, max_length=max_length)[0]
    return result["label"]

# Label every raw review with the transformer pipeline (one call per row).
df['sentiment_HF'] = df['avis'].map(get_sentiment)

In [8]:
vader = SentimentIntensityAnalyzer()

# Cut-offs applied to VADER's compound score: at or above the positive threshold
# is 'positive', at or below the negative threshold is 'negative', anything in
# between is 'neutral'.
VADER_POSITIVE_THRESHOLD = 0.05
VADER_NEGATIVE_THRESHOLD = -0.05


def get_sentiment_score(text):
    """Return VADER's compound polarity score for ``text``."""
    return vader.polarity_scores(text)['compound']


def _label_from_score(sentiment_score):
    """Map a compound score to 'positive', 'negative' or 'neutral'."""
    if sentiment_score >= VADER_POSITIVE_THRESHOLD:
        return 'positive'
    if sentiment_score <= VADER_NEGATIVE_THRESHOLD:
        return 'negative'
    return 'neutral'


def get_sentiment(text):
    """Return the VADER sentiment label ('positive'/'negative'/'neutral') of ``text``.

    Note: this redefines the ``get_sentiment`` of the previous (Hugging Face)
    cell; after this cell runs the name refers to the VADER version.
    """
    return _label_from_score(get_sentiment_score(text))


# Score the RAW reviews rather than `avis_preprocess`: VADER is a lexicon/rule
# based analyser that relies on negations, punctuation and exact word forms.
# Stop-word removal strips negations such as "not"/"no", and stemming produces
# forms that are absent from the lexicon, so preprocessed text distorts scores.
# The score is computed once per review and the label is derived from it.
vader_scores = df['avis'].apply(get_sentiment_score)
df['sentiment_VADER'] = vader_scores.apply(_label_from_score)
df['sentiment_score_VADER'] = vader_scores

In [9]:
# One TF-IDF vectorizer per sentiment class, each ignoring NLTK's English stop
# words. tfidf_neutral is created here but is not fitted in this cell.
tfidf_positive, tfidf_negative, tfidf_neutral = (
    TfidfVectorizer(stop_words=stopwords.words("english")) for _ in range(3)
)

# Split the reviews by their VADER label.
positive_reviews = df.loc[df['sentiment_VADER'].eq('positive')]
negative_reviews = df.loc[df['sentiment_VADER'].eq('negative')]

# Fit on the raw review text; keep the vocabulary and the per-term IDF weights.
tfidf_positive_matrix = tfidf_positive.fit_transform(positive_reviews['avis'])
positive_feature_names = tfidf_positive.get_feature_names_out()
positive_weights = tfidf_positive.idf_

tfidf_negative_matrix = tfidf_negative.fit_transform(negative_reviews['avis'])
negative_feature_names = tfidf_negative.get_feature_names_out()
negative_weights = tfidf_negative.idf_

In [10]:
# Persist the positive / negative subsets (row index included).
# NOTE(review): "psitive" looks like a typo for "positive"; the file name is
# left unchanged because other code may read this exact path.
positive_reviews.to_csv(path_or_buf="../data/psitive_reviews.csv")
negative_reviews.to_csv(path_or_buf="../data/negative_reviews.csv")

In [11]:
def percentage_sentiment_by_film(df, sentiment_model, titre, sentiment):
    """Summarise, per film, the share of reviews with a given sentiment and a mean note.

    Args:
        df: DataFrame holding one review per row. Must contain the columns named
            by ``sentiment_model`` and ``titre`` plus ``"sentiment_score_VADER"``.
        sentiment_model: name of the column holding the sentiment labels.
        titre: name of the column identifying the film.
        sentiment: label value to count (e.g. ``"positive"``).

    Returns:
        DataFrame with one row per film and the columns ``Film``,
        ``pourcentages_<sentiment>`` (percentage rounded to 2 decimals; 0.0 for
        films with no review of that sentiment) and ``notes`` (mean of the
        per-review note).

    Side effects:
        Adds/overwrites a ``note`` column on ``df``: the VADER compound score
        rescaled with ``round((score + 1) * 10)``. Kept from the original
        implementation so existing callers still find that column.
    """
    total_reviews_by_movie = df.groupby(titre).size()

    # Films without any review of the requested sentiment are absent from the
    # filtered group-by; reindex so they count as 0 instead of becoming NaN.
    matching_reviews_by_movie = (
        df[df[sentiment_model] == sentiment]
        .groupby(titre)
        .size()
        .reindex(total_reviews_by_movie.index, fill_value=0)
    )
    percentage_by_movie = (matching_reviews_by_movie / total_reviews_by_movie * 100).round(2)

    df['note'] = ((df["sentiment_score_VADER"] + 1) * 10).round()
    # Align explicitly on the film index so every row describes a single film.
    mean_note_by_movie = df.groupby(titre)['note'].mean().reindex(percentage_by_movie.index)

    percentage_df = pd.DataFrame({
        'Film': percentage_by_movie.index,
        f'pourcentages_{sentiment}': percentage_by_movie.values,
        'notes': mean_note_by_movie.values,
    })
    return percentage_df

In [12]:
# Per-film share of positive VADER labels, best-rated films first.
positive = (
    percentage_sentiment_by_film(df, 'sentiment_VADER', 'titre', "positive")
    .sort_values(by='pourcentages_positive', ascending=False)
)

In [13]:
# Renumber the rows 0..n-1 after sorting, discarding the old index.
positive = positive.reset_index(drop=True)

In [14]:
# Write the per-film summary to disk without the row index.
positive.to_csv(path_or_buf="../data/notes_percentage_positive.csv", index=False)