In [2]:
import nltk
import pandas as pd  
from nltk import word_tokenize, ConditionalFreqDist
from nltk.corpus import stopwords
from nltk import pos_tag
import string
import re
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel
from collections import Counter
from sklearn.model_selection import train_test_split

In [3]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pengj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pengj\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\pengj\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pengj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\pengj\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
df=pd.read_csv('C:/Users/pengj/Desktop/NPL/projet/Movies_Reviews_modified_version1.csv')
print(df.head())

   Unnamed: 0  Ratings                                            Reviews  \
0           0      3.0  It had some laughs, but overall the motivation...   
1           1      4.0  WAITING TO EXHALE Waiting, and waiting, and wa...   
2           2      4.0  Angela Basset was good as expected, but Whitne...   
3           3      5.0  The movie is okay, mediocre might even be the ...   
4           4      5.0  I got an opportunity to see Waiting To Exhale ...   

          movie_name                                           Resenhas  \
0  Waiting to Exhale  Riu algumas risadas, mas no geral a motivação ...   
1  Waiting to Exhale  ESPERANDO PARA EXALAR Esperando, e esperando, ...   
2  Waiting to Exhale  Angela Basset foi boa como o esperado, mas Whi...   
3  Waiting to Exhale  O filme é bom, medíocre pode até ser a palavra...   
4  Waiting to Exhale  Tive a oportunidade de ver Waiting To Exhale p...   

                           genres  \
0  ['Comedy', 'Drama', 'Romance']   
1  ['Comedy'

In [5]:
review_genre_null=df[['Reviews', 'genres']].isnull().sum()
print(review_genre_null)
df= df[df['genres'].notna()]

Reviews    0
genres     0
dtype: int64


In [6]:
df_tokens=pd.DataFrame()
df_tokens['review'] = df['Reviews'].apply(lambda x: [word for word in word_tokenize(x) if word not in string.punctuation and word.strip() != ''])
df_tokens['genre'] = df['genres'].apply(lambda x: [genre.strip() for genre in x.split(',')]) 
df_tokens = df_tokens.explode('genre') 
df_tokens['genre'] = df_tokens['genre'].apply(lambda x: x.strip("'[]"))
unique_genres = df_tokens['genre'].unique().tolist()
df_tokens['note']=df['Ratings']
print(df_tokens)
print(unique_genres)

                                                  review     genre  note
0      [It, had, some, laughs, but, overall, the, mot...    Comedy   3.0
0      [It, had, some, laughs, but, overall, the, mot...     Drama   3.0
0      [It, had, some, laughs, but, overall, the, mot...   Romance   3.0
1      [WAITING, TO, EXHALE, Waiting, and, waiting, a...    Comedy   4.0
1      [WAITING, TO, EXHALE, Waiting, and, waiting, a...     Drama   4.0
...                                                  ...       ...   ...
46171  [As, thrillers, go, there, are, a, few, surpri...     Drama   5.0
46171  [As, thrillers, go, there, are, a, few, surpri...  Thriller   5.0
46172  [As, thrillers, go, there, are, a, few, surpri...    Action   5.0
46172  [As, thrillers, go, there, are, a, few, surpri...     Drama   5.0
46172  [As, thrillers, go, there, are, a, few, surpri...  Thriller   5.0

[113225 rows x 3 columns]
['Comedy', 'Drama', 'Romance', 'Action', 'Crime', 'Thriller', 'Adventure', 'Family', 'Animation',

In [7]:
#nettoyage des données 
#surprimer des la ponctuation et des éléments vides
def nettoyage_string(tokens):
    return [re.sub(r'[@#{}?§/+~.,:…]', ' ', token) for token in tokens]
def supprimer_symboles_autour(tokens):
    return [re.sub(r'^[\W_]+|[\W_]+$', ' ', token) for token in tokens]
def supprime_token_vide(tokens):
    return [token for token in tokens if token.strip() and not re.fullmatch(r'\W+', token)]

df_tokens['review'] = df_tokens['review'].apply(nettoyage_string)
df_tokens['review'] = df_tokens['review'].apply(supprimer_symboles_autour)
df_tokens['review'] = df_tokens['review'].apply(supprime_token_vide)

In [8]:
#nettoyage des données 
#supprimer des stopwords 
stop_words=set(stopwords.words('english'))
df_tokens['review']=df_tokens['review'].apply(
    lambda tokens: [token for token in tokens if token.lower() not in stop_words])
print(df_tokens['review'].head())


0    [laughs, overall, motivation, characters, inco...
0    [laughs, overall, motivation, characters, inco...
0    [laughs, overall, motivation, characters, inco...
1    [WAITING, EXHALE, Waiting, waiting, waiting, w...
1    [WAITING, EXHALE, Waiting, waiting, waiting, w...
Name: review, dtype: object


In [9]:
#Distributions conditionnelles des commentaires par rapport aux types de films 
cfd = ConditionalFreqDist(
    (str(genres), word.lower())  
    for genres, reviews in zip(df_tokens['genre'], df_tokens['review'])  
    for genre in genres  
    for word in reviews  
)
print(cfd.conditions())

['Comedy', 'Drama', 'Romance', 'Action', 'Crime', 'Thriller', 'Adventure', 'Family', 'Animation', 'Horror', 'Mystery', 'Science Fiction', 'Fantasy', 'War', 'Western', 'Music', 'History', 'Foreign', 'Documentary', 'TV Movie']


In [10]:
#Etiquetage de parties du discours
#Distribution des mots par noms, adjective et verb  etc 
df_tokens['pos_tagged'] = df_tokens['review'].apply(lambda tokens: nltk.pos_tag(tokens))
print(df_tokens['pos_tagged'].head())

0    [(laughs, NNS), (overall, JJ), (motivation, NN...
0    [(laughs, NNS), (overall, JJ), (motivation, NN...
0    [(laughs, NNS), (overall, JJ), (motivation, NN...
1    [(WAITING, NNP), (EXHALE, NNP), (Waiting, VBG)...
1    [(WAITING, NNP), (EXHALE, NNP), (Waiting, VBG)...
Name: pos_tagged, dtype: object


In [11]:
#Selectioner des mots adjectives pour chaque observation 
df_tokens['mot_adj'] = df_tokens['pos_tagged'].apply(
    lambda tagged_tokens: [word for word, pos in tagged_tokens if pos in ['JJ', 'JJR', 'JJS']]
)
print(df_tokens['mot_adj'].head())

0    [overall, incomprehensible, mad, hypocritical,...
0    [overall, incomprehensible, mad, hypocritical,...
0    [overall, incomprehensible, mad, hypocritical,...
1    [popular, popular, popular, much, easy, potent...
1    [popular, popular, popular, much, easy, potent...
Name: mot_adj, dtype: object


In [12]:
#Distributions conditionnelles des mots adjectives  par rapport aux types de films 
cfd_adj = ConditionalFreqDist(
    (genre, word.lower())  
    for genre, adj_words in zip(df_tokens['genre'], df_tokens['mot_adj'])  
    for word in adj_words  
)
print(cfd_adj.conditions())

['Comedy', 'Drama', 'Romance', 'Action', 'Crime', 'Thriller', 'Adventure', 'Family', 'Animation', 'Horror', 'Mystery', 'Science Fiction', '', 'Fantasy', 'War', 'Western', 'Music', 'History', 'Foreign', 'Documentary', 'TV Movie']


In [13]:
#Construction une dataframe pour des mots adjectives par rapport aux types de films
df_adj_frequant = pd.DataFrame(
    {genre: dict(cfd_adj[genre]) for genre in cfd_adj.conditions()}  
).fillna(0)
df_adj_frequant = df_adj_frequant.T
print(df_adj_frequant)

#Selectionner des mots adjectives très cumulatives pour les supprimer par raport aux leur std 
std_valeur=df_adj_frequant.std()
print(std_valeur)
mot_frequante=std_valeur[std_valeur<0.5].index
print(mot_frequante)


                 overall  incomprehensible    mad  hypocritical  messed  \
Comedy             824.0              14.0   84.0          17.0     5.0   
Drama             1377.0              25.0  106.0          11.0     3.0   
Romance           1131.0              16.0   94.0          14.0     3.0   
Action             313.0               8.0   19.0           1.0     1.0   
Crime              229.0              10.0    9.0           0.0     0.0   
Thriller           470.0              30.0   39.0           2.0     2.0   
Adventure          223.0              10.0   16.0           0.0     0.0   
Family             266.0               1.0   38.0           0.0     0.0   
Animation           99.0               0.0   18.0           0.0     0.0   
Horror             280.0              12.0   75.0           2.0     1.0   
Mystery            190.0              14.0    8.0           2.0     1.0   
Science Fiction    194.0              10.0   17.0           1.0     1.0   
                    46.0 

In [14]:
#Définition d'une fonction d'obtenir des dataframes des mots adjectives pour chaque type de film 
def genre_data(df, genre):
    genre_df = df[df['genre'] == genre][['mot_adj', 'note']]
    genre_df.reset_index(drop=True, inplace=True)
    list_adj = set([adj for sublist in genre_df['mot_adj'] for adj in sublist])
    return genre_df, list_adj

comedy_df, unique_adj_comedy = genre_data(df_tokens, 'Comedy')
horror_df, unique_adj_horror=genre_data(df_tokens,'Horror')
thriller_df, unique_adj_thriller=genre_data(df_tokens,'Thriller')
animation_df, unique_adj_animation=genre_data(df_tokens,'Animation')
romance_df, unique_adj_romance=genre_data(df_tokens,'Romance')


In [15]:
#Faire une bocule pour prendre toutes les données pour chaque type de film 
#Et 70% des échantilonnages pour chaque type de film sont distribuées être le training échantillonnage; 
# le reste 30% des échantillonages our chaque type de film sont distribuées être le test échantillonnage
toute_genre_data= {}
for genre in cfd_adj.conditions():
    genre_df, unique_adj = genre_data(df_tokens, genre)
    train_data, test_data = train_test_split(genre_df, test_size=0.3, random_state=42)

    toute_genre_data[genre] = {
        'train_data': train_data,
        'test_data': test_data,
        'unique_adjectives': unique_adj
    }


In [16]:
#Faire une boucle pour obtenir toutes les fréquentes des mots adjective pour chaque type de film d'échantillonage de training
toute_genre_adjective_freq = {}
for genre in cfd_adj.conditions():
    train_data = toute_genre_data[genre]['train_data']
    all_adjectives = [adj for sublist in train_data['mot_adj'] for adj in sublist]
    adj_select = [adj for adj in all_adjectives if adj not in [' s', ' m']]
    adj_frequency = Counter(adj_select)
    cent_adjectives_freq = adj_frequency.most_common(100)
    cent_adjectives= [adj[0] for adj in cent_adjectives_freq]
    cinqcent_adjectives_freq = adj_frequency.most_common(500)
    cinqcent_adjectives= [adj[0] for adj in cinqcent_adjectives_freq]

    toute_genre_adjective_freq[genre] = {
        'frequency': adj_frequency,
        'adjectives': adj_select,
        'cent_adj': cent_adjectives,
        'cinqcent_adj':cinqcent_adjectives
    }

In [17]:
#Définition d'une fonction de sentiment des mots adjectives 
def sentiment_point(word, pos):
    if pos == 'a':  
        synsets = list(swn.senti_synsets(word, 'a'))
    else:
        return None  
    if synsets:
        sentiment = synsets[0]  
        positive = sentiment.pos_score()  
        negative = sentiment.neg_score()  
        objective = sentiment.obj_score() 
        total_score = positive - negative  
        return total_score
    return 0

In [18]:
#Calculer des nombres des mots, des nombres de mot négatives et postives  par chaque comentaires 
def calculate_sentiments(row):
    words = row['mot_adj']
    word_count = len(words)
    positive_count = 0
    negative_count = 0
    
    for word in words:
        score = sentiment_point(word, 'a')
        if score > 0:
            positive_count += 1
        elif score < 0:
            negative_count += 1
    
    positive_ratio = positive_count / word_count if word_count > 0 else 0
    negative_ratio = negative_count / word_count if word_count > 0 else 0
    
    return pd.Series({
        'nombre_mot': word_count,
        'nombre_positive': positive_count,
        'nombre_negative': negative_count,
        'positive_ratio': positive_ratio,
        'negative_ratio': negative_ratio
    })

In [19]:
#Calculation des points de sentiments totales pour chaque observation à l'échantilonnage 
def calculate_total_sentiment(adjectives, unique_adjectives):
    total_score = 0  
    for adj in adjectives:
        if adj in unique_adjectives: 
            total_score += sentiment_point(adj, 'a')  
    return total_score

In [20]:
#Transformer des notes à la variable binomale: 0 et 1 par rapport sa valeur de median
def binarize_by_median(df, note, median_value):
    binary_column = df[note].apply(lambda x: 1 if x >= median_value else 0)
    return binary_column


In [21]:
#Définition d'une fonction pour obtenir des points de sentiments pour chaque observtion
def create_adj_sentiment_matrix(train_df, cent_adj):
    sentiment_matrix = pd.DataFrame(0.0, index=train_df.index, columns=cent_adj)  

    for index, row in train_df.iterrows():
        adjectives = row['mot_adj']
        for adj in adjectives:
            if adj in cent_adj:
                sentiment_score = sentiment_point(adj, 'a')  
                sentiment_matrix.at[index, adj] = sentiment_score  
    return sentiment_matrix

In [22]:
comedy_train_df = toute_genre_data['Comedy']['train_data']
comedy_unique_adjectives = toute_genre_data['Comedy']['unique_adjectives']
comedy_cent_adj = toute_genre_adjective_freq.get('Comedy', {}).get('cent_adj', [])
comedy_cinqcent_adj = toute_genre_adjective_freq.get('Comedy', {}).get('cinqcent_adj', [])
print(comedy_train_df)
print(comedy_cent_adj)
print(comedy_cinqcent_adj)

                                                 mot_adj  note
10806  [better, ensemble, short, full-length, god-awf...   5.0
10784  [different, turned, big, stupid, much, little,...   4.0
11323  [poor, planet, little, romantic-comedy, enough...   5.0
10715  [non-stop, off-color, early, first, object, hi...   8.0
11452  [etc ,  stellar, watchable, horrendous, script...   3.0
...                                                  ...   ...
13418  [care, follow, next, hairy, top, heavy, differ...   7.0
5390   [stupid, detective, good, terrific, simple, gr...   6.0
860    [last, dandy, little, al, inked, late, next, b...  10.0
15795  [sure, inordinate, much, good, hideous, light,...   2.0
7270   [user, good, bad, thought, great, funny, good,...   7.0

[11136 rows x 2 columns]
['good', 'great', 'funny', 'much', 'bad', 'many', 'little', 'best', 'romantic', 'new', 'real', 'original', 'first', 'old', 'big', 'whole', 'different', 'young', 'nice', 'least', 'last', 'better', 'sure', 'high', 'hilar

In [23]:
horror_train_df = toute_genre_data['Horror']['train_data']
horror_unique_adjectives = toute_genre_data['Horror']['unique_adjectives']
horror_cent_adj = toute_genre_adjective_freq.get('Horror', {}).get('cent_adj', [])
horror_cinqcent_adj = toute_genre_adjective_freq.get('Horror', {}).get('cinqcent_adj', [])
print(horror_train_df)
print(horror_cent_adj)
print(horror_cinqcent_adj)

                                                mot_adj  note
1614  [aware, avid, higher, first, cheap, predictabl...   9.0
336   [ The, directorial, starred, first, admirable,...   8.0
2505  [usual, correct, brush, chitty, real, sick, Ca...   2.0
2664  [absolute, follow, self-imposed, many, book-ba...   2.0
3795  [allow, low, little, make, good, small, simila...   1.0
...                                                 ...   ...
4426  [difficult, bad, better, talentless, watch, gr...   1.0
466   [deep, old, French, serious, serious, dark, su...   6.0
3092  [trailer, shot, clusterfluck, fair, fair, horr...   1.0
3772  [bad, good, independent, tight, awful Let, man...   1.0
860        [diabolical, enjoyed, good, recommend, good]  10.0

[3420 rows x 2 columns]
['good', 'bad', 'great', 'much', 'many', 'little', 'best', 'original', 'old', 'new', 'first', 'real', 'least', 'scary', 'young', 'whole', 'better', 'different', 'big', 'last', 'funny', 'main', 'low', 'nice', 'beautiful', 'classic',

In [24]:
thriller_train_df = toute_genre_data['Thriller']['train_data']
thriller_unique_adjectives = toute_genre_data['Thriller']['unique_adjectives']
thriller_cent_adj = toute_genre_adjective_freq.get('Thriller', {}).get('cent_adj', [])
thriller_cinqcent_adj = toute_genre_adjective_freq.get('Thriller', {}).get('cinqcent_adj', [])
print(thriller_train_df)
print(thriller_cent_adj)
print(thriller_cinqcent_adj)

                                                mot_adj  note
8365  [good, slow, difficult, 10-15, many, unnecessa...   6.0
3463  [English, different, psychological, real, open...   8.0
1477  [three-week, last, lucky, thought, great, char...  10.0
7559  [Luckily, ripped, many, actual, awful, much, u...   1.0
1173  [common, original, good, eager, so Regardless,...   8.0
...                                                 ...   ...
5734  [surprising, simple, predictable, fantastic, e...   7.0
5191  [walked, certain, bad, bad, actual, whole, fac...   2.0
5390                  [cool, daughter, good, new, like]   7.0
860        [beautiful, smart, good, grandfather, great]  10.0
7270  [fishy, toxic, nuclear, crew, American, sub, m...   4.0

[6019 rows x 2 columns]
['good', 'great', 'bad', 'much', 'many', 'little', 'best', 'first', 'real', 'new', 'big', 'original', 'main', 'old', 'whole', 'last', 'different', 'least', 'young', 'better', 'funny', 'low', 'sure', 'interesting', 'beautiful', 'nic

In [29]:
animation_train_df = toute_genre_data['Animation']['train_data']
animation_unique_adjectives = toute_genre_data['Animation']['unique_adjectives']
animation_cent_adj = toute_genre_adjective_freq.get('Animation', {}).get('cent_adj', [])
animation_cinqcent_adj = toute_genre_adjective_freq.get('Animation', {}).get('cinqcent_adj', [])
print(animation_train_df)
print(animation_cent_adj)
print(animation_cinqcent_adj)

                                                mot_adj  note
285   [equal, classic, outcast, medieval, re-enactme...   7.0
1304  [good, legendary, celebrate, keep, human, inte...   6.0
428   [early, many, well-known, iconic, well-known, ...  10.0
1608  [recommend, finish, pulling, sure, future, gre...   9.0
757   [promising, great, good, realistic, great, rea...   2.0
...                                                 ...   ...
1130  [previous, new, classic, fairy, Maldonian, sou...   9.0
1294  [little, many, animated, innovative, deep, who...   9.0
860   [green, nonsense, whole, shot, talent, latest,...   1.0
1459  [restorer, boy-friend, flat, stupid, nice, fro...   6.0
1126  [best, great, bad, good, best, great, twist, c...   8.0

[1153 rows x 2 columns]
['good', 'bad', 'great', 'little', 'much', 'many', 'original', 'best', 'old', 'new', 'first', 'classic', 'whole', 'funny', 'real', 'young', 'big', 'beautiful', 'nice', 'sure', 'least', 'last', 'different', 'better', 'main', 'specia

In [30]:
romance_train_df = toute_genre_data['Romance']['train_data']
romance_unique_adjectives = toute_genre_data['Romance']['unique_adjectives']
romance_cent_adj = toute_genre_adjective_freq.get('Romance', {}).get('cent_adj', [])
romance_cinqcent_adj = toute_genre_adjective_freq.get('Romance', {}).get('cinqcent_adj', [])
print(romance_train_df)
print(romance_cent_adj)
print(romance_cinqcent_adj)

                                                 mot_adj  note
16025  [interested, dubious, dubious, bad, non-existe...   1.0
3576   [dead, big, writer, meet, popular, full, impre...   7.0
9863   [close, convey, important, good, able, able, a...   7.0
14461  [good, semi-remake, little, easier, good, idea...   2.0
5516   [good, semi-remake, little, easier, good, idea...   2.0
...                                                  ...   ...
11284  [mixed, good, good, fluffy, depth, emotional, ...   7.0
11964  [bad, great, whole, complain, whole, good, swe...   7.0
5390                            [teen, authentic, right]   4.0
860    [infamous, genocide, first, 20th, present, eth...   8.0
15795  [root, sociopathic, particular, enjoyable, ove...   8.0

[13428 rows x 2 columns]
['good', 'great', 'bad', 'much', 'many', 'little', 'best', 'funny', 'real', 'first', 'romantic', 'new', 'original', 'big', 'whole', 'old', 'young', 'beautiful', 'different', 'nice', 'last', 'least', 'main', 'better', '

In [25]:
#Faire le modèle logistique entre des notes et des ratios postives et négatives par chaque comentaires 
nombre_comedy_df = comedy_df.apply(calculate_sentiments, axis=1)
median_comedy=comedy_df['note'].median()
comedy_df['binary_note'] = binarize_by_median(comedy_df, 'note', median_comedy)
nombre_comedy_df=pd.concat([comedy_df[['binary_note']], nombre_comedy_df], axis=1)
print(nombre_comedy_df.head())
X = nombre_comedy_df[['positive_ratio', 'negative_ratio']]
y= nombre_comedy_df['binary_note']
model_initial = OrderedModel(y, X, distr='logit')
result_nombre_comedy = model_initial.fit(method='bfgs', disp=False)
print(result_nombre_comedy.summary())

   binary_note  nombre_mot  nombre_positive  nombre_negative  positive_ratio  \
0            0         7.0              1.0              3.0        0.142857   
1            0        35.0             11.0              8.0        0.314286   
2            0         2.0              2.0              0.0        1.000000   
3            0        12.0              4.0              6.0        0.333333   
4            0        14.0              3.0              2.0        0.214286   

   negative_ratio  
0        0.428571  
1        0.228571  
2        0.000000  
3        0.500000  
4        0.142857  
                             OrderedModel Results                             
Dep. Variable:            binary_note   Log-Likelihood:                -10395.
Model:                   OrderedModel   AIC:                         2.080e+04
Method:            Maximum Likelihood   BIC:                         2.082e+04
Date:                Wed, 13 Nov 2024                                         
Time

In [26]:
#Faire le modèle logistique entre des notes et des ratios postives et négatives par chaque comentaires 
nombre_horror_df = horror_df.apply(calculate_sentiments, axis=1)
median_horror=horror_df['note'].median()
horror_df['binary_note'] = binarize_by_median(horror_df, 'note', median_horror)
nombre_horror_df=pd.concat([horror_df[['binary_note']], nombre_horror_df], axis=1)
print(nombre_horror_df.head())
X = nombre_horror_df[['positive_ratio', 'negative_ratio']]
y= nombre_horror_df['binary_note']
model_initial = OrderedModel(y, X, distr='logit')
result_nombre_horror = model_initial.fit(method='bfgs', disp=False)
print(result_nombre_horror.summary())

   binary_note  nombre_mot  nombre_positive  nombre_negative  positive_ratio  \
0            0        13.0              7.0              1.0        0.538462   
1            0         8.0              2.0              4.0        0.250000   
2            1         9.0              1.0              4.0        0.111111   
3            1        13.0              4.0              3.0        0.307692   
4            1         8.0              3.0              3.0        0.375000   

   negative_ratio  
0        0.076923  
1        0.500000  
2        0.444444  
3        0.230769  
4        0.375000  
                             OrderedModel Results                             
Dep. Variable:            binary_note   Log-Likelihood:                -3146.1
Model:                   OrderedModel   AIC:                             6298.
Method:            Maximum Likelihood   BIC:                             6318.
Date:                Wed, 13 Nov 2024                                         
Time

In [33]:
#Faire le modèle logistique entre des notes et des ratios postives et négatives par chaque comentaires 
nombre_thriller_df = thriller_df.apply(calculate_sentiments, axis=1)
median_thriller=thriller_df['note'].median()
thriller_df['binary_note'] = binarize_by_median(thriller_df, 'note', median_thriller)
nombre_thriller_df=pd.concat([thriller_df[['binary_note']], nombre_thriller_df], axis=1)
print(nombre_thriller_df.head())
X = nombre_thriller_df[['positive_ratio', 'negative_ratio']]
y= nombre_thriller_df['binary_note']
model_initial = OrderedModel(y, X, distr='logit')
result_nombre_thriller= model_initial.fit(method='bfgs', disp=False)
print(result_nombre_thriller.summary())

   binary_note  nombre_mot  nombre_positive  nombre_negative  positive_ratio  \
0            0        23.0              8.0              6.0        0.347826   
1            0        55.0             22.0             10.0        0.400000   
2            1        62.0             16.0              5.0        0.258065   
3            1        21.0              8.0              2.0        0.380952   
4            0        10.0              1.0              2.0        0.100000   

   negative_ratio  
0        0.260870  
1        0.181818  
2        0.080645  
3        0.095238  
4        0.200000  
                             OrderedModel Results                             
Dep. Variable:            binary_note   Log-Likelihood:                -5547.5
Model:                   OrderedModel   AIC:                         1.110e+04
Method:            Maximum Likelihood   BIC:                         1.112e+04
Date:                Wed, 13 Nov 2024                                         
Time

In [34]:
#Faire le modèle logistique entre des notes et des ratios postives et négatives par chaque comentaires 
nombre_romance_df = romance_df.apply(calculate_sentiments, axis=1)
median_romance=romance_df['note'].median()
romance_df['binary_note'] = binarize_by_median(romance_df, 'note', median_romance)
nombre_romance_df=pd.concat([romance_df[['binary_note']], nombre_romance_df], axis=1)
print(nombre_romance_df.head())
X = nombre_romance_df[['positive_ratio', 'negative_ratio']]
y= nombre_romance_df['binary_note']
model_initial = OrderedModel(y, X, distr='logit')
result_nombre_romance= model_initial.fit(method='bfgs', disp=False)
print(result_nombre_romance.summary())

   binary_note  nombre_mot  nombre_positive  nombre_negative  positive_ratio  \
0            0         7.0              1.0              3.0        0.142857   
1            0        35.0             11.0              8.0        0.314286   
2            0         2.0              2.0              0.0        1.000000   
3            0        12.0              4.0              6.0        0.333333   
4            0        14.0              3.0              2.0        0.214286   

   negative_ratio  
0        0.428571  
1        0.228571  
2        0.000000  
3        0.500000  
4        0.142857  
                             OrderedModel Results                             
Dep. Variable:            binary_note   Log-Likelihood:                -12357.
Model:                   OrderedModel   AIC:                         2.472e+04
Method:            Maximum Likelihood   BIC:                         2.474e+04
Date:                Wed, 13 Nov 2024                                         
Time

In [36]:
#Faire le modèle logistique entre des notes et des ratios postives et négatives par chaque comentaires 
nombre_animation_df = animation_df.apply(calculate_sentiments, axis=1)
median_animation=animation_df['note'].median()
animation_df['binary_note'] = binarize_by_median(animation_df, 'note', median_animation)
nombre_animation_df=pd.concat([animation_df[['binary_note']], nombre_animation_df], axis=1)
print(nombre_animation_df.head())
X = nombre_animation_df[['positive_ratio', 'negative_ratio']]
y= nombre_animation_df['binary_note']
model_initial = OrderedModel(y, X, distr='logit')
result_nombre_animation= model_initial.fit(method='bfgs', disp=False)
print(result_nombre_animation.summary())

   binary_note  nombre_mot  nombre_positive  nombre_negative  positive_ratio  \
0            0         9.0              0.0              3.0        0.000000   
1            0        32.0              7.0              8.0        0.218750   
2            0        13.0              3.0              3.0        0.230769   
3            0         9.0              3.0              4.0        0.333333   
4            1        59.0             17.0             18.0        0.288136   

   negative_ratio  
0        0.333333  
1        0.250000  
2        0.230769  
3        0.444444  
4        0.305085  
                             OrderedModel Results                             
Dep. Variable:            binary_note   Log-Likelihood:                -996.36
Model:                   OrderedModel   AIC:                             1999.
Method:            Maximum Likelihood   BIC:                             2015.
Date:                Wed, 13 Nov 2024                                         
Time

In [37]:
comedy_adj_sentiment_matrix = create_adj_sentiment_matrix(comedy_train_df,comedy_cent_adj )
print(comedy_adj_sentiment_matrix)
horror_adj_sentiment_matrix = create_adj_sentiment_matrix(horror_train_df,horror_cent_adj )
thriller_adj_sentiment_matrix = create_adj_sentiment_matrix(thriller_train_df,thriller_cent_adj )
romance_adj_sentiment_matrix = create_adj_sentiment_matrix(romance_train_df,romance_cent_adj )
animation_adj_sentiment_matrix = create_adj_sentiment_matrix(animation_train_df,animation_cent_adj )

       good  great  funny  much    bad  many  little  best  romantic    new  \
10806  0.00    0.0    0.5   0.0  0.000   0.0   0.000  0.00       0.0  0.375   
10784  0.00    0.0    0.0   0.0 -0.625   0.0  -0.375  0.00       0.0  0.000   
11323  0.75    0.0    0.0   0.0  0.000   0.0  -0.375  0.00       0.0  0.375   
10715  0.00    0.0    0.0   0.0  0.000   0.0   0.000  0.00       0.0  0.000   
11452  0.00    0.0    0.5   0.0 -0.625   0.0   0.000  0.00       0.0  0.000   
...     ...    ...    ...   ...    ...   ...     ...   ...       ...    ...   
13418  0.75    0.0    0.0   0.0  0.000   0.0   0.000  0.75       0.0  0.000   
5390   0.75    0.0    0.5   0.0  0.000   0.0   0.000  0.00       0.0  0.000   
860    0.75    0.0    0.0   0.0  0.000   0.0  -0.375  0.75       0.0  0.000   
15795  0.75    0.0    0.5   0.0 -0.625   0.0  -0.375  0.00       0.0  0.000   
7270   0.75    0.0    0.5   0.0 -0.625   0.0  -0.375  0.00       0.0  0.000   

       ...  sweet  important  disappointed  average

In [44]:
#Combinaison la matrice de comdy_adj pour prépare la modélisation
print(comedy_train_df [['note']])
comedy_train_df.reset_index(drop=True, inplace=True)
comedy_adj_sentiment_matrix.reset_index(drop=True, inplace=True)
comedy_adj = pd.concat([comedy_train_df [['note']], comedy_adj_sentiment_matrix], axis=1)
comedy_adj['total_sentiment'] = comedy_adj_sentiment_matrix.sum(axis=1)
comedy_adj = comedy_adj.loc[:, (comedy_adj != 0).any(axis=0)]
comedy_adj = comedy_adj[comedy_adj['total_sentiment'] != 0]
comedy_adj = comedy_adj.dropna(subset=['total_sentiment'])
comedy_adj = comedy_adj.drop_duplicates()
comedy_adj = comedy_adj.apply(pd.to_numeric, errors='coerce')
median_comedy=comedy_df['note'].median()
comedy_adj['note'] = binarize_by_median(comedy_adj, 'note', median_comedy)
print(comedy_adj)


       note
10806   5.0
10784   4.0
11323   5.0
10715   8.0
11452   3.0
...     ...
13418   7.0
5390    6.0
860    10.0
15795   2.0
7270    7.0

[11136 rows x 1 columns]
       note  good  funny    bad  little  best    new  old    big  whole  ...  \
0         0  0.00    0.5  0.000   0.000  0.00  0.375  0.0  0.000    0.0  ...   
1         0  0.00    0.0 -0.625  -0.375  0.00  0.000  0.0  0.125    0.0  ...   
2         0  0.75    0.0  0.000  -0.375  0.00  0.375  0.0  0.000    0.0  ...   
3         1  0.00    0.0  0.000   0.000  0.00  0.000  0.0  0.000    0.0  ...   
4         0  0.00    0.5 -0.625   0.000  0.00  0.000  0.0  0.000    0.0  ...   
...     ...   ...    ...    ...     ...   ...    ...  ...    ...    ...  ...   
11126     0  0.75    0.0  0.000   0.000  0.75  0.000  0.0  0.000    0.0  ...   
11127     0  0.00    0.0  0.000   0.000  0.00  0.000  0.0  0.125    0.0  ...   
11128     0  0.00    0.0  0.000  -0.375  0.75  0.375  0.0  0.125    0.0  ...   
11130     1  0.75    0.5  0.00

In [45]:
#Modèle Logit Ordonnance 
y = comedy_adj['note']
X = comedy_adj.drop(columns=['note','total_sentiment'])
model_inital = OrderedModel(y, X, distr='logit')
result = model_inital.fit(method='bfgs', disp=False)
aic_initiale=result.aic
print(result.summary())
print(aic_initiale)

                             OrderedModel Results                             
Dep. Variable:                   note   Log-Likelihood:                -3830.4
Model:                   OrderedModel   AIC:                             7791.
Method:            Maximum Likelihood   BIC:                             8231.
Date:                Wed, 13 Nov 2024                                         
Time:                        22:03:08                                         
No. Observations:                6497                                         
Df Residuals:                    6432                                         
Df Model:                          64                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
good             0.0998      0.075      1.322      0.186      -0.048       0.248
funny            0.3645      0.126      2.903 

In [39]:
#Combinaison la matrice de horror_adj pour prépare la modélisation
print(horror_train_df [['note']])
horror_train_df.reset_index(drop=True, inplace=True)
horror_adj_sentiment_matrix.reset_index(drop=True, inplace=True)
horror_adj = pd.concat([horror_train_df [['note']], horror_adj_sentiment_matrix], axis=1)
horror_adj['total_sentiment'] = horror_adj_sentiment_matrix.sum(axis=1)
horror_adj = horror_adj.loc[:, (horror_adj != 0).any(axis=0)]
horror_adj = horror_adj[horror_adj['total_sentiment'] != 0]
horror_adj = horror_adj.dropna(subset=['total_sentiment'])
horror_adj = horror_adj.drop_duplicates()
horror_adj = horror_adj.apply(pd.to_numeric, errors='coerce')
median_horror=horror_df['note'].median()
horror_adj['note'] = binarize_by_median(horror_adj, 'note', median_horror)
print(horror_adj)

      note
0      9.0
1      8.0
2      2.0
3      2.0
4      1.0
...    ...
3415   1.0
3416   6.0
3417   1.0
3418   1.0
3419  10.0

[3420 rows x 1 columns]
      note  good    bad  little  best    old    new  scary  whole  better  \
0        1  0.00  0.000   0.000  0.00  0.000  0.000  -0.75  0.000   0.875   
1        1  0.75  0.000  -0.375  0.75  0.000  0.375   0.00  0.125   0.000   
2        0  0.00  0.000  -0.375  0.00  0.000  0.000   0.00  0.000   0.000   
3        0  0.00  0.000   0.000  0.00  0.000  0.375   0.00  0.000   0.000   
5        1  0.75  0.000  -0.375  0.00  0.000  0.000   0.00  0.000   0.000   
...    ...   ...    ...     ...   ...    ...    ...    ...    ...     ...   
3401     1  0.00  0.000  -0.375  0.00  0.000  0.000  -0.75  0.000   0.000   
3403     1  0.00  0.000  -0.375  0.00  0.000  0.000   0.00  0.000   0.000   
3405     0  0.75 -0.625   0.000  0.75  0.000  0.000   0.00  0.000   0.000   
3415     0  0.00 -0.625   0.000  0.00  0.375  0.000   0.00  0.125   0.875

In [46]:
#Modèle Logit Ordonnance pour film horrible 
y = horror_adj['note']
X = horror_adj.drop(columns=['note','total_sentiment'])
model_inital = OrderedModel(y, X, distr='logit')
result = model_inital.fit(method='bfgs', disp=False)
aic_initiale=result.aic
print(result.summary())
print(aic_initiale)

                             OrderedModel Results                             
Dep. Variable:                   note   Log-Likelihood:                -1032.8
Model:                   OrderedModel   AIC:                             2198.
Method:            Maximum Likelihood   BIC:                             2561.
Date:                Wed, 13 Nov 2024                                         
Time:                        22:05:38                                         
No. Observations:                1818                                         
Df Residuals:                    1752                                         
Df Model:                          65                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
good             0.1600      0.149      1.077      0.282      -0.131       0.451
bad              1.0591      0.201      5.267 

In [50]:
#Combinaison la matrice de thriller_adj pour prépare la modélisation
print(thriller_train_df [['note']])
thriller_train_df.reset_index(drop=True, inplace=True)
thriller_adj_sentiment_matrix.reset_index(drop=True, inplace=True)
thriller_adj = pd.concat([thriller_train_df [['note']], thriller_adj_sentiment_matrix], axis=1)
thriller_adj['total_sentiment'] = thriller_adj_sentiment_matrix.sum(axis=1)
thriller_adj =thriller_adj.loc[:, (thriller_adj != 0).any(axis=0)]
thriller_adj =thriller_adj[thriller_adj['total_sentiment'] != 0]
thriller_adj = thriller_adj.dropna(subset=['total_sentiment'])
thriller_adj = thriller_adj.drop_duplicates()
thriller_adj = thriller_adj.apply(pd.to_numeric, errors='coerce')
median_thriller=thriller_df['note'].median()
thriller_adj['note'] = binarize_by_median(thriller_adj, 'note', median_thriller)
print(thriller_adj)

      note
0      6.0
1      8.0
2     10.0
3      1.0
4      8.0
...    ...
6014   7.0
6015   2.0
6016   7.0
6017  10.0
6018   4.0

[6019 rows x 1 columns]
      note  good    bad  little  best    new    big   main  old  whole  ...  \
1        1  0.00  0.000   0.000  0.00  0.000  0.000  0.000  0.0  0.000  ...   
2        1  0.00  0.000   0.000  0.00  0.000  0.000  0.375  0.0  0.000  ...   
3        0  0.00  0.000   0.000  0.00  0.000  0.000  0.000  0.0  0.000  ...   
4        1  0.75  0.000  -0.375  0.00  0.000  0.000  0.000  0.0  0.125  ...   
5        0  0.75  0.000   0.000  0.00  0.000  0.000  0.000  0.0  0.000  ...   
...    ...   ...    ...     ...   ...    ...    ...    ...  ...    ...  ...   
6001     1  0.75  0.000  -0.375  0.00  0.000  0.125  0.000  0.0  0.125  ...   
6002     0  0.00  0.000  -0.375  0.00  0.000  0.000  0.000  0.0  0.000  ...   
6004     1  0.00  0.000   0.000  0.00  0.375  0.125  0.000  0.0  0.000  ...   
6010     1  0.00 -0.625   0.000  0.00  0.000  0.000  

In [None]:
#Modèle Logit Ordonnance pour film thriller
y = thriller_adj['note']
X = thriller_adj.drop(columns=['note','total_sentiment'])
model_inital = OrderedModel(y, X, distr='logit')
result = model_inital.fit(method='bfgs', disp=False)
aic_initiale=result.aic
print(result.summary())
print(aic_initiale)

                             OrderedModel Results                             
Dep. Variable:                   note   Log-Likelihood:                -1773.6
Model:                   OrderedModel   AIC:                             3679.
Method:            Maximum Likelihood   BIC:                             4079.
Date:                Wed, 13 Nov 2024                                         
Time:                        22:09:21                                         
No. Observations:                3158                                         
Df Residuals:                    3092                                         
Df Model:                          65                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
good            0.4875      0.113      4.325      0.000       0.267       0.708
bad             0.7789      0.153      5.077     

In [42]:
#Combinaison la matrice de romance_adj pour prépare la modélisation
print(romance_train_df [['note']])
romance_train_df.reset_index(drop=True, inplace=True)
romance_adj_sentiment_matrix.reset_index(drop=True, inplace=True)
romance_adj = pd.concat([romance_train_df [['note']], romance_adj_sentiment_matrix], axis=1)
romance_adj['total_sentiment'] = romance_adj_sentiment_matrix.sum(axis=1)
romance_adj = romance_adj.loc[:, (romance_adj != 0).any(axis=0)]
romance_adj = romance_adj[romance_adj['total_sentiment'] != 0]
romance_adj = romance_adj.dropna(subset=['total_sentiment'])
romance_adj = romance_adj.drop_duplicates()
romance_adj = romance_adj.apply(pd.to_numeric, errors='coerce')
median_romance=romance_df['note'].median()
romance_adj['note'] = binarize_by_median(romance_adj, 'note', median_romance)
print(romance_adj)

       note
16025   1.0
3576    7.0
9863    7.0
14461   2.0
5516    2.0
...     ...
11284   7.0
11964   7.0
5390    4.0
860     8.0
15795   8.0

[13428 rows x 1 columns]
       note  good    bad  little  best  funny    new    big  whole  old  ...  \
0         0  0.00 -0.625  -0.375  0.75    0.0  0.000  0.000  0.000  0.0  ...   
1         1  0.00  0.000   0.000  0.00    0.0  0.000  0.125  0.000  0.0  ...   
2         1  0.75  0.000   0.000  0.00    0.0  0.000  0.000  0.000  0.0  ...   
3         0  0.75  0.000  -0.375  0.00    0.0  0.000  0.000  0.000  0.0  ...   
5         1  0.75  0.000   0.000  0.00    0.0  0.000  0.000  0.125  0.0  ...   
...     ...   ...    ...     ...   ...    ...    ...    ...    ...  ...  ...   
13410     0  0.00  0.000   0.000  0.00    0.0  0.375  0.000  0.000  0.0  ...   
13413     1  0.75  0.000  -0.375  0.00    0.5  0.000  0.125  0.000  0.0  ...   
13416     1  0.75 -0.625  -0.375  0.00    0.0  0.000  0.000  0.000  0.0  ...   
13420     1  0.00 -0.625   0.0

In [54]:
#Combinaison la matrice de romance_adj pour prépare la modélisation
print(romance_train_df [['note']])
romance_train_df.reset_index(drop=True, inplace=True)
romance_adj_sentiment_matrix.reset_index(drop=True, inplace=True)
romance_adj = pd.concat([romance_train_df [['note']], romance_adj_sentiment_matrix], axis=1)
romance_adj['total_sentiment'] = romance_adj_sentiment_matrix.sum(axis=1)
romance_adj = romance_adj.loc[:, (romance_adj != 0).any(axis=0)]
romance_adj = romance_adj[romance_adj['total_sentiment'] != 0]
romance_adj = romance_adj.dropna(subset=['total_sentiment'])
romance_adj = romance_adj.drop_duplicates()
romance_adj = romance_adj.apply(pd.to_numeric, errors='coerce')
median_romance=romance_df['note'].median()
romance_adj['note'] = binarize_by_median(romance_adj, 'note', median_romance)
print(romance_adj)

       note
0       1.0
1       7.0
2       7.0
3       2.0
4       2.0
...     ...
13423   7.0
13424   7.0
13425   4.0
13426   8.0
13427   8.0

[13428 rows x 1 columns]
       note  good    bad  little  best  funny    new    big  whole  old  ...  \
0         0  0.00 -0.625  -0.375  0.75    0.0  0.000  0.000  0.000  0.0  ...   
1         1  0.00  0.000   0.000  0.00    0.0  0.000  0.125  0.000  0.0  ...   
2         1  0.75  0.000   0.000  0.00    0.0  0.000  0.000  0.000  0.0  ...   
3         0  0.75  0.000  -0.375  0.00    0.0  0.000  0.000  0.000  0.0  ...   
5         1  0.75  0.000   0.000  0.00    0.0  0.000  0.000  0.125  0.0  ...   
...     ...   ...    ...     ...   ...    ...    ...    ...    ...  ...  ...   
13410     0  0.00  0.000   0.000  0.00    0.0  0.375  0.000  0.000  0.0  ...   
13413     1  0.75  0.000  -0.375  0.00    0.5  0.000  0.125  0.000  0.0  ...   
13416     1  0.75 -0.625  -0.375  0.00    0.0  0.000  0.000  0.000  0.0  ...   
13420     1  0.00 -0.625   0.0

In [55]:
#Modèle Logit Ordonnance pour film romantique 
y = romance_adj['note']
X = romance_adj.drop(columns=['note','total_sentiment'])
model_inital = OrderedModel(y, X, distr='logit')
result = model_inital.fit(method='bfgs', disp=False)
aic_initiale=result.aic
print(result.summary())
print(aic_initiale)

                             OrderedModel Results                             
Dep. Variable:                   note   Log-Likelihood:                -3673.1
Model:                   OrderedModel   AIC:                             7484.
Method:            Maximum Likelihood   BIC:                             7953.
Date:                Wed, 13 Nov 2024                                         
Time:                        22:12:56                                         
No. Observations:                6594                                         
Df Residuals:                    6525                                         
Df Model:                          68                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
good             0.0719      0.078      0.918      0.358      -0.081       0.225
bad              0.8425      0.113      7.454 

In [56]:
#Combinaison la matrice de animation,_adj pour prépare la modélisation
print(animation_train_df [['note']])
animation_train_df.reset_index(drop=True, inplace=True)
animation_adj_sentiment_matrix.reset_index(drop=True, inplace=True)
animation_adj = pd.concat([animation_train_df [['note']], animation_adj_sentiment_matrix], axis=1)
animation_adj['total_sentiment'] = animation_adj_sentiment_matrix.sum(axis=1)
animation_adj = animation_adj.loc[:, (animation_adj != 0).any(axis=0)]
animation_adj = animation_adj[animation_adj['total_sentiment'] != 0]
animation_adj = animation_adj.dropna(subset=['total_sentiment'])
animation_adj = animation_adj.drop_duplicates()
animation_adj = animation_adj.apply(pd.to_numeric, errors='coerce')
median_animation=animation_df['note'].median()
animation_adj['note'] = binarize_by_median(animation_adj, 'note', median_animation)
print(animation_adj)

      note
285    7.0
1304   6.0
428   10.0
1608   9.0
757    2.0
...    ...
1130   9.0
1294   9.0
860    1.0
1459   6.0
1126   8.0

[1153 rows x 1 columns]
      note  good    bad  little  best    old    new  classic  whole  funny  \
0        1  0.75  0.000   0.000  0.00  0.000  0.000    0.375  0.000    0.0   
1        1  0.75  0.000  -0.375  0.00  0.000  0.000    0.000  0.000    0.0   
2        1  0.00  0.000  -0.375  0.00  0.000  0.375    0.375  0.125    0.0   
3        1  0.00  0.000   0.000  0.00  0.000  0.000    0.000  0.000    0.0   
4        0  0.75  0.000   0.000  0.00  0.000  0.000    0.000  0.125    0.5   
...    ...   ...    ...     ...   ...    ...    ...      ...    ...    ...   
1147     1  0.00 -0.625   0.000  0.00  0.000  0.000    0.000  0.000    0.0   
1148     1  0.75  0.000  -0.375  0.00  0.375  0.375    0.375  0.125    0.5   
1150     0  0.75 -0.625   0.000  0.00  0.375  0.000    0.000  0.125    0.0   
1151     1  0.00  0.000   0.000  0.00  0.000  0.000    0.000  0

In [57]:
#Modèle Logit Ordonnance 
y = animation_adj['note']
X = animation_adj.drop(columns=['note','total_sentiment'])
model_inital = OrderedModel(y, X, distr='logit')
result = model_inital.fit(method='bfgs', disp=False)
aic_initiale=result.aic
print(result.summary())
print(aic_initiale)

                             OrderedModel Results                             
Dep. Variable:                   note   Log-Likelihood:                -291.68
Model:                   OrderedModel   AIC:                             717.4
Method:            Maximum Likelihood   BIC:                             1010.
Date:              mer., 13 nov. 2024                                         
Time:                        22:17:42                                         
No. Observations:                 579                                         
Df Residuals:                     512                                         
Df Model:                          66                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
good             0.5163      0.296      1.742      0.082      -0.065       1.097
bad              1.9228      0.438      4.389 