# model ML

In [None]:
import pandas as pd

# Load the pre-encoded train and test splits (paths are relative to the notebook).
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train_encoded.csv", "test_encoded.csv")
)

# Preview the first rows of the training frame as the cell output.
train_data.head()

Unnamed: 0,Comment,Emotion,Processed_Comment,Emotion_encoded
0,i didnt feel humiliated,sadness,not_feel humiliate,4
1,i can go from feeling so hopeless to so damned...,sadness,feel hopeless damned hopeful around someone ca...,4
2,im grabbing a minute to post i feel greedy wrong,anger,grab minute post feel greedy wrong,0
3,i am ever feeling nostalgic about the fireplac...,love,ever feel nostalgic fireplace know still property,3
4,i am feeling grouchy,anger,feel grouchy,0


In [None]:
# Print the column labels of the training frame.
print(train_data.columns)

Index(['Comment', 'Emotion', 'Processed_Comment', 'Emotion_encoded'], dtype='object')


In [None]:
# Class distribution of the 'Emotion' label, first in the training split and
# then in the test split. The header strings are printed exactly as given.
for header, frame in (
    ("Distribution des classes - TRAIN :", train_data),
    ("\nDistribution des classes - TEST :", test_data),
):
    print(header)
    print(frame['Emotion'].value_counts())


Distribution des classes - TRAIN :
Emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

Distribution des classes - TEST :
Emotion
joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: count, dtype: int64


In [None]:
!pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.14.0


→ On va résoudre le problème de déséquilibre des classes.

### RandomForest

In [None]:
# Importations des biblioth√®ques
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# --- Features and targets ---------------------------------------------------
# A bare `.astype(str)` turns a missing comment (NaN) into the literal string
# "nan", which the vectorizer would then treat as a real word. Missing text is
# therefore replaced by an empty document before the cast.
X_train = train_data['Processed_Comment'].fillna('').astype(str)
y_train = train_data['Emotion_encoded']
X_test = test_data['Processed_Comment'].fillna('').astype(str)
y_test = test_data['Emotion_encoded']

# --- TF-IDF vectorisation (done by hand so SMOTE can be applied afterwards) ---
vectorizer = TfidfVectorizer(
    max_features=10000,   # cap on the vocabulary size
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=5,             # ignore terms present in fewer than 5 documents
    max_df=0.7            # ignore terms present in more than 70% of documents
)

# The vocabulary and IDF weights are fitted on the training split only;
# the test split is merely transformed with them.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# --- SMOTE oversampling of the training matrix (test data is never resampled) ---
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

# Class distribution after SMOTE (displayed as the cell output)
pd.Series(y_train_smote).value_counts()




Emotion_encoded
4    5362
0    5362
3    5362
5    5362
1    5362
2    5362
Name: count, dtype: int64

In [None]:

# Random forest fitted on the original (non-resampled) TF-IDF matrix.
# Class imbalance is handled through class_weight='balanced' rather than SMOTE.
rf_model = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
)

print("\n➡️ RandomForest (class_weight)")
rf_model.fit(X_train_tfidf, y_train)

# Evaluate on the untouched test split.
y_pred_rf = rf_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_rf))




‚û°Ô∏è RandomForest (class_weight)
              precision    recall  f1-score   support

           0       0.86      0.85      0.86       275
           1       0.83      0.87      0.85       224
           2       0.86      0.89      0.87       695
           3       0.72      0.72      0.72       159
           4       0.92      0.86      0.89       581
           5       0.63      0.73      0.68        66

    accuracy                           0.85      2000
   macro avg       0.80      0.82      0.81      2000
weighted avg       0.86      0.85      0.86      2000



TF-IDF + SMOTE est une combinaison couramment utilisée pour l'analyse des sentiments, particulièrement dans des situations où le déséquilibre des classes est un problème.

In [None]:

# Random forest fitted on the SMOTE-resampled training matrix. No class_weight
# is set here because the resampled classes are already balanced.
rf_smote_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
)

print("\n➡️ RandomForest (SMOTE)")
rf_smote_model.fit(X_train_smote, y_train_smote)

# Evaluate on the original, non-resampled test split.
y_pred_smote = rf_smote_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_smote))




‚û°Ô∏è RandomForest (SMOTE)
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       275
           1       0.85      0.87      0.86       224
           2       0.87      0.87      0.87       695
           3       0.68      0.75      0.72       159
           4       0.93      0.85      0.89       581
           5       0.62      0.83      0.71        66

    accuracy                           0.85      2000
   macro avg       0.80      0.84      0.82      2000
weighted avg       0.86      0.85      0.86      2000



In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Macro-averaged F1 on the test set for each RandomForest variant
f1_rf = f1_score(y_test, y_pred_rf, average='macro')
f1_smote = f1_score(y_test, y_pred_smote, average='macro')

# Accuracy on the test set for each RandomForest variant
acc_rf = accuracy_score(y_test, y_pred_rf)
acc_smote = accuracy_score(y_test, y_pred_smote)

# Comparative summary, best macro-F1 first
results = pd.DataFrame({
    'Modèle': ['RandomForest (class_weight)', 'RandomForest (SMOTE)'],
    'F1_macro': [f1_rf, f1_smote],
    'Accuracy': [acc_rf, acc_smote]
}).sort_values(by='F1_macro', ascending=False)

print("\n📊 Comparaison des modèles :")
print(results)


In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB

# imblearn's Pipeline (unlike scikit-learn's) accepts a sampler as a step and
# applies it during fit only, never at prediction time.
from imblearn.pipeline import Pipeline as ImbPipeline

nb = MultinomialNB(alpha=0.1)

# SMOTE lives inside the pipeline so that CalibratedClassifierCV re-runs it on
# the training part of each of its 5 folds. Calibrating on data that was
# oversampled beforehand would put synthetic neighbours of the training points
# into the held-out folds and fit the calibrator on an artificially balanced
# class prior.
smote_nb = ImbPipeline([
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
    ('nb', nb),
])
calibrated_nb = CalibratedClassifierCV(smote_nb, cv=5)
calibrated_nb.fit(X_train_tfidf, y_train)
y_pred_cal_nb = calibrated_nb.predict(X_test_tfidf)

print("Confusion Matrix (Calibrated NB):")
print(confusion_matrix(y_test, y_pred_cal_nb))
print("\nClassification Report (Calibrated NB):")
print(classification_report(y_test, y_pred_cal_nb))


Confusion Matrix (Calibrated NB):
[[222  10  15   5  21   2]
 [  9 177  12   2  16   8]
 [ 12   5 596  61  14   7]
 [  2   1  25 122   7   2]
 [ 18  18  23  11 510   1]
 [  2   4  11   1   7  41]]

Classification Report (Calibrated NB):
              precision    recall  f1-score   support

           0       0.84      0.81      0.82       275
           1       0.82      0.79      0.81       224
           2       0.87      0.86      0.87       695
           3       0.60      0.77      0.68       159
           4       0.89      0.88      0.88       581
           5       0.67      0.62      0.65        66

    accuracy                           0.83      2000
   macro avg       0.78      0.79      0.78      2000
weighted avg       0.84      0.83      0.84      2000



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Optional: reduce the number of TF-IDF features at vectorisation time
# (e.g. TfidfVectorizer(max_features=5000, ...)) in the TF-IDF section.

# L1-penalised logistic regression fitted on the SMOTE-resampled matrix.
# `warm_start` is deliberately not set: it only reuses the coefficients of a
# previous fit as a starting point (it is not early stopping) and has no effect
# on an estimator that is built and fitted once.
lr_l1 = LogisticRegression(
    max_iter=1000,
    solver='saga',                # solver that supports the L1 penalty
    penalty='l1',
    C=10,
    random_state=42,              # saga shuffles the data; seed it for reproducible fits
    n_jobs=-1,                    # request all CPU cores where the fit is parallelised
    verbose=1,                    # print the solver's convergence log
    tol=1e-3                      # looser than the library default of 1e-4
)

# Fit model
lr_l1.fit(X_train_smote, y_train_smote)

# Predict on the original (non-resampled) test split
y_pred_l1 = lr_l1.predict(X_test_tfidf)

# Evaluate
print("Confusion Matrix (L1 Logistic Regression):")
print(confusion_matrix(y_test, y_pred_l1))

print("\nClassification Report (L1 Logistic Regression):")
print(classification_report(y_test, y_pred_l1))


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 321 epochs took 373 seconds
Confusion Matrix (L1 Logistic Regression):
[[239   6   7   2  19   2]
 [  8 197   4   0   5  10]
 [  7   6 617  49  10   6]
 [  3   2  21 129   4   0]
 [ 12  11  19   4 534   1]
 [  1  10   7   0   1  47]]

Classification Report (L1 Logistic Regression):
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       275
           1       0.85      0.88      0.86       224
           2       0.91      0.89      0.90       695
           3       0.70      0.81      0.75       159
           4       0.93      0.92      0.93       581
           5       0.71      0.71      0.71        66

    accuracy                           0.88      2000
   macro avg       0.83      0.85      0.84      2000
weighted avg       0.88      0.88      0.88      2000



In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Base learners of the ensemble.
nb = MultinomialNB(alpha=0.1)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# saga is a stochastic solver: seed it like the forest so that the whole
# ensemble gives the same result on every run.
lr = LogisticRegression(C=10, solver='saga', max_iter=1000, random_state=42)

voting = VotingClassifier(
    estimators=[('lr', lr), ('nb', nb), ('rf', rf)],
    voting='soft'  # average the predicted class probabilities; 'hard' = majority vote
)

# Fit on the SMOTE-resampled training matrix, evaluate on the original test split.
voting.fit(X_train_smote, y_train_smote)
y_pred_voting = voting.predict(X_test_tfidf)

print("Confusion Matrix (Voting Classifier):")
print(confusion_matrix(y_test, y_pred_voting))
print("\nClassification Report (Voting Classifier):")
print(classification_report(y_test, y_pred_voting))


Confusion Matrix (Voting Classifier):
[[240   6   9   3  15   2]
 [ 10 188   3   0  10  13]
 [  3   2 620  52  11   7]
 [  2   1  22 130   4   0]
 [ 14  10  21   4 531   1]
 [  0   8   4   0   2  52]]

Classification Report (Voting Classifier):
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       275
           1       0.87      0.84      0.86       224
           2       0.91      0.89      0.90       695
           3       0.69      0.82      0.75       159
           4       0.93      0.91      0.92       581
           5       0.69      0.79      0.74        66

    accuracy                           0.88      2000
   macro avg       0.83      0.85      0.84      2000
weighted avg       0.88      0.88      0.88      2000

