# IDL TP Final `scikit-learn` Projet

## Charger les donnés

In [1]:
import os
import pandas as pd

In [2]:
def load_reviews(data_dir):
	reviews = []
	labels = []
	for label in ["pos", "neg"]:
		directory = os.path.join(data_dir, label)
		for filename in os.listdir(directory):
			if filename.endswith(".txt"):
				file_path = os.path.join(directory, filename)
				with open(file_path, 'r', encoding='utf-8') as file:
					reviews.append(file.read())
					labels.append(1 if label == "pos" else 0)
	return reviews, labels

In [3]:
data_dir = 'imdb_smol'
reviews, labels = load_reviews(data_dir)
reviews_df = pd.DataFrame({'review': reviews, 'label': labels})

In [4]:
print("Nombre de données:", len(reviews_df))

Nombre de données: 602


In [5]:
reviews_df.head()

Unnamed: 0,review,label
0,"The production quality, cast, premise, authent...",1
1,"This is no art-house film, it's mainstream ent...",1
2,Two great comedians in a great Neil Simon movi...,1
3,I'm a fan of TV movies in general and this was...,1
4,Once upon a time in a castle...... Two little ...,1


In [6]:
print("\nNombre de notes positives:", (reviews_df['label'] == 1).sum())
print("Nombre de notes négatives:", (reviews_df['label'] == 0).sum())


Nombre de notes positives: 301
Nombre de notes négatives: 301


In [7]:
print("\nInformations générales sur le DataFrame:")
print(reviews_df.info())


Informations générales sur le DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602 entries, 0 to 601
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  602 non-null    object
 1   label   602 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 9.5+ KB
None


In [8]:
for index, row in reviews_df.sample(5).iterrows():
	print("\nIndex de la note de review:", index, "\nTexte de la note de review:\n", row['review'])
	print("Étiquette:", "Positive" if row['label'] == 1 else "Négative")

	print("\nIndex de la note de review:", index, "\nTexte de la note de review:\n", row['review'])
	print("Étiquette:", "Positive" if row['label'] == 1 else "Négative")


Index de la note de review: 203 
Texte de la note de review:
 What a fun movie! If you're a Giallo fan, Red Queen Kills 7 Times is a real winner. To begin with, it's hard to go wrong with Barbara Bouchet and Marina Malfatti in the same Giallo. Both are wonderful - especially the wide-eyed innocent Bouchet as the guilt ridden woman fearing for her life. The kill scenes in Red Queen Kills 7 Times are especially nice and feature enough blood to make most fans happy. One of the first murders comes rather unexpectedly and really gets the movie off to a good start. The killer, The Red Queen, is one of the most over-the-top and interesting looking murderers I've seen in an Italian movie. The 70s sets and fashions are wonderful. In fact, all the visuals are interesting with the laughing Red Queen running across the bridge at night being one of my favorite moments in the film. The convoluted plot held my interest throughout. Finally, Bruno Nicolai's score may be the best I've heard from the co

## Vectorisation

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [10]:
def vectorize_text(data, vectorizer):
	"""Vectorize text data using specified vectorizer."""
	vectorized_data = vectorizer.fit_transform(data)
	return vectorized_data, vectorizer

In [11]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

In [12]:
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

In [13]:
features, current_vectorizer = vectorize_text(reviews_df['review'], tfidf_vectorizer)
print("Forme de la matrice TF-IDF :", features.shape)

Forme de la matrice TF-IDF : (602, 5000)


In [14]:
features_count, _ = vectorize_text(reviews_df['review'], count_vectorizer)
print("Forme de la matrice Count :", features_count.shape)

Forme de la matrice Count : (602, 5000)


## Entraînement

### 1. Logistic Regression

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [16]:
def train_and_evaluate(features, labels):
	"""Train and evaluate a logistic regression model."""
	X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)
	model = LogisticRegression(random_state=42)
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)
	print("Accuracy:", accuracy_score(y_test, y_pred))
	print("Classification Report:\n", classification_report(y_test, y_pred))

In [17]:
train_and_evaluate(features, reviews_df['label'])

Accuracy: 0.8543046357615894
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.85        68
           1       0.90      0.83      0.86        83

    accuracy                           0.85       151
   macro avg       0.85      0.86      0.85       151
weighted avg       0.86      0.85      0.85       151



In [18]:
train_and_evaluate(features_count, reviews_df['label'])

Accuracy: 0.8079470198675497
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.78      0.79        68
           1       0.82      0.83      0.83        83

    accuracy                           0.81       151
   macro avg       0.81      0.81      0.81       151
weighted avg       0.81      0.81      0.81       151



### 2. SVM & GridSearchCV

In [19]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [20]:
def train_and_evaluate_svm(X_train, y_train, X_test, y_test):
	param_grid = {
		'C': [0.1, 1, 10, 100],
		'kernel': ['linear', 'rbf'],
		'gamma': ['scale', 'auto']
	}
	grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=1)
	grid_search.fit(X_train, y_train)
	print("Best parameters:", grid_search.best_params_)
	best_model = grid_search.best_estimator_
	y_pred = best_model.predict(y_test)
	print("Accuracy:", accuracy_score(y_test, y_pred))
	print("Classification Report:\n", classification_report(y_test, y_pred))

In [21]:
train_and_evaluate_svm(features, reviews_df['label'], features, reviews_df['label'])

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.

In [None]:
train_and_evaluate_svm(features_count, reviews_df['label'], features_count, reviews_df['label'])

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       301
           1       1.00      1.00      1.00       301

    accuracy                           1.00       602
   macro avg       1.00      1.00      1.00       602
weighted avg       1.00      1.00      1.00       602



### 3. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def train_and_evaluate_rf(features, labels):
	X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)
	model = RandomForestClassifier(n_estimators=100, random_state=42)
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)
	print("Accuracy:", accuracy_score(y_test, y_pred))
	print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
train_and_evaluate_rf(features, reviews_df['label'])

Accuracy: 0.7549668874172185
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.81      0.75        68
           1       0.82      0.71      0.76        83

    accuracy                           0.75       151
   macro avg       0.76      0.76      0.75       151
weighted avg       0.76      0.75      0.76       151



In [None]:
train_and_evaluate_rf(features_count, reviews_df['label'])

Accuracy: 0.7549668874172185
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.87      0.76        68
           1       0.86      0.66      0.75        83

    accuracy                           0.75       151
   macro avg       0.77      0.77      0.75       151
weighted avg       0.78      0.75      0.75       151



### 4. Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
def train_and_evaluate_nb(features, labels):
	X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)
	model = MultinomialNB()
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)
	print("Accuracy:", accuracy_score(y_test, y_pred))
	print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
train_and_evaluate_nb(features, reviews_df['label'])

Accuracy: 0.8145695364238411
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.91      0.82        68
           1       0.91      0.73      0.81        83

    accuracy                           0.81       151
   macro avg       0.82      0.82      0.81       151
weighted avg       0.83      0.81      0.81       151



In [None]:
train_and_evaluate_nb(features_count, reviews_df['label'])

Accuracy: 0.8410596026490066
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.91      0.84        68
           1       0.92      0.78      0.84        83

    accuracy                           0.84       151
   macro avg       0.85      0.85      0.84       151
weighted avg       0.85      0.84      0.84       151



### 5. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
def train_and_evaluate_dt(features, labels):
	X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)
	model = DecisionTreeClassifier(random_state=42)
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)
	print("Accuracy:", accuracy_score(y_test, y_pred))
	print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
train_and_evaluate_dt(features, reviews_df['label'])

Accuracy: 0.6622516556291391
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.66      0.64        68
           1       0.71      0.66      0.68        83

    accuracy                           0.66       151
   macro avg       0.66      0.66      0.66       151
weighted avg       0.67      0.66      0.66       151



In [None]:
train_and_evaluate_dt(features_count, reviews_df['label'])

Accuracy: 0.6291390728476821
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.63      0.61        68
           1       0.68      0.63      0.65        83

    accuracy                           0.63       151
   macro avg       0.63      0.63      0.63       151
weighted avg       0.63      0.63      0.63       151



### 6. K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def train_and_evaluate_knn(features, labels):
	X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)
	model = KNeighborsClassifier()
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)
	print("Accuracy:", accuracy_score(y_test, y_pred))
	print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
train_and_evaluate_knn(features, reviews_df['label'])

Accuracy: 0.8675496688741722
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85        68
           1       0.88      0.88      0.88        83

    accuracy                           0.87       151
   macro avg       0.87      0.87      0.87       151
weighted avg       0.87      0.87      0.87       151



In [None]:
train_and_evaluate_knn(features_count, reviews_df['label'])

Accuracy: 0.6291390728476821
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.53      0.56        68
           1       0.65      0.71      0.68        83

    accuracy                           0.63       151
   macro avg       0.62      0.62      0.62       151
weighted avg       0.63      0.63      0.63       151



### 7. XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
def train_and_evaluate_xgb(features, labels):
	X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)
	model = XGBClassifier()
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)
	print("Accuracy:", accuracy_score(y_test, y_pred))
	print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
train_and_evaluate_xgb(features, reviews_df['label'])

Accuracy: 0.7417218543046358
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.72      0.72        68
           1       0.77      0.76      0.76        83

    accuracy                           0.74       151
   macro avg       0.74      0.74      0.74       151
weighted avg       0.74      0.74      0.74       151



In [None]:
train_and_evaluate_xgb(features_count, reviews_df['label'])

Accuracy: 0.7483443708609272
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.76      0.73        68
           1       0.79      0.73      0.76        83

    accuracy                           0.75       151
   macro avg       0.75      0.75      0.75       151
weighted avg       0.75      0.75      0.75       151



## Evaluation

In [None]:
comparison_data = {
    "Model": [],
    "Vectorization": [],
    "Accuracy": [],
    "Precision (macro)": [],
    "Recall (macro)": [],
    "F1-score (macro)": [],
}

In [None]:
def add_to_comparison(model_name, vectorization, accuracy, report):
	"""Add data to the comparison table."""
	comparison_data["Model"].append(model_name)
	comparison_data["Vectorization"].append(vectorization)
	comparison_data["Accuracy"].append(accuracy)
	comparison_data["Precision (macro)"].append(report["macro avg"]["precision"])
	comparison_data["Recall (macro)"].append(report["macro avg"]["recall"])
	comparison_data["F1-score (macro)"].append(report["macro avg"]["f1-score"])

In [None]:
def run_and_add_to_comparison(features, labels, model_name, model, vectorization_type):
	"""Run the model and add it to the comparison table."""
	X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)
	accuracy = accuracy_score(y_test, y_pred)
	report = classification_report(y_test, y_pred, output_dict=True)
	add_to_comparison(model_name, vectorization_type, accuracy, report)

In [None]:
run_and_add_to_comparison(features, reviews_df['label'], "Logistic Regression", LogisticRegression(random_state=42), "TF-IDF")
run_and_add_to_comparison(features_count, reviews_df['label'], "Logistic Regression", LogisticRegression(random_state=42), "Count")

run_and_add_to_comparison(features, reviews_df['label'], "Random Forest", RandomForestClassifier(n_estimators=100, random_state=42), "TF-IDF")
run_and_add_to_comparison(features_count, reviews_df['label'], "Random Forest", RandomForestClassifier(n_estimators=100, random_state=42), "Count")

run_and_add_to_comparison(features, reviews_df['label'], "Naive Bayes", MultinomialNB(), "TF-IDF")
run_and_add_to_comparison(features_count, reviews_df['label'], "Naive Bayes", MultinomialNB(), "Count")

run_and_add_to_comparison(features, reviews_df['label'], "SVM", SVC(), "TF-IDF")
run_and_add_to_comparison(features_count, reviews_df['label'], "SVM", SVC(), "Count")

run_and_add_to_comparison(features, reviews_df['label'], "Decision Tree", DecisionTreeClassifier(random_state=42), "TF-IDF")
run_and_add_to_comparison(features_count, reviews_df['label'], "Decision Tree", DecisionTreeClassifier(random_state=42), "Count")

run_and_add_to_comparison(features, reviews_df['label'], "KNN", KNeighborsClassifier(), "TF-IDF")
run_and_add_to_comparison(features_count, reviews_df['label'], "KNN", KNeighborsClassifier(), "Count")

run_and_add_to_comparison(features, reviews_df['label'], "XGBoost", XGBClassifier(), "TF-IDF")
run_and_add_to_comparison(features_count, reviews_df['label'], "XGBoost", XGBClassifier(), "Count")

In [None]:
comparison_df = pd.DataFrame(comparison_data)
comparison_df.head(20)

Unnamed: 0,Model,Vectorization,Accuracy,Precision (macro),Recall (macro),F1-score (macro)
0,Logistic Regression,TF-IDF,0.854305,0.853457,0.856839,0.853785
1,Logistic Regression,Count,0.807947,0.806237,0.805369,0.805766
2,Random Forest,TF-IDF,0.754967,0.757823,0.759833,0.754795
3,Random Forest,Count,0.754967,0.768768,0.765149,0.754795
4,Naive Bayes,TF-IDF,0.81457,0.824271,0.823352,0.814561
5,Naive Bayes,Count,0.84106,0.845246,0.847449,0.840997
6,SVM,TF-IDF,0.854305,0.85634,0.859497,0.854145
7,SVM,Count,0.761589,0.762454,0.753898,0.755927
8,Decision Tree,TF-IDF,0.662252,0.660783,0.662208,0.660764
9,Decision Tree,Count,0.629139,0.628203,0.629429,0.627817
