# IDL TP Final `scikit-learn` Projet

## Charger les données

In [1]:
import os
import pandas as pd

In [2]:
def load_reviews(data_dir):
	"""Load labelled review texts from a directory tree.

	The function looks for two sub-directories of ``data_dir`` named ``pos``
	and ``neg`` and reads every file whose name ends in ``.txt`` as UTF-8
	text. Other files are ignored.

	Args:
		data_dir: Path of the directory that contains the ``pos`` and ``neg``
			sub-directories.

	Returns:
		A ``(reviews, labels)`` tuple of two parallel lists. ``reviews`` holds
		the file contents; ``labels`` holds ``1`` for files found under
		``pos`` and ``0`` for files found under ``neg``. All ``pos`` entries
		come first, and within each class the files are ordered by name.

	Raises:
		OSError: If one of the two sub-directories cannot be listed or a file
			cannot be opened (propagated from ``os.listdir`` / ``open``).
	"""
	reviews = []
	labels = []
	for label in ["pos", "neg"]:
		directory = os.path.join(data_dir, label)
		# os.listdir() returns entries in arbitrary, filesystem-dependent
		# order. Sorting makes the row order (and therefore any seeded
		# split performed on the result) identical on every machine.
		for filename in sorted(os.listdir(directory)):
			if filename.endswith(".txt"):
				file_path = os.path.join(directory, filename)
				with open(file_path, 'r', encoding='utf-8') as file:
					reviews.append(file.read())
				labels.append(1 if label == "pos" else 0)
	return reviews, labels

In [3]:
# Read every labelled review from the dataset folder, then tabulate the two
# parallel lists as one row per review (text + 0/1 sentiment label).
data_dir = 'imdb_smol'
reviews, labels = load_reviews(data_dir)
reviews_df = pd.DataFrame(
	data={'review': reviews, 'label': labels},
	columns=['review', 'label'],
)

In [4]:
# Quick sanity check: show the first five rows of the loaded frame.
print(reviews_df.head(n=5))

                                              review  label
0  The production quality, cast, premise, authent...      1
1  This is no art-house film, it's mainstream ent...      1
2  Two great comedians in a great Neil Simon movi...      1
3  I'm a fan of TV movies in general and this was...      1
4  Once upon a time in a castle...... Two little ...      1


In [6]:
# Class balance: count the rows carrying each sentiment label.
print("\nNombre de notes positives:", reviews_df['label'].eq(1).sum())
print("Nombre de notes négatives:", reviews_df['label'].eq(0).sum())


Nombre de notes positives: 301
Nombre de notes négatives: 301


In [7]:
print("\nInformations générales sur le DataFrame:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() only appended a stray "None" line to the output.
reviews_df.info()


Informations générales sur le DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602 entries, 0 to 601
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  602 non-null    object
 1   label   602 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 9.5+ KB
None


In [8]:
# Print five randomly chosen reviews with their label. The sample is seeded
# (same seed as the rest of the notebook) so a re-run shows the same rows, and
# each review is printed once (the print pair was previously duplicated
# inside the loop body).
for index, row in reviews_df.sample(5, random_state=42).iterrows():
	print("\nIndex de la note de review:", index, "\nTexte de la note de review:\n", row['review'])
	print("Étiquette:", "Positive" if row['label'] == 1 else "Négative")


Index de la note de review: 166 
Texte de la note de review:
 Undying is a very good game which brings some new elements on the tired genre of first person shoot em ups. It tells the story of Patrick Galloway an expert of the occult and a formidable fighter who is summoned by a friend to his estate in Ireland to investigate some weird phainomena. The game is set in Ireland after World War one so don't expect to find weapons like chainguns or rocket launchers.All the weapons in the game can be considered antiques but the real fun in the game are its spells and the system they operate on.Our hero is ambidexterous so he can use both his hands at the same time: he casts spells with his right arm and uses his guns with the left.So you can shoot and cast spells at the same time which as you understand very fun and also unique to this game! The graphics are great and they can run very well on a medium power P.C..Level design is also cool and atmospheric. Mostly the game revolves around the C

## Vectorisation

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
def vectorize_text(data, max_features=5000, stop_words='english'):
	"""Fit a TF-IDF vectorizer on ``data`` and return the transformed matrix.

	Args:
		data: Iterable of raw text documents (for example a pandas Series of
			review strings).
		max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
			Defaults to 5000.
		stop_words: Forwarded to ``TfidfVectorizer(stop_words=...)``.
			Defaults to ``'english'``.

	Returns:
		A ``(features, tfidf_vectorizer)`` tuple: the result of
		``fit_transform`` on ``data`` and the fitted vectorizer, which can be
		reused to ``transform`` further documents with the same vocabulary.

	Note:
		The vectorizer is fitted on everything passed in. If held-out text is
		part of ``data``, its statistics take part in the fit; pass only the
		training documents and call ``tfidf_vectorizer.transform`` on the
		held-out ones to keep the evaluation clean.
	"""
	tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features)
	features = tfidf_vectorizer.fit_transform(data)
	return features, tfidf_vectorizer

In [11]:
# Vectorize the review texts and report the resulting matrix dimensions.
# NOTE: the vectorizer is fitted here on the full corpus, i.e. before the
# train/test split that train_and_evaluate performs later, so the reviews that
# end up in the test set also take part in the fit.
features, tfidf_vectorizer = vectorize_text(data=reviews_df['review'])
print("Forme de la matrice TF-IDF :", features.shape)

Forme de la matrice TF-IDF : (602, 5000)


## Entraînement

### 1. Logistic Regression

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [13]:
def train_and_evaluate(features, labels, test_size=0.25, random_state=42, stratify=False):
	"""Train a logistic regression on a hold-out split and print its scores.

	The data is split once with ``train_test_split``; a ``LogisticRegression``
	is fitted on the training part and evaluated on the test part. The
	accuracy and the classification report are printed; nothing is returned.

	Args:
		features: Feature matrix, one row per sample.
		labels: Target labels aligned with the rows of ``features``.
		test_size: Forwarded to ``train_test_split``. Defaults to 0.25.
		random_state: Seed used both for the split and for the model.
			Defaults to 42.
		stratify: When true, the split is stratified on ``labels`` so the
			train and test parts keep the class proportions of the full set.
			Defaults to False, which keeps the plain random split.
	"""
	X_train, X_test, y_train, y_test = train_test_split(
		features,
		labels,
		test_size=test_size,
		random_state=random_state,
		# train_test_split expects the label array itself (or None) here.
		stratify=labels if stratify else None,
	)
	model = LogisticRegression(random_state=random_state)
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)
	print("Accuracy:", accuracy_score(y_test, y_pred))
	print("Classification Report:\n", classification_report(y_test, y_pred))

In [14]:
# Fit and score the logistic regression on the TF-IDF features.
train_and_evaluate(features=features, labels=reviews_df['label'])

Accuracy: 0.8543046357615894
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.85        68
           1       0.90      0.83      0.86        83

    accuracy                           0.85       151
   macro avg       0.85      0.86      0.85       151
weighted avg       0.86      0.85      0.85       151

