In [1]:
import ast
import re
from collections import Counter

import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

# Download necessary NLTK data (NLTK reports the package as up to date and
# skips the download when it is already present)
nltk.download('stopwords')

# Load the spaCy French model, downloading it only when it is not installed.
# Downloading unconditionally on every run is slow, needs network access and
# triggers spaCy's "Restart to reload dependencies" warning even when the
# model is already available.
try:
    nlp = spacy.load('fr_core_news_sm')
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download("fr_core_news_sm")
    nlp = spacy.load('fr_core_news_sm')

# Load the dataset
file_path = 'projetintegrer.csv'  # Update the path if necessary
data = pd.read_csv(file_path)

# Initialize French stopwords (a set gives constant-time membership tests)
stop_words = set(stopwords.words('french'))

# Function to clean tokenized text
def clean_text(tokens, stopword_set=None, pipeline=None):
    """Normalize, filter and lemmatize a list of raw tokens.

    Steps, in order: strip non-word characters from each token, lowercase it,
    drop tokens left empty by the stripping, drop stopwords, lemmatize the
    surviving tokens as one space-joined text, and finally drop lemmas of two
    characters or fewer.

    Args:
        tokens: Iterable of token strings.
        stopword_set: Optional container of lowercase stopwords to remove.
            Defaults to the module-level ``stop_words``.
        pipeline: Optional callable that maps a text to an iterable of
            objects exposing a ``lemma_`` attribute. Defaults to the
            module-level ``nlp``.

    Returns:
        A list of lemma strings, each longer than two characters.
    """
    # Resolve the defaults at call time so one-argument callers keep working.
    if stopword_set is None:
        stopword_set = stop_words
    if pipeline is None:
        pipeline = nlp

    kept = []
    for token in tokens:
        # Remove punctuation/special characters, then lowercase.
        normalized = re.sub(r'\W+', '', token).lower()
        # Pure-punctuation tokens become '' after stripping; skipping them
        # keeps stray whitespace out of the text handed to the lemmatizer.
        if normalized and normalized not in stopword_set:
            kept.append(normalized)

    # Lemmatize the surviving tokens as a single text.
    doc = pipeline(' '.join(kept))
    # Remove short lemmas (length <= 2).
    return [token.lemma_ for token in doc if len(token.lemma_) > 2]

# Apply cleaning function to tokenized text.
# Each 'Tokenized' cell is expected to hold the string form of a Python list
# of tokens. ast.literal_eval parses literals only, whereas eval would execute
# any code embedded in the CSV.
data['Cleaned_Tokens'] = data['Tokenized'].apply(lambda x: clean_text(ast.literal_eval(x)))

# Collect every cleaned token from every row into one flat list
all_words = []
for row_tokens in data['Cleaned_Tokens']:
    all_words.extend(row_tokens)

# Count how many times each word occurs across the whole corpus
word_freq = Counter(all_words)

# Words occurring fewer than `threshold` times are treated as rare
threshold = 5

# Gather the rare words into a set
rare_words = set()
for word, freq in word_freq.items():
    if freq < threshold:
        rare_words.add(word)

# Function to remove rare words from tokenized text
def remove_rare_words(tokens, rare=None):
    """Return the tokens that are not rare words, preserving their order.

    Args:
        tokens: Iterable of token strings.
        rare: Optional container of words to drop. Defaults to the
            module-level ``rare_words`` so existing one-argument callers
            (such as ``Series.apply``) behave exactly as before.

    Returns:
        A new list containing only the tokens absent from ``rare``.
    """
    # Resolve the default at call time so the current global set is used.
    if rare is None:
        rare = rare_words
    return [token for token in tokens if token not in rare]

# Strip the rare words from every row's token list
filtered_tokens = data['Cleaned_Tokens'].apply(remove_rare_words)
data['Cleaned_Tokens'] = filtered_tokens

# Rebuild each row as a single space-separated string
data['Cleaned_Text'] = filtered_tokens.apply(' '.join)

# Placeholder labels for demonstration only (replace with actual labels if
# available). They are drawn at random, independently of the text, so any
# classifier evaluated against them can only be expected to score near chance.
import numpy as np
np.random.seed(42)
n_rows = len(data)
data['Sentiment'] = np.random.choice([0, 1], size=n_rows)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
features = data['Cleaned_Text']
labels = data['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# TF-IDF settings:
#   ngram_range  - use unigrams and bigrams
#   min_df       - ignore terms that appear in fewer than 2 documents
#   max_df       - ignore terms that appear in more than 95% of the documents
#   max_features - consider only the top 5000 features
tfidf_options = {
    'ngram_range': (1, 2),
    'min_df': 2,
    'max_df': 0.95,
    'max_features': 5000,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vectorizer on the training texts only, then reuse it for the test texts
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Candidate hyperparameter values explored by the grid searches

# Naive Bayes: values tried for alpha
param_grid_nb = dict(alpha=[0.01, 0.1, 1, 10, 100])

# Random Forest: number of trees, depth limit (None = no limit given) and
# minimum samples required to split
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# XGBoost: number of trees, tree depth and learning rate
param_grid_xgb = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

# Tune, evaluate and report each classifier with one shared procedure instead
# of three copy-pasted sections. Models are processed in the same order as
# before: Naive Bayes, Random Forest, XGBoost.
nb_model = MultinomialNB()
rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

baseline_runs = {}
for model_label, estimator, param_grid in (
    ("Naive Bayes", nb_model, param_grid_nb),
    ("Random Forest", rf_model, param_grid_rf),
    ("XGBoost", xgb_model, param_grid_xgb),
):
    # 5-fold cross-validated grid search on the training features,
    # selecting the parameter combination with the best accuracy.
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
    search.fit(X_train_tfidf, y_train)
    best_model = search.best_estimator_

    # Evaluate the selected model on the held-out test features.
    predictions = best_model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, predictions)
    # zero_division=0 reports 0.0 for a class that is never predicted (the
    # same number as before) without emitting UndefinedMetricWarning.
    report = classification_report(y_test, predictions, zero_division=0)

    print(f"{model_label} Accuracy:", accuracy)
    print(f"{model_label} Classification Report:\n", report)

    baseline_runs[model_label] = (search, best_model, predictions, accuracy, report)

# Expose the per-model results under the names used by the rest of the notebook.
grid_search_nb, nb_best, nb_pred, nb_accuracy, nb_report = baseline_runs["Naive Bayes"]
grid_search_rf, rf_best, rf_pred, rf_accuracy, rf_report = baseline_runs["Random Forest"]
grid_search_xgb, xgb_best, xgb_pred, xgb_accuracy, xgb_report = baseline_runs["XGBoost"]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Naive Bayes Accuracy: 0.46437994722955145
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.46      1.00      0.63       176
           1       0.00      0.00      0.00       203

    accuracy                           0.46       379
   macro avg       0.23      0.50      0.32       379
weighted avg       0.22      0.46      0.29       379

Random Forest Accuracy: 0.503957783641161
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.52      0.49       176
           1       0.54      0.49      0.51       203

    accuracy                           0.50       379
   macro avg       0.51      0.51      0.50       379
weighted avg       0.51      0.50      0.50       379

XGBoost Accuracy: 0.48021108179419525
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.88      0.61       176
         

In [8]:
!pip install smote
!pip install imbalanced-learn
!pip install GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline





ERROR: Could not find a version that satisfies the requirement GridSearchCV (from versions: none)
ERROR: No matching distribution found for GridSearchCV


In [9]:
# Oversample the training set with SMOTE (fixed seed for repeatability).
# NOTE(review): resampling happens before GridSearchCV splits the data, so
# synthetic rows interpolated from a fold's validation samples can end up in
# that fold's training part and inflate the cross-validation scores. Consider
# placing SMOTE inside an imblearn pipeline so it is fitted per fold.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Tune, evaluate and report each classifier with one shared procedure instead
# of three copy-pasted sections. Models are processed in the same order as
# before: Naive Bayes, Random Forest, XGBoost.
nb_model = MultinomialNB()
rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

resampled_runs = {}
for model_label, estimator, param_grid in (
    ("Naive Bayes", nb_model, param_grid_nb),
    ("Random Forest", rf_model, param_grid_rf),
    ("XGBoost", xgb_model, param_grid_xgb),
):
    # 5-fold cross-validated grid search on the resampled training data,
    # selecting the parameter combination with the best accuracy.
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
    search.fit(X_train_resampled, y_train_resampled)
    best_model = search.best_estimator_

    # Evaluate on the original (not resampled) test features.
    predictions = best_model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, predictions)
    # zero_division=0 reports 0.0 for a class that is never predicted
    # without emitting UndefinedMetricWarning.
    report = classification_report(y_test, predictions, zero_division=0)

    print(f"{model_label} Accuracy:", accuracy)
    print(f"{model_label} Classification Report:\n", report)

    resampled_runs[model_label] = (search, best_model, predictions, accuracy, report)

# Expose the per-model results under the names used by the rest of the notebook.
grid_search_nb, nb_best, nb_pred, nb_accuracy, nb_report = resampled_runs["Naive Bayes"]
grid_search_rf, rf_best, rf_pred, rf_accuracy, rf_report = resampled_runs["Random Forest"]
grid_search_xgb, xgb_best, xgb_pred, xgb_accuracy, xgb_report = resampled_runs["XGBoost"]

Naive Bayes Accuracy: 0.49340369393139843
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.56      0.51       176
           1       0.53      0.44      0.48       203

    accuracy                           0.49       379
   macro avg       0.50      0.50      0.49       379
weighted avg       0.50      0.49      0.49       379

Random Forest Accuracy: 0.525065963060686
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.47      0.48       176
           1       0.56      0.57      0.56       203

    accuracy                           0.53       379
   macro avg       0.52      0.52      0.52       379
weighted avg       0.52      0.53      0.52       379

XGBoost Accuracy: 0.5408970976253298
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.40      0.45       176
          

In [37]:
import pandas as pd

# Source CSV and destination file for the extracted column
input_file_path = 'projetintegrer.csv'
output_file_path = 'newp.csv'

df = pd.read_csv(input_file_path)

# Stop immediately when the CSV does not contain the expected column
has_tokens = 'Cleaned_Tokens' in df.columns
if not has_tokens:
    raise ValueError("mkynach had l column")

# Write only the token column, keeping its header and omitting the row index
tokens = df['Cleaned_Tokens']
tokens.to_csv(output_file_path, header=True, index=False)

print(f"Extracted tokens have been saved to {output_file_path}")

Extracted tokens have been saved to newp.csv
