In [1]:
import shelve
import snowballstemmer
from stop_words import get_stop_words
import re
from sklearn.feature_extraction.text import CountVectorizer
import joblib
from collections import Counter
import pandas as pd

def separate_articles_by_language(article_db):
    article_db_french = {}
    article_db_english = {}

    for article_id, article in article_db.items():
        language = article.get('Langue', '').lower()
        if language == 'fr':
            article_db_french[article_id] = article
        elif language == 'en':
            article_db_english[article_id] = article

    return article_db_french, article_db_english

def process_articles(article_db, stopwords, stemmer):
    all_processed_texts = []

    for article_id, article in article_db.items():
        title = article.get('Titre', '')
        description = article.get('Description / Résumé', '')
        content = article.get('Contenu', '')

        # Combine the values and print
        combined_text = f"{title} {description} {content}"

        # Remove punctuation, make lowercase, and remove numbers
        text_without_punctuation = re.sub(r'[^\w\s]', '', combined_text)
        text_lower = text_without_punctuation.lower()
        text_without_numbers = re.sub(r'\d+', '', text_lower)

        # Tokenize the text and perform stemming
        words = text_without_numbers.split()
        filtered_words = [word for word in words if word.lower() not in stopwords]
        stemmed_words = [stemmer.stemWord(word) for word in filtered_words]

        # Combine the stemmed words into a string for CountVectorizer
        processed_text = ' '.join(stemmed_words)

        # Append processed text to the list
        all_processed_texts.append(processed_text)

    return all_processed_texts

def save_and_load_data(vectorizer, sparse_matrix, filename_feature, filename_matrix):
    # Save feature names and sparse matrix into two files
    joblib.dump(vectorizer.get_feature_names_out(), filename_feature)
    joblib.dump(sparse_matrix, filename_matrix)

    # Load feature names and sparse matrix back into memory
    loaded_feature_names = joblib.load(filename_feature)
    loaded_sparse_matrix = joblib.load(filename_matrix)

    return loaded_feature_names, loaded_sparse_matrix

def calculate_word_occurrences(feature_names, sparse_matrix):
    total_word_occurrences = Counter()
    for row in sparse_matrix:
        total_word_occurrences += Counter({feature: count for feature, count in zip(feature_names, row.toarray()[0])})

    return total_word_occurrences

def display_word_occurrences(word_occurrences):
    # Display the total occurrences of each word with more than 100 occurrences, sorted by count in descending order
    print("\nTotal Word Occurrences Across All Articles (with more than 100 occurrences), sorted by count:")

    # Sort the total_word_occurrences dictionary by count in descending order
    sorted_word_occurrences = sorted(word_occurrences.items(), key=lambda x: x[1], reverse=False)

    # Display sorted results
    for word, count in sorted_word_occurrences:
        print(f"{word}: {count}")

def create_dataset(article_db, stopwords, stemmer, vectorizer, filename_feature, filename_matrix, language):
    all_processed_texts = process_articles(article_db, stopwords, stemmer)
    all_sparse_matrix = vectorizer.transform(all_processed_texts)
    loaded_feature_names, loaded_sparse_matrix = save_and_load_data(vectorizer, all_sparse_matrix, filename_feature,
                                                                   filename_matrix)
    total_word_occurrences = calculate_word_occurrences(loaded_feature_names, loaded_sparse_matrix)

    result_list = []

    for article_id, article in article_db.items():
        title = article.get('Titre', '')
        description = article.get('Description / Résumé', '')
        content = article.get('Contenu', '')
        combined_text = f"{title} {description} {content}"
        text_without_punctuation = re.sub(r'[^\w\s]', '', combined_text)
        text_lower = text_without_punctuation.lower()
        text_without_numbers = re.sub(r'\d+', '', text_lower)
        words = text_without_numbers.split()
        filtered_words = [word for word in words if word.lower() not in stopwords]
        stemmed_words = [stemmer.stemWord(word) for word in filtered_words]
        processed_text = ' '.join(stemmed_words)
        word_occurrences_dict = {word: total_word_occurrences[word] for word in processed_text.split()}

        category = article.get('Catégorie', '')



        result_list.append({'document': article_id, 'word_occurrences': word_occurrences_dict,
                            'catégorie': category })

    result_dataset = pd.DataFrame(result_list)
    return result_dataset

# Language settings
lang_french = "french"
lang_english = "english"

# Snowball stemmers and stop words
stemmer_french = snowballstemmer.stemmer(lang_french)
stopwords_french = get_stop_words(lang_french)

stemmer_english = snowballstemmer.stemmer(lang_english)
stopwords_english = get_stop_words(lang_english)

# Open the shelve file for reading
article_db = shelve.open('./benchmark/defi_db', 'r')

# Separate articles by language
article_db_french, article_db_english = separate_articles_by_language(article_db)

# Close the shelve file when done
article_db.close()

# Use CountVectorizer directly on processed texts for each language
vectorizer_french = CountVectorizer(stop_words=stopwords_french)
all_processed_texts_french = process_articles(article_db_french, stopwords_french, stemmer_french)
all_sparse_matrix_french = vectorizer_french.fit_transform(all_processed_texts_french)

vectorizer_english = CountVectorizer(stop_words=stopwords_english)
all_processed_texts_english = process_articles(article_db_english, stopwords_english, stemmer_english)
all_sparse_matrix_english = vectorizer_english.fit_transform(all_processed_texts_english)




# Create dataset for French
defi_french = create_dataset(article_db_french, stopwords_french, stemmer_french, vectorizer_french,
                                'feature_names_defi_french.joblib', 'sparse_matrix_defi_french.joblib', lang_french)

# Create dataset for English
defi_english = create_dataset(article_db_english, stopwords_english, stemmer_english, vectorizer_english,
                                 'feature_names_defi_english.joblib', 'sparse_matrix_defi_english.joblib', lang_english)

# Display the datasets
print("\nDataset for French:")
print(defi_french.head())

print("\nDataset for English:")
print(defi_english.head())





Dataset for French:
                           document  \
0  527ba1db806cfd245d90526c5a7b9342   
1  150854b0beccb3ee7d55fad80f92e58b   
2  e9ad60a84487a9e79cc3e95fc1d60f95   
3  395be8ab43a1833438a972ac5ecd0f48   
4  4722474dddcd6a982556ec48b8b07ab3   

                                    word_occurrences catégorie  
0  {'b': 0, 'london': 27, 'lion': 103, 'do': 44, ...         ?  
1  {'b': 0, 'french': 110, 'football': 1626, 'pre...         ?  
2  {'b': 0, 'patisser': 27, 'valer': 113, 'four':...         ?  
3  {'b': 0, 'nas': 566, 'spacex': 232, 'launch': ...         ?  
4  {'b': 0, 'russian': 17, 'influx': 1, 'driv': 3...         ?  

Dataset for English:
                           document  \
0  a5319b640496be985be9e46141a44d26   
1  733691e2b8bb30046fcb3b3870acdda9   
2  aae4c53ffd63ab90fafe911020a39411   
3  bd90e1776424ffb662d7502846824612   
4  729d9fa185f1ab06151e847d1e3aacea   

                                    word_occurrences catégorie  
0  {'b': 0, 'true': 155, 'corona

In [2]:
defi_french

Unnamed: 0,document,word_occurrences,catégorie
0,527ba1db806cfd245d90526c5a7b9342,"{'b': 0, 'london': 27, 'lion': 103, 'do': 44, ...",?
1,150854b0beccb3ee7d55fad80f92e58b,"{'b': 0, 'french': 110, 'football': 1626, 'pre...",?
2,e9ad60a84487a9e79cc3e95fc1d60f95,"{'b': 0, 'patisser': 27, 'valer': 113, 'four':...",?
3,395be8ab43a1833438a972ac5ecd0f48,"{'b': 0, 'nas': 566, 'spacex': 232, 'launch': ...",?
4,4722474dddcd6a982556ec48b8b07ab3,"{'b': 0, 'russian': 17, 'influx': 1, 'driv': 3...",?
...,...,...,...
69411,ce384ee0886519501c7d197fb26dad3c,"{'b': 0, 'anod': 6, 'catalyseur': 10, 'bio': 7...",?
69412,1fcc25ec3f38cac13d439395e3fe83eb,"{'b': 0, 'video': 1800, 'assemble': 2850, 'nat...",?
69413,5fa23821f22a069f1cac6154070d428d,"{'b': 0, 'equip': 3966, 'franc': 18198, 'attaq...",?
69414,d09ab27a292d00d6ed34e1e24dc8079c,"{'b': 0, 'attend': 607, 'fin': 3321, 'weekend'...",?


In [3]:
defi_english

Unnamed: 0,document,word_occurrences,catégorie
0,a5319b640496be985be9e46141a44d26,"{'b': 0, 'true': 155, 'coronavirus': 3840, 'de...",?
1,733691e2b8bb30046fcb3b3870acdda9,"{'b': 0, 'unearth': 42, 'maya': 28, 'civil': 2...",?
2,aae4c53ffd63ab90fafe911020a39411,"{'b': 0, 't': 0, 'row': 361, 'price': 1403, 'r...",?
3,bd90e1776424ffb662d7502846824612,"{'b': 0, 'new': 7788, 'voluntari': 31, 'mobili...",?
4,729d9fa185f1ab06151e847d1e3aacea,"{'b': 0, 'bmw': 18, 'say': 9018, 'will': 7135,...",?
...,...,...,...
58248,0bda59e8dc68f3a997ae5ff2888aff35,"{'b': 0, 'minecraft': 10, 'legend': 237, 'terr...",?
58249,182e9e428854d4823557db43b26a4fa6,"{'b': 0, 'bfm': 339, 'patrimoin': 10, 'h': 0, ...",?
58250,084da97559c8a0d17268573fe515fffe,"{'b': 0, 'l': 0, 'integral': 3, 'de': 2448, 'g...",?
58251,61d784b9e5f9de2319c013a6e64adef8,"{'b': 0, 'contenus': 8, 'pro': 226, 'nazi': 67...",?


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Charger le dataset (assurez-vous que dataset_french est correctement défini)
df = defi_french

# Diviser le dataset en features (X) et la target (y)
X = df['word_occurrences']
y = df['catégorie']

# Convertir les occurrences de mots en vecteurs
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(X)

# Diviser le dataset en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialiser les classifieurs
classifiers = {
    'k-NN': KNeighborsClassifier(n_neighbors=3),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(probability=True, random_state=42),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Comparaison des classifieurs avec validation croisée
results = {'Classifier': [], 'Accuracy': []}

# Définir une stratégie de validation croisée (StratifiedKFold)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for clf_name, clf in classifiers.items():
    # Entraîner le classifieur sur l'ensemble d'entraînement
    clf.fit(X_train, y_train)
    
    # Évaluer le classifieur sur l'ensemble de test
    accuracy = clf.score(X_test, y_test)
    
    # Stocker les résultats
    results['Classifier'].append(clf_name)
    results['Accuracy'].append(accuracy)

# Afficher les résultats
results_df = pd.DataFrame(results)
print(results_df)

# Visualisation des résultats avec des barres
plt.figure(figsize=(10, 6))
plt.bar(results_df['Classifier'], results_df['Accuracy'], color='blue', alpha=0.7, label='Accuracy')
plt.xlabel('Classifier')
plt.ylabel('Accuracy')
plt.title('Comparison of Classifiers')
plt.legend(loc='lower right')
plt.show()


MemoryError: Unable to allocate 18.0 GiB for an array with shape (55532, 43428) and data type float64

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Charger le dataset (assurez-vous que dataset_french est correctement défini)
df = dataset_english

# Diviser le dataset en features (X) et la target (y)
X = df['word_occurrences']
y = df['catégorie']

# Convertir les occurrences de mots en vecteurs
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(X)

# Diviser le dataset en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialiser les classifieurs
classifiers = {
    'k-NN': KNeighborsClassifier(n_neighbors=3),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(probability=True, random_state=42),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Comparaison des classifieurs avec validation croisée
results = {'Classifier': [], 'Accuracy': []}

# Définir une stratégie de validation croisée (StratifiedKFold)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for clf_name, clf in classifiers.items():
    # Entraîner le classifieur sur l'ensemble d'entraînement
    clf.fit(X_train, y_train)
    
    # Évaluer le classifieur sur l'ensemble de test
    accuracy = clf.score(X_test, y_test)
    
    # Stocker les résultats
    results['Classifier'].append(clf_name)
    results['Accuracy'].append(accuracy)

# Afficher les résultats
results_df = pd.DataFrame(results)
print(results_df)

# Visualisation des résultats avec des barres
plt.figure(figsize=(10, 6))
plt.bar(results_df['Classifier'], results_df['Accuracy'], color='blue', alpha=0.7, label='Accuracy')
plt.xlabel('Classifier')
plt.ylabel('Accuracy')
plt.title('Comparison of Classifiers')
plt.legend(loc='lower right')
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt

# Charger le dataset (assurez-vous que dataset_french est correctement défini)
df = dataset_french

# Diviser le dataset en features (X) et la target (y)
X = df['word_occurrences']
y = df['catégorie']

# Convertir les occurrences de mots en vecteurs
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(X)

# Diviser le dataset en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialiser les classifieurs
classifiers = {
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'LogReg': LogisticRegression(random_state=42),
    'Bayes': GaussianNB(),
    'SVM': SVC(probability=True, random_state=42),
    'Neural': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    'RF': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Comparaison des classifieurs
results = {'Classifier': [], 'Accuracy': [], 'Precision (micro)': [], 'Recall (micro)': [], 'AUC (micro)': [],
           'Precision (macro)': [], 'Recall (macro)': [], 'AUC (macro)': []}

# Définir une stratégie de validation croisée (StratifiedKFold)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for clf_name, clf in classifiers.items():
    # Entraîner le classifieur sur l'ensemble d'entraînement
    clf.fit(X_train, y_train)
    
    # Prédire les catégories pour l'ensemble de test
    y_pred = clf.predict(X_test)
    
    # Calculer les métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision_micro = precision_score(y_test, y_pred, average='micro')
    recall_micro = recall_score(y_test, y_pred, average='micro')
    roc_auc_micro = roc_auc_score(label_binarize(y_test, classes=clf.classes_), label_binarize(y_pred, classes=clf.classes_), average='micro')
    precision_macro = precision_score(y_test, y_pred, average='macro')
    recall_macro = recall_score(y_test, y_pred, average='macro')
    roc_auc_macro = roc_auc_score(label_binarize(y_test, classes=clf.classes_), label_binarize(y_pred, classes=clf.classes_), average='macro')
    
    # Stocker les résultats
    results['Classifier'].append(clf_name)
    results['Accuracy'].append(accuracy)
    results['Precision (micro)'].append(precision_micro)
    results['Recall (micro)'].append(recall_micro)
    results['AUC (micro)'].append(roc_auc_micro)
    results['Precision (macro)'].append(precision_macro)
    results['Recall (macro)'].append(recall_macro)
    results['AUC (macro)'].append(roc_auc_macro)

# Afficher les résultats
results_df = pd.DataFrame(results)

# Visualisation des résultats avec des barres
plt.figure(figsize=(12, 12))

# Accuracy
plt.subplot(3, 3, 1)
plt.bar(results_df['Classifier'], results_df['Accuracy'], color='blue', alpha=0.7)
plt.title('Accuracy')

# Precision (micro)
plt.subplot(3, 3, 2)
plt.bar(results_df['Classifier'], results_df['Precision (micro)'], color='green', alpha=0.7)
plt.title('Precision (micro)')

# Recall (micro)
plt.subplot(3, 3, 3)
plt.bar(results_df['Classifier'], results_df['Recall (micro)'], color='orange', alpha=0.7)
plt.title('Recall (micro)')

# AUC (micro)
plt.subplot(3, 3, 4)
plt.bar(results_df['Classifier'], results_df['AUC (micro)'], color='red', alpha=0.7)
plt.title('AUC (micro)')

# Precision (macro)
plt.subplot(3, 3, 5)
plt.bar(results_df['Classifier'], results_df['Precision (macro)'], color='purple', alpha=0.7)
plt.title('Precision (macro)')

# Recall (macro)
plt.subplot(3, 3, 6)
plt.bar(results_df['Classifier'], results_df['Recall (macro)'], color='brown', alpha=0.7)
plt.title('Recall (macro)')

# AUC (macro)
plt.subplot(3, 3, 7)
plt.bar(results_df['Classifier'], results_df['AUC (macro)'], color='pink', alpha=0.7)
plt.title('AUC (macro)')

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt

# Charger le dataset (assurez-vous que dataset_french est correctement défini)
df = dataset_english

# Diviser le dataset en features (X) et la target (y)
X = df['word_occurrences']
y = df['catégorie']

# Convertir les occurrences de mots en vecteurs
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(X)

# Diviser le dataset en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialiser les classifieurs
classifiers = {
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'LogReg': LogisticRegression(random_state=42),
    'Bayes': GaussianNB(),
    'SVM': SVC(probability=True, random_state=42),
    'Neural': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    'RF': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Comparaison des classifieurs
results = {'Classifier': [], 'Accuracy': [], 'Precision (micro)': [], 'Recall (micro)': [], 'AUC (micro)': [],
           'Precision (macro)': [], 'Recall (macro)': [], 'AUC (macro)': []}

# Définir une stratégie de validation croisée (StratifiedKFold)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for clf_name, clf in classifiers.items():
    # Entraîner le classifieur sur l'ensemble d'entraînement
    clf.fit(X_train, y_train)
    
    # Prédire les catégories pour l'ensemble de test
    y_pred = clf.predict(X_test)
    
    # Calculer les métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision_micro = precision_score(y_test, y_pred, average='micro')
    recall_micro = recall_score(y_test, y_pred, average='micro')
    roc_auc_micro = roc_auc_score(label_binarize(y_test, classes=clf.classes_), label_binarize(y_pred, classes=clf.classes_), average='micro')
    precision_macro = precision_score(y_test, y_pred, average='macro')
    recall_macro = recall_score(y_test, y_pred, average='macro')
    roc_auc_macro = roc_auc_score(label_binarize(y_test, classes=clf.classes_), label_binarize(y_pred, classes=clf.classes_), average='macro')
    
    # Stocker les résultats
    results['Classifier'].append(clf_name)
    results['Accuracy'].append(accuracy)
    results['Precision (micro)'].append(precision_micro)
    results['Recall (micro)'].append(recall_micro)
    results['AUC (micro)'].append(roc_auc_micro)
    results['Precision (macro)'].append(precision_macro)
    results['Recall (macro)'].append(recall_macro)
    results['AUC (macro)'].append(roc_auc_macro)

# Afficher les résultats
results_df = pd.DataFrame(results)

# Visualisation des résultats avec des barres
plt.figure(figsize=(12, 12))

# Accuracy
plt.subplot(3, 3, 1)
plt.bar(results_df['Classifier'], results_df['Accuracy'], color='blue', alpha=0.7)
plt.title('Accuracy')

# Precision (micro)
plt.subplot(3, 3, 2)
plt.bar(results_df['Classifier'], results_df['Precision (micro)'], color='green', alpha=0.7)
plt.title('Precision (micro)')

# Recall (micro)
plt.subplot(3, 3, 3)
plt.bar(results_df['Classifier'], results_df['Recall (micro)'], color='orange', alpha=0.7)
plt.title('Recall (micro)')

# AUC (micro)
plt.subplot(3, 3, 4)
plt.bar(results_df['Classifier'], results_df['AUC (micro)'], color='red', alpha=0.7)
plt.title('AUC (micro)')

# Precision (macro)
plt.subplot(3, 3, 5)
plt.bar(results_df['Classifier'], results_df['Precision (macro)'], color='purple', alpha=0.7)
plt.title('Precision (macro)')

# Recall (macro)
plt.subplot(3, 3, 6)
plt.bar(results_df['Classifier'], results_df['Recall (macro)'], color='brown', alpha=0.7)
plt.title('Recall (macro)')

# AUC (macro)
plt.subplot(3, 3, 7)
plt.bar(results_df['Classifier'], results_df['AUC (macro)'], color='pink', alpha=0.7)
plt.title('AUC (macro)')

plt.tight_layout()
plt.show()


In [7]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier

# Supposons que vous ayez une nouvelle structure item-RSS appelée new_data
# new_data doit contenir un champ 'word_occurrences' similaire à votre ensemble d'entraînement

# Charger le meilleur classifieur RandomForest (assurez-vous de remplacer cela par votre meilleur classifieur)
best_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # Remplacez cela par votre meilleur classifieur

# Charger l'ensemble de données existant
existing_data = defi_french

# Diviser l'ensemble de données existant en features (X) et la target (y)
X_existing = existing_data['word_occurrences']
y_existing = existing_data['catégorie']

# Convertir les occurrences de mots en vecteurs
vectorizer = DictVectorizer(sparse=False)
X_existing = vectorizer.fit_transform(X_existing)

# Entraîner le meilleur classifieur sur l'ensemble de données existant
best_classifier.fit(X_existing, y_existing)

# Supposons que new_data contienne un champ 'word_occurrences' similaire à votre ensemble d'entraînement
X_new = dataset_french['word_occurrences']

# Convertir les occurrences de mots en vecteurs (utilisez le même vectorizer)
X_new = vectorizer.transform(X_new)

# Prédire les catégories pour les nouvelles données
y_pred_new = best_classifier.predict(X_new)

# Obtenir les probabilités associées à chaque prédiction
probs_new = best_classifier.predict_proba(X_new)

# Afficher les résultats
print("Predicted:", list(y_pred_new))
print("Label Probability Prediction Order:", best_classifier.classes_)
print("Probs:", probs_new)


MemoryError: Unable to allocate 22.5 GiB for an array with shape (69416, 43428) and data type float64

In [None]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier

# Supposons que vous ayez une nouvelle structure item-RSS appelée new_data
# new_data doit contenir un champ 'word_occurrences' similaire à votre ensemble d'entraînement

# Charger le meilleur classifieur RandomForest (assurez-vous de remplacer cela par votre meilleur classifieur)
best_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # Remplacez cela par votre meilleur classifieur

# Charger l'ensemble de données existant
existing_data = dataset_french

# Diviser l'ensemble de données existant en features (X) et la target (y)
X_existing = existing_data['word_occurrences']
y_existing = existing_data['catégorie']

# Convertir les occurrences de mots en vecteurs
vectorizer = DictVectorizer(sparse=False)
X_existing = vectorizer.fit_transform(X_existing)

# Entraîner le meilleur classifieur sur l'ensemble de données existant
best_classifier.fit(X_existing, y_existing)

# Supposons que new_data contienne un champ 'word_occurrences' similaire à votre ensemble d'entraînement
X_new = defi_french['word_occurrences']

# Convertir les occurrences de mots en vecteurs (utilisez le même vectorizer)
X_new = vectorizer.transform(X_new)

# Prédire les catégories pour les nouvelles données (seulement pour le premier article)
y_pred_new = best_classifier.predict(X_new)

# Obtenir les probabilités associées à chaque prédiction (seulement pour le premier article)
probs_new = best_classifier.predict_proba(X_new)

# Créer un DataFrame avec les résultats pour le premier article
results_df = pd.DataFrame(probs_new, columns=best_classifier.classes_)
results_df.insert(0, 'Classe Prédite', y_pred_new)

# Afficher le DataFrame
results_df
