# SAE Modélisation Mathématique

MARIE Nathan <br>
COUET Benjamin

### Packages et bibliothèques

In [8]:
%pip install pandas
%pip install tqdm
%pip install spacy
!python3 -m spacy download en_core_web_sm
!python -m spacy download en_core_web_sm
%pip install scikit-learn

import pandas as pd
from tqdm import tqdm
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import spacy
from tqdm import tqdm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Note: you may need to restart the kernel to use updated packages.


### Définition des variables globales

In [9]:
# ANSI escape sequences used to style the messages printed by the notebook.

# Text attributes
RESET = '\033[0m'    # ends the current styling
BOLD = '\033[1m'     # actions

# Foreground colours
RED = '\033[91m'     # errors
GREEN = '\033[92m'   # success
YELLOW = '\033[93m'  # information
BLUE = '\033[94m'    # actions

### Nettoyage des données
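Ce nettoyage consiste à charger le fichier brut `Video_Games.json`, à supprimer les colonnes `style` et `image`, à remplacer les votes manquants par 0, à convertir `reviewTime` en date, puis à enregistrer un fichier JSON par année dans `./split_data`.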

In [10]:
def traitementData(data_path='./data/Video_Games.json', out_dir='./split_data'):
    """Load the raw reviews, clean them and write one JSON-lines file per year.

    Steps performed:
      1. read ``data_path`` (JSON lines) in chunks of 1000 rows;
      2. drop the ``style`` and ``image`` columns when present;
      3. replace missing ``vote`` values with 0.0;
      4. parse ``reviewTime`` (format ``%m %d, %Y``) and derive a ``year`` column;
      5. for every year, drop rows duplicated on (asin, reviewerID, vote),
         keeping the last occurrence, then write
         ``<out_dir>/reviews_<year>.json`` unless that file already exists;
      6. print the shape, the columns and the null counts of the full frame.

    Args:
        data_path: Path of the JSON-lines source file.
        out_dir: Directory receiving the per-year files (created if missing).

    Returns:
        None. Results are reported through printed messages and written files.

    Raises:
        FileNotFoundError: propagated from ``pd.read_json`` when ``data_path``
            does not exist.
    """
    dedup_keys = ["asin", "reviewerID", "vote"]

    print(BOLD+BLUE+"\n\nChargement des données...\n\n*")
    # The chunked reader is used as a context manager so the file handle is
    # released even if concatenation fails.
    with pd.read_json(data_path, lines=True, chunksize=1000) as reader:
        df = pd.concat([chunk for chunk in tqdm(reader, desc=BLUE+'Chargement des données')])

    # errors='ignore' keeps the function usable on inputs lacking these columns.
    df = df.drop(columns=['style', 'image'], errors='ignore')

    df['vote'] = df['vote'].fillna(0.0)
    df['reviewTime'] = pd.to_datetime(df['reviewTime'], format='%m %d, %Y')
    df['year'] = df['reviewTime'].dt.year

    os.makedirs(out_dir, exist_ok=True)

    for year, group in df.groupby('year'):
        # Detect duplicates BEFORE writing, so the saved file is clean.
        # (drop_duplicates(inplace=True) returns None, so the previous check
        # could never report anything.)
        dup_mask = group.duplicated(subset=dedup_keys, keep='last')
        nb_dup = int(dup_mask.sum())
        if nb_dup:
            print(f"{RED}Problème de doublons sur : {year} ({nb_dup} doublon(s) détecté(s)){RESET}")
            group = group[~dup_mask]
        else:
            print(f"{YELLOW}PAS DE DOUBLONS{RESET}")

        target = os.path.join(out_dir, f'reviews_{year}.json')
        if not os.path.exists(target):
            group.to_json(target, orient='records', lines=True)
            print(GREEN + f'Fichier JSON créé pour l\'année {year}' + RESET)
        else:
            print(GREEN + f'Fichier JSON pour l\'année {year} existe déjà !' + RESET)

    # Summary of the full (all years) frame.
    print(BLUE+'*\n\n'+RESET+BOLD+"Dimensions du DataFrame => "+YELLOW+f"{df.shape}" + RESET)
    print(BOLD+"\nColonnes => "+YELLOW+f"{df.columns.to_list()}"+RESET)
    print(BOLD+"\nNombre de textes d'avis null => "+YELLOW+f"{df['reviewText'].isnull().sum()}"+RESET)
    print(BOLD+"\nNombre de titres d'avis null => "+YELLOW+f"{df['summary'].isnull().sum()}"+RESET)
    print(BOLD+"\nNombre de notes null => "+YELLOW+f"{df['overall'].isnull().sum()}"+RESET)
    print(BOLD+"\nNombre de votes null => "+YELLOW+f"{df['vote'].isnull().sum()}"+RESET)

In [11]:
# Run the cleaning/splitting step. It reads ./data/Video_Games.json, which must
# exist; otherwise pandas raises FileNotFoundError.
traitementData()

[1m[94m

Chargement des données...

*


FileNotFoundError: File ./data/Video_Games.json does not exist

### Sélection du jeu de données à utiliser

In [None]:
# Load the reviews of year 2000 (file produced by the per-year split).
df = pd.read_json('./split_data/reviews_2000.json', lines=True)

# Keep only reviews whose text is present and non-empty. The explicit .copy()
# makes the result an independent frame, so adding a column below cannot
# trigger pandas' SettingWithCopyWarning.
has_text = df['reviewText'].notna() & (df['reviewText'] != '')
df = df.loc[has_text].copy()

# Binary label: 1 when the rating is strictly above 3, otherwise 0.
df['label'] = (df['overall'] > 3).astype('int64')

# Sanity check; RESET is appended so the colour does not leak into later output.
if df['reviewText'].isna().any():
    print(RED+"Certains reviewText sont null"+RESET)
else:
    print(GREEN+"Aucun reviewText null"+RESET)

[92mAucun reviewText null


### Traitement linguistique avec Spacy


In [None]:
# spaCy pipeline used to tokenise and lemmatise the reviews.
nlp = spacy.load("en_core_web_sm")

# Shared progress bar: preprocess() advances it by one for every text it handles.
t = tqdm(desc="Prétraitement des textes", total=len(df))

def preprocess(text):
    """Return the lemmas of ``text`` joined by single spaces.

    Tokens flagged as stop words or punctuation are left out. Each call also
    advances the module-level progress bar ``t`` by one step.
    """
    tokens = nlp(text)
    t.update(1)

    lemmas = []
    for tok in tokens:
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)

def traitementLinguistique():
    """Add a ``processed_text`` column to the global ``df``.

    Every ``reviewText`` value is passed through ``preprocess``. On success a
    confirmation message is printed. On failure the error is printed and then
    re-raised, so the notebook stops here instead of failing later on a
    missing ``processed_text`` column. The shared progress bar ``t`` is
    closed in every case so its final state is flushed to the output.

    Raises:
        Exception: whatever ``preprocess`` (or the column assignment) raised.
    """
    print(BOLD+BLUE+"\n\nTraitement linguistique...\n\n*")
    try:
        df['processed_text'] = df['reviewText'].apply(preprocess)
    except Exception as err:
        # Only Exception is caught: KeyboardInterrupt must still be able to
        # abort this long-running step.
        print(RED+"\nErreur lors du traitement linguistique"+RESET)
        print(RED+repr(err)+RESET)
        raise
    else:
        print(GREEN+"\nTraitement liguistique réussi"+RESET)
    finally:
        t.close()

# Run the preprocessing; on success it adds the 'processed_text' column to df.
traitementLinguistique()

Prétraitement des textes:   0%|          | 2/9963 [00:00<08:18, 19.97it/s]

[1m[94m

Traitement linguistique...

*


Prétraitement des textes: 100%|█████████▉| 9962/9963 [06:40<00:00, 43.19it/s]

[92m
Traitement liguistique réussi[0m


### Vectorisation des commentaires


In [None]:
# Target: the binary label column; features: token counts of the preprocessed text.
Y = df['label']
vector = CountVectorizer()
X = vector.fit_transform(df['processed_text'])
print(YELLOW + "VECTOR SHAPE : " + RESET + str(X.shape))

# Keep 20 % of the samples aside for evaluation (fixed seed for a repeatable split).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, test_size=0.2
)

[93mVECTOR SHAPE : [0m(9963, 29254)


### Création du classifieur

In [None]:
print(BLUE+"Création du modèle KNN..."+RESET)
try:
    # Grid search over the number of neighbours.
    knn = KNeighborsClassifier()
    knnSearch = GridSearchCV(knn, {'n_neighbors': [3, 5, 7, 9]})
    knnSearch.fit(X_train, Y_train)
except Exception as err:
    # Report the actual cause and stop here: swallowing the error would leave
    # a missing or unfitted `knnSearch` for the following cells.
    print(RED+"Erreur lors de la création du modèle KNN"+RESET)
    print(RED+repr(err)+RESET)
    raise
else:
    print(GREEN+"Modèle KNN créé avec succès"+RESET)

[94mCréation du modèle KNN...[0m
[92mModèle KNN créé avec succès[0m


### Évaluation du classifieur

In [None]:
# Predict on the held-out split and print the per-class metrics.
predictions = knnSearch.predict(X_test)
print(YELLOW + "Résultats du modèle KNN :" + RESET)
print(classification_report(y_true=Y_test, y_pred=predictions))

[93mRésultats du modèle KNN :[0m
              precision    recall  f1-score   support

           0       0.58      0.04      0.08       448
           1       0.78      0.99      0.87      1545

    accuracy                           0.78      1993
   macro avg       0.68      0.52      0.48      1993
weighted avg       0.73      0.78      0.69      1993



### Test du classifieur

In [None]:

# The vectorizer was fitted on text cleaned by preprocess() (lemmas, no stop
# words, no punctuation). The sample sentences must go through the same
# cleaning, otherwise their tokens do not match the training vocabulary.
# Note: preprocess() also advances the shared progress bar `t`.
test2 = knnSearch.predict(vector.transform([preprocess('I love this game !')]))
test3 = knnSearch.predict(vector.transform([preprocess('I like this game !')]))
test4 = knnSearch.predict(vector.transform([preprocess("Who created this game ? It's incredible !")]))

print(BLUE+"Test du modèle KNN :"+RESET)
print(f"test2 : {test2}")
print(f"test3 : {test3}")
print(f"test4 : {test4}")

NameError: name 'knnSearch' is not defined

### Vectorisation des commentaires (multiclasse)


In [None]:
# Multiclass variant: the target is the raw 'overall' rating instead of 'label'.
# This re-binds vector, X, Y and the train/test splits.
Y = df['overall']
vector = CountVectorizer()
X = vector.fit_transform(df['processed_text'])
print(YELLOW + "VECTOR SHAPE : " + RESET + str(X.shape))

# Keep 20 % of the samples aside for evaluation (fixed seed for a repeatable split).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, test_size=0.2
)

### Création du classifieur multiclasse

In [None]:
print(BLUE+"Création du modèle KNN..."+RESET)
try:
    # Grid search over the number of neighbours, fitted on the current
    # X_train / Y_train split.
    knn = KNeighborsClassifier()
    knnSearch = GridSearchCV(knn, {'n_neighbors': [3, 5, 7, 9]})
    knnSearch.fit(X_train, Y_train)
except Exception as err:
    # Report the actual cause and stop here: swallowing the error would leave
    # a missing, unfitted or stale `knnSearch` for the evaluation cell.
    print(RED+"Erreur lors de la création du modèle KNN"+RESET)
    print(RED+repr(err)+RESET)
    raise
else:
    print(GREEN+"Modèle KNN créé avec succès"+RESET)

### Évaluation du classifieur multiclasse

In [None]:
# Predict on the current held-out split and print one metrics row per class.
predictions = knnSearch.predict(X_test)
print(YELLOW + "Résultats du modèle KNN :" + RESET)
print(classification_report(y_true=Y_test, y_pred=predictions))