# SAE Modélisation Mathématique

MARIE Nathan <br>
COUET Benjamin

### Packages et librairies

In [None]:
%pip install pandas
%pip install tqdm
%pip install spacy
!python3 -m spacy download en_core_web_sm
!python -m spacy download en_core_web_sm
%pip install scikit-learn

import pandas as pd
import os
import json
import gzip
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import spacy
from tqdm import tqdm

### Définition des variables globales

In [None]:
# ANSI escape sequences used to style the console messages printed below.
BOLD = '\033[1m' # ACTIONS
BLUE = '\033[94m' # ACTIONS
RESET = '\033[0m' # restores the default terminal style
RED = '\033[91m' # ERRORS
GREEN = '\033[92m' # SUCCESS
YELLOW = '\033[93m' # INFORMATION

### Nettoyage des données

In [None]:
def traitementData():
    """Load the raw Video Games reviews, clean them and split them by year.

    Reads ``./data/Video_Games.json.gz`` (one JSON object per line), drops the
    ``style`` and ``image`` columns, removes rows without ``reviewText``,
    fills missing ``vote`` values with ``0.0`` and parses ``reviewTime``.
    One JSON-lines file per review year is then written under
    ``./split_data`` (files that already exist are left untouched) and
    duplicated reviews are reported for each year.

    Progress messages and summary statistics are printed to stdout; the
    function returns nothing.
    """
    print(BOLD+BLUE+"\n\nChargement des données...\n\n")

    def parse(path):
        # The context manager closes the gzip handle once the generator is
        # exhausted (or discarded) instead of leaking it.
        with gzip.open(path, 'rb') as g:
            for l in g:
                yield json.loads(l)

    def getDF(path):
        # Rows are keyed by their position in the file.
        return pd.DataFrame.from_dict(dict(enumerate(parse(path))), orient='index')

    df = getDF("./data/Video_Games.json.gz")

    df = df.drop(['style', 'image'], axis=1)

    df = df.dropna(subset=["reviewText"])

    df['vote'] = df['vote'].fillna(0.0)

    if not df['reviewText'].notna().all():
        print(RED+"Certains reviewText sont null\n")
    else:
        print(GREEN+"Aucun reviewText null\n")

    df['reviewTime'] = pd.to_datetime(df['reviewTime'], format='%m %d, %Y')

    df['year'] = df['reviewTime'].dt.year

    os.makedirs('./split_data', exist_ok=True)

    for year, group in df.groupby('year'):
        if not os.path.exists(f'./split_data/reviews_{year}.json'):
            group.to_json(f'./split_data/reviews_{year}.json', orient='records', lines=True)
            print(GREEN + f'Fichier JSON créé pour l\'année {year}' + RESET)
        else:
            print(GREEN + f'Fichier JSON pour l\'année {year} existe déjà !' + RESET)

        # drop_duplicates(inplace=True) returns None, so its result cannot be
        # used to detect duplicates. duplicated() flags every row that repeats
        # a later (asin, reviewerID, vote) triple, which makes the report real.
        # This is reporting only: the exported files are not modified.
        doublons = group[group.duplicated(subset=["asin", "reviewerID", "vote"], keep='last')]
        if not doublons.empty:
            print(f"{RED}Problème de doublons sur : {doublons}")
        else:
            print(f"{YELLOW}PAS DE DOUBLONS")

    print(BLUE+'*\n\n'+RESET+BOLD+"Dimensions du DataFrame => "+YELLOW+f"{df.shape}" + RESET)
    print(BOLD+"\nColonnes => "+YELLOW+f"{df.columns.to_list()}"+RESET)
    print(BOLD+"\nNombre de textes d'avis null => "+YELLOW+f"{df['reviewText'].isnull().sum()}"+RESET)
    print(BOLD+"\nNombre de titres d'avis null => "+YELLOW+f"{df['summary'].isnull().sum()}"+RESET)
    print(BOLD+"\nNombre de notes null => "+YELLOW+f"{df['overall'].isnull().sum()}"+RESET)
    print(BOLD+"\nNombre de votes null => "+YELLOW+f"{df['vote'].isnull().sum()}"+RESET)

In [None]:
# Run the cleaning pipeline; it writes the ./split_data/reviews_<year>.json files.
traitementData()

### Sélection du jeu de données à utiliser

In [None]:
# Work on the reviews of a single year (file written by traitementData()).
df = pd.read_json('./split_data/reviews_2000.json', lines=True)
# Drop empty reviews. The explicit copy makes df an independent frame, so the
# column assignments that follow do not trigger SettingWithCopyWarning.
df = df[df['reviewText'] != ''].copy()

# Binary target: 1 when the rating is strictly above 3, 0 otherwise.
df['label'] = df['overall'].apply(lambda x: 1 if x > 3 else 0)

### Traitement linguistique avec Spacy


In [None]:
# Small English spaCy pipeline used by preprocess().
# No progress bar is created here: preprocess() builds its own tqdm bar, and a
# bar that is never updated nor closed would only display a stuck 0% line.
nlp = spacy.load("en_core_web_sm")


def preprocess(text):
    """Lemmatize documents and strip stop words and punctuation.

    ``text`` is a sized collection of raw strings. It is streamed through the
    module-level spaCy pipeline ``nlp`` in batches of 50 while a tqdm bar
    reports progress. Returns a list holding, for each input document and in
    input order, its remaining lemmas joined by single spaces.
    """
    cleaned = []
    docs = nlp.pipe(text, batch_size=50)
    for doc in tqdm(docs, total=len(text), desc="Prétraitement des textes"):
        lemmas = [tok.lemma_ for tok in doc if not (tok.is_stop or tok.is_punct)]
        cleaned.append(' '.join(lemmas))
    return cleaned

def traitementLinguistique():
    """Add a ``processed_text`` column to the global ``df``.

    Every ``reviewText`` value is passed through ``preprocess``. Any exception
    raised along the way is caught and reported on stdout instead of being
    propagated.
    """
    try:
        raw_reviews = df['reviewText'].tolist()
        df['processed_text'] = preprocess(raw_reviews)
        print(f"{GREEN}\nTraitement liguistique réussi{RESET}")
    except Exception as err:
        print(f"{RED}\nErreur lors du traitement linguistique => {RESET}{YELLOW}{err}{RESET}")

# Populate df['processed_text'], which the vectorization cells read.
traitementLinguistique()

# Classifieur Binaire

### Vectorisation des commentaires


In [None]:
# Bag-of-words features from the lemmatized reviews; binary label as target.
vector = CountVectorizer()
X = vector.fit_transform(df['processed_text'])
Y = df['label']
print(YELLOW+"VECTOR SHAPE : "+RESET+f"{X.shape}")

# Hold out 20% of the samples for evaluation (fixed seed for reproducibility).
try:
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print(GREEN+"Données splittées avec succès"+RESET)
except Exception as e:
    # Report the actual cause; a bare except would also swallow KeyboardInterrupt.
    print(RED+"Erreur lors du split des données => "+RESET+YELLOW+f"{e}"+RESET)

### Création du classifieur

In [None]:
# Tune the number of neighbours of a KNN classifier by grid search.
print(BLUE+"Création du modèle KNN..."+RESET)
try:
    knn = KNeighborsClassifier()
    knnSearch = GridSearchCV(knn, {'n_neighbors': [3, 5, 7, 9]})
    knnSearch.fit(X_train, Y_train)
    print(GREEN+"Modèle KNN créé avec succès"+RESET)
except Exception as e:
    # Report the actual cause; a bare except would also swallow KeyboardInterrupt.
    print(RED+"Erreur lors de la création du modèle KNN => "+RESET+YELLOW+f"{e}"+RESET)

### Évaluation du classifieur

In [None]:
# Score the tuned binary KNN on the held-out split.
predictions = knnSearch.predict(X_test)
knn_report = classification_report(Y_test, predictions)
print(f"{YELLOW}Résultats du modèle KNN :{RESET}")
print(knn_report)

### Test du classifieur

In [None]:
# Spot-check the tuned binary KNN on three hand-picked reviews.
review_long_positive = "I think I've already written a review for this, but I wanted to add that the engine is much improved over the origanal.  In the first unreal I got an average of 10-20 fps DURING gameplay and 50 or so max (not using timedemo 1 during intro) and in UT I get 40-60+ average and even 100+ max.  This game has been so incredebly improved that I get higher frame rates  with better graphics!"
review_short_positive = 'I love this game !'
review_negative = "Where to begin?  How about...not fun.  Bad graphics+bad gameplay+bad sound+no replay value=bad game.  The saddest part is that Eidos put so much money into this game that good studios (i.e. Looking Glass) ended up  shutting down.  Quake II is still more fun than this!  They should have  pulled the plug on Ion Storm a long time ago."

# Each review goes through the same preprocessing and vectorizer as the training data.
test1 = knnSearch.predict(vector.transform(preprocess([review_long_positive])))
test2 = knnSearch.predict(vector.transform(preprocess([review_short_positive])))
test3 = knnSearch.predict(vector.transform(preprocess([review_negative])))

print(f"{BLUE}Test du modèle KNN :{RESET}")
# YELLOW stays active across the three result lines and is reset on the last one.
print(f"{YELLOW}test1 (expected 1) : {test1}")
print(f"test2 (expected 1) : {test2}")
print(f"test3 (expected 0) : {test3}{RESET}")

# Classifieur Multiclasses

### Vectorisation des commentaires


In [None]:
# Bag-of-words features from the lemmatized reviews; the raw rating
# ('overall') is the multiclass target.
vector = CountVectorizer()
X = vector.fit_transform(df['processed_text'])
Y = df['overall']
print(YELLOW+"VECTOR SHAPE : "+RESET+f"{X.shape}")

# Hold out 20% of the samples for evaluation (fixed seed for reproducibility).
try:
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print(GREEN+"Données splittées avec succès"+RESET)
except Exception as e:
    # Report the actual cause; a bare except would also swallow KeyboardInterrupt.
    print(RED+"Erreur lors du split des données => "+RESET+YELLOW+f"{e}"+RESET)

### Création du classifieur multiclasse

In [None]:
# Tune the number of neighbours of a multiclass KNN classifier by grid search.
print(BLUE+"Création du modèle KNN multiclasse..."+RESET)
try:
    knnMulti = KNeighborsClassifier()
    knnSearchMulti = GridSearchCV(knnMulti, {'n_neighbors': [3, 5, 7, 9]})
    knnSearchMulti.fit(X_train, Y_train)
    print(GREEN+"Modèle KNN multiclasse créé avec succès"+RESET)
except Exception as e:
    # Report the actual cause; a bare except would also swallow KeyboardInterrupt.
    print(RED+"Erreur lors de la création du modèle KNN multiclasse => "+RESET+YELLOW+f"{e}"+RESET)

### Évaluation du classifieur multiclasse

In [None]:
# Spot-check the multiclass KNN on three hand-picked reviews.
# The third review uses a plain "/" in "has/had": "\/" is not a valid Python
# escape sequence and would leave a literal backslash in the text, whereas the
# training reviews were JSON-decoded and therefore contain "/".
test1 = knnSearchMulti.predict(vector.transform(preprocess(["I think I've already written a review for this, but I wanted to add that the engine is much improved over the origanal.  In the first unreal I got an average of 10-20 fps DURING gameplay and 50 or so max (not using timedemo 1 during intro) and in UT I get 40-60+ average and even 100+ max.  This game has been so incredebly improved that I get higher frame rates  with better graphics!"])))
test2 = knnSearchMulti.predict(vector.transform(preprocess(["Where to begin?  How about...not fun.  Bad graphics+bad gameplay+bad sound+no replay value=bad game.  The saddest part is that Eidos put so much money into this game that good studios (i.e. Looking Glass) ended up  shutting down.  Quake II is still more fun than this!  They should have  pulled the plug on Ion Storm a long time ago."])))
test6 = knnSearchMulti.predict(vector.transform(preprocess(["Well we all now think that the DreamCast has the best game, it's the best there is in the market, and it beats PS2. Well that may be true. Even though I don't have DC, but looking at the records, did to the Sega fans I doubt DreamCast would become the best out in the market. If you remember to the older versions of Sega, what happened to their games? Yup all gone, they don't send anymore to the US evem before the DC came out, most people may have even forgot there was a Sega company. For those people who has/had Sega Saturn would know, it came out with many title of games and the next two years what happened to those games, well they stopped manufacturing them. Also what happend to the Sega Genesis console games??? Well same thing happened, they just stopped shipments. I dunno to much about history but so far I know one thing, history always repeats but in a different way, this time Sega's \"best console\" will just as well do the same thing like what it did in the past. They just stopped making the games. Well personally if your going to buy any console, I think that you should buy PS2 or wait for the X-Box, but not the next Ninetendo product (Dolphin) because they just don't make good games. Although I must admit Dreamcast does have good graphics. I also think that people who say the graphics of DC is better then PS2, I think they're just to weak to admit the two systems's graphics has no difference."])))

print(BLUE+"Test du modèle KNN multiclasses :"+RESET)
# YELLOW stays active across the three result lines and is reset on the last one.
print(YELLOW+f"test1 (expected 5) : {test1}")
print(f"test2 (expected 1) : {test2}")
print(f"test3 (expected 1) : {test6}"+RESET)
# Full evaluation on the held-out split.
predictions = knnSearchMulti.predict(X_test)
print(YELLOW+"Résultats du modèle KNN multiclasse :"+RESET)
print(classification_report(Y_test, predictions))






# Gradient Boosting Classifier

### Création du classifieur gradient boosting

In [None]:
# Fit a gradient boosting classifier on the current train split.
print(BLUE+"Création du modèle GB_Classifier..."+RESET)
try:
    GB_Classifier = GradientBoostingClassifier()
    GB_Classifier.fit(X_train, Y_train)
    print(GREEN+"Modèle GB_Classifier créé avec succès"+RESET)
except Exception as e:
    # Report the actual cause; a bare except would also swallow KeyboardInterrupt.
    print(RED+"Erreur lors de la création du modèle GB_Classifier => "+RESET+YELLOW+f"{e}"+RESET)

### Évaluation du classifieur gradient boosting

In [None]:
# Spot-check the gradient boosting classifier on three hand-picked reviews.
# The third review uses a plain "/" in "has/had": "\/" is not a valid Python
# escape sequence and would leave a literal backslash in the text, whereas the
# training reviews were JSON-decoded and therefore contain "/".
test1 = GB_Classifier.predict(vector.transform(preprocess(["I think I've already written a review for this, but I wanted to add that the engine is much improved over the origanal.  In the first unreal I got an average of 10-20 fps DURING gameplay and 50 or so max (not using timedemo 1 during intro) and in UT I get 40-60+ average and even 100+ max.  This game has been so incredebly improved that I get higher frame rates  with better graphics!"])))
test2 = GB_Classifier.predict(vector.transform(preprocess(["Where to begin?  How about...not fun.  Bad graphics+bad gameplay+bad sound+no replay value=bad game.  The saddest part is that Eidos put so much money into this game that good studios (i.e. Looking Glass) ended up  shutting down.  Quake II is still more fun than this!  They should have  pulled the plug on Ion Storm a long time ago."])))
test6 = GB_Classifier.predict(vector.transform(preprocess(["Well we all now think that the DreamCast has the best game, it's the best there is in the market, and it beats PS2. Well that may be true. Even though I don't have DC, but looking at the records, did to the Sega fans I doubt DreamCast would become the best out in the market. If you remember to the older versions of Sega, what happened to their games? Yup all gone, they don't send anymore to the US evem before the DC came out, most people may have even forgot there was a Sega company. For those people who has/had Sega Saturn would know, it came out with many title of games and the next two years what happened to those games, well they stopped manufacturing them. Also what happend to the Sega Genesis console games??? Well same thing happened, they just stopped shipments. I dunno to much about history but so far I know one thing, history always repeats but in a different way, this time Sega's \"best console\" will just as well do the same thing like what it did in the past. They just stopped making the games. Well personally if your going to buy any console, I think that you should buy PS2 or wait for the X-Box, but not the next Ninetendo product (Dolphin) because they just don't make good games. Although I must admit Dreamcast does have good graphics. I also think that people who say the graphics of DC is better then PS2, I think they're just to weak to admit the two systems's graphics has no difference."])))

print(BLUE+"Test du modèle GB :"+RESET)
# YELLOW stays active across the three result lines and is reset on the last one.
print(YELLOW+f"test1 (expected 5) : {test1}")
print(f"test2 (expected 1) : {test2}")
print(f"test3 (expected 1) : {test6}"+RESET)
# Full evaluation on the held-out split.
predictions = GB_Classifier.predict(X_test)
print(YELLOW+"Résultats du modèle GB :"+RESET)
print(classification_report(Y_test, predictions))

# Random Forest Classifier

### Création du classifieur random forest

In [None]:
# Fit a 100-tree random forest (fixed seed) on the current train split.
# The start message names RD_Classifier so it matches the model built here.
print(BLUE+"Création du modèle RD_Classifier..."+RESET)
try:
    RD_Classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    RD_Classifier.fit(X_train, Y_train)
    print(GREEN+"Modèle RD_Classifier créé avec succès"+RESET)
except Exception as e:
    # Report the actual cause; a bare except would also swallow KeyboardInterrupt.
    print(RED+"Erreur lors de la création du modèle RD_Classifier => "+RESET+YELLOW+f"{e}"+RESET)

### Évaluation du classifieur random forest

In [None]:
# Spot-check the random forest classifier on three hand-picked reviews.
# The third review uses a plain "/" in "has/had": "\/" is not a valid Python
# escape sequence and would leave a literal backslash in the text, whereas the
# training reviews were JSON-decoded and therefore contain "/".
test1 = RD_Classifier.predict(vector.transform(preprocess(["I think I've already written a review for this, but I wanted to add that the engine is much improved over the origanal.  In the first unreal I got an average of 10-20 fps DURING gameplay and 50 or so max (not using timedemo 1 during intro) and in UT I get 40-60+ average and even 100+ max.  This game has been so incredebly improved that I get higher frame rates  with better graphics!"])))
test2 = RD_Classifier.predict(vector.transform(preprocess(["Where to begin?  How about...not fun.  Bad graphics+bad gameplay+bad sound+no replay value=bad game.  The saddest part is that Eidos put so much money into this game that good studios (i.e. Looking Glass) ended up  shutting down.  Quake II is still more fun than this!  They should have  pulled the plug on Ion Storm a long time ago."])))
test6 = RD_Classifier.predict(vector.transform(preprocess(["Well we all now think that the DreamCast has the best game, it's the best there is in the market, and it beats PS2. Well that may be true. Even though I don't have DC, but looking at the records, did to the Sega fans I doubt DreamCast would become the best out in the market. If you remember to the older versions of Sega, what happened to their games? Yup all gone, they don't send anymore to the US evem before the DC came out, most people may have even forgot there was a Sega company. For those people who has/had Sega Saturn would know, it came out with many title of games and the next two years what happened to those games, well they stopped manufacturing them. Also what happend to the Sega Genesis console games??? Well same thing happened, they just stopped shipments. I dunno to much about history but so far I know one thing, history always repeats but in a different way, this time Sega's \"best console\" will just as well do the same thing like what it did in the past. They just stopped making the games. Well personally if your going to buy any console, I think that you should buy PS2 or wait for the X-Box, but not the next Ninetendo product (Dolphin) because they just don't make good games. Although I must admit Dreamcast does have good graphics. I also think that people who say the graphics of DC is better then PS2, I think they're just to weak to admit the two systems's graphics has no difference."])))

# The header names the random forest so it matches the model under test.
print(BLUE+"Test du modèle Random forest :"+RESET)
# YELLOW stays active across the three result lines and is reset on the last one.
print(YELLOW+f"test1 (expected 5) : {test1}")
print(f"test2 (expected 1) : {test2}")
print(f"test3 (expected 1) : {test6}"+RESET)
# Full evaluation on the held-out split.
predictions = RD_Classifier.predict(X_test)
print(YELLOW+"Résultats du modèle Random forest :"+RESET)
print(classification_report(Y_test, predictions))