In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import os
import ipywidgets as widgets
from IPython.display import display
pd.options.display.max_columns = None
pd.set_option('display.max_rows', None)  

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
df = pd.read_parquet('df_igdb.parquet')

In [3]:
print(df.isna().sum())

id                            0
name                          0
metacritic               344574
released                      0
website                  292717
rating                        0
playtime                      0
ratings_count                 0
suggestions_count             0
game_series_count             0
reviews_count                 0
platforms                     0
developers                 2301
genres                        0
esrb_rating              298929
year                          0
decennie                      0
Card                          0
RPG                           0
Shooter                       0
Strategy                      0
Action                        0
Sports                        0
Massively Multiplayer         0
Platformer                    0
Puzzle                        0
Educational                   0
Family                        0
Casual                        0
Indie                         0
Simulation                    0
Arcade  

In [4]:
df.head(3)

Unnamed: 0,id,name,metacritic,released,website,rating,playtime,ratings_count,suggestions_count,game_series_count,reviews_count,platforms,developers,genres,esrb_rating,year,decennie,Card,RPG,Shooter,Strategy,Action,Sports,Massively Multiplayer,Platformer,Puzzle,Educational,Family,Casual,Indie,Simulation,Arcade,Fighting,Racing,Board Games,Adventure,Unnamed: 37,PlayStation 5,Commodore / Amiga,SEGA Saturn,SNES,Dreamcast,Nintendo Switch,Atari 5200,Xbox Series S/X,3DO,Atari 7800,SEGA CD,Game Boy Advance,Nintendo 64,Xbox,Atari Lynx,Nintendo DS,Xbox 360,Atari Flashback,Neo Geo,Apple II,Nintendo 3DS,NES,PS Vita,Game Boy,Linux,iOS,Genesis,PC,Jaguar,SEGA 32X,Atari ST,Wii U,Web,PlayStation 3,Game Boy Color,Game Gear,Classic Macintosh,PSP,PlayStation 2,macOS,Atari 8-bit,PlayStation 4,Android,Wii,Atari XEGS,PlayStation,SEGA Master System,GameCube,Xbox One,Atari 2600,Nintendo DSi
0,3498,Grand Theft Auto V,97.0,2013-09-17,http://www.rockstargames.com/V/,4.48,69.0,4289.0,426.0,9.0,4334.0,PC||Xbox Series S/X||PlayStation 5||PlayStatio...,Rockstar North,Action,Mature,2013.0,2010.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1,3328,The Witcher 3: Wild Hunt,92.0,2015-05-18,https://thewitcher.com/en/witcher3,4.67,50.0,3939.0,688.0,6.0,3996.0,PC||Xbox One||Nintendo Switch||PlayStation 4,CD PROJEKT RED,Action||Adventure||RPG,Mature,2015.0,2010.0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
2,4200,Portal 2,95.0,2011-04-18,http://www.thinkwithportals.com/,4.61,11.0,3613.0,589.0,2.0,3645.0,Xbox One||PlayStation 3||PC||Xbox 360||Linux||...,Valve Software,Shooter||Puzzle,Everyone 10+,2011.0,2010.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0


In [5]:
# Traiter les valeurs manquantes
df_cleaned = df.drop(columns=['metacritic', 'esrb_rating', 'website'])
print(df_cleaned.isna().sum())

# Extraire uniquement les colonnes des genres et des plateformes (exclure les colonnes non numériques)
genre_platform_columns = [col for col in df_cleaned.columns if df_cleaned[col].dtype in ['int64', 'float64']]
X = df_cleaned[genre_platform_columns]

# Cible (par exemple une colonne contenant les labels)
y = df_cleaned['genres']  # Remplace 'target_column' par la vraie colonne cible

# Standardisation des données
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

id                          0
name                        0
released                    0
rating                      0
playtime                    0
ratings_count               0
suggestions_count           0
game_series_count           0
reviews_count               0
platforms                   0
developers               2301
genres                      0
year                        0
decennie                    0
Card                        0
RPG                         0
Shooter                     0
Strategy                    0
Action                      0
Sports                      0
Massively Multiplayer       0
Platformer                  0
Puzzle                      0
Educational                 0
Family                      0
Casual                      0
Indie                       0
Simulation                  0
Arcade                      0
Fighting                    0
Racing                      0
Board Games                 0
Adventure                   0
          

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.decomposition import PCA


# Réduction de dimension avec PCA
pca = PCA(n_components=50)  # Ajustez le nombre de composantes en fonction de vos besoins
X_reduced = pca.fit_transform(X_scaled)

# Prendre un sous-échantillon des données
X_sample, _, y_sample, _ = train_test_split(X_reduced, y, test_size=0.1, random_state=42)


In [None]:

# Modèles à tester
models = {
    "RandomForest": RandomForestClassifier(n_jobs=1),
    "SVC": SVC(),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(),
    "DecisionTree": DecisionTreeClassifier(),
    "NaiveBayes": GaussianNB()
}

# Validation croisée et comparaison des modèles
best_model = None
best_score = 0

for model_name, model in models.items():
    scores = cross_val_score(model, X_sample, y_sample, cv=3)  # Utiliser un sous-échantillon et moins de plis
    mean_score = scores.mean()
    print(f"{model_name}: Mean Cross-Validation Score = {mean_score}")
    
    if mean_score > best_score:
        best_score = mean_score
        best_model = model_name

print(f"Best model: {best_model} with a score of {best_score}")




In [15]:
import numpy as np

# 1. Saisie de l'utilisateur
jeu_saisi = input("Saisissez le nom d'un jeu : ")

# 2. Recherche du jeu dans le DataFrame
if jeu_saisi in df_cleaned['name'].values:
    jeu_selectionne = df_cleaned[df_cleaned['name'] == jeu_saisi]
    
    # 3. Extraire les caractéristiques du jeu
    X_jeu = jeu_selectionne[genre_platform_columns].values  # colonnes des genres et plateformes
    
    # 4. Calculer la similarité avec tous les autres jeux
    similarities = cosine_similarity(X_scaled, X_jeu)
    
    # 5. Trier les jeux par similarité décroissante
    similar_indices = np.argsort(similarities.flatten())[::-1]
    
    # 6. Afficher les jeux les plus similaires
    top_n = 5  # Par exemple, recommander les 5 jeux les plus similaires
    jeux_similaires = df_cleaned.iloc[similar_indices[:top_n]]['name']
    
    print(f"Jeux recommandés similaires à '{jeu_saisi}':")
    for jeu in jeux_similaires:
        print(jeu)
else:
    print(f"Le jeu '{jeu_saisi}' n'existe pas dans la base de données.")


Saisissez le nom d'un jeu :  Unreal


Jeux recommandés similaires à 'Unreal':
Froggers (treny)
ASDAMMED
PenguinHood
Break Core
Ninja Ninja (Game Jam)
