In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ============================================
# 1. LOAD THE 3 DATASETS
# ============================================

def _describe_dataset(title, df, target, leading_blank=True):
    """Print a short overview of one dataset.

    Prints a banner with `title`, then the shape, the column list, the first
    rows, the `DataFrame.info()` report and the value counts of the `target`
    column.

    Parameters
    ----------
    title : str
        Text shown between the two 60-character '=' rules.
    df : pandas.DataFrame
        Dataset to describe.
    target : str
        Name of the column whose value distribution is printed.
    leading_blank : bool, default True
        When True, an empty line is printed before the banner.
    """
    rule = "=" * 60
    print(("\n" if leading_blank else "") + rule)
    print(title)
    print(rule)
    print(f"Shape: {df.shape}")
    print(f"\nColonnes:\n{df.columns.tolist()}")
    print(f"\nAperçu:\n{df.head()}")
    # DataFrame.info() writes its report to stdout and returns None, so it
    # must not be interpolated into an f-string: doing so prints the report
    # *before* the header and then a literal "None". Print the header first,
    # then let info() emit the report.
    print("\nInfo:")
    df.info()
    print(f"\nDistribution classe:\n{df[target].value_counts()}")


# Symptoms dataset. The directory name says 600 rows, but the saved output of
# this cell reports Shape: (251, 17) -- NOTE(review): confirm the expected size.
df_symptoms = pd.read_csv('../data/Dataset_600_Lignes/dataset_ready_for_ml.csv')
# First banner of the cell: no blank line before it.
_describe_dataset("DATASET SYMPTÔMES", df_symptoms, 'class', leading_blank=False)

# Clinical dataset (directory name says 10000 rows).
df_clinical = pd.read_csv('../data/Dataset_10000_Lignes/dataset_clinical_ready.csv')
_describe_dataset("DATASET CLINIQUE", df_clinical, 'Diabetes')

# Pima dataset.
df_pima = pd.read_csv('../data/Dataset_Pregnancies/pima_ready_for_ml.csv')
_describe_dataset("DATASET PIMA", df_pima, 'Outcome')

# ============================================
# 2. CHECKS
# ============================================

def _banner(title):
    """Print a blank line, then `title` framed by two 60-character '=' rules."""
    rule = "=" * 60
    print("\n" + rule)
    print(title)
    print(rule)


# Display label paired with each DataFrame, in the order they are reported.
_checked_frames = (
    ("Symptômes", df_symptoms),
    ("Clinique", df_clinical),
    ("Pima", df_pima),
)

# Missing values: total number of null cells in each dataset.
_banner("VÉRIFICATION - VALEURS MANQUANTES")
print("\n".join(
    f"{label}: {frame.isnull().sum().sum()} valeurs manquantes"
    for label, frame in _checked_frames
))

# Data types: how many columns of each dtype every dataset contains.
_banner("VÉRIFICATION - TYPES DE DONNÉES")
print("Tous les datasets devraient avoir des valeurs numériques uniquement")
print("\n".join(
    f"\n{label} - types:\n{frame.dtypes.value_counts()}"
    for label, frame in _checked_frames
))

# ============================================
# 3. SUMMARY FOR MODELLING
# ============================================

# (display name, DataFrame, target column) for each dataset summarised below.
_summary_inputs = [
    ('Symptômes', df_symptoms, 'class'),
    ('Clinique', df_clinical, 'Diabetes'),
    ('Pima', df_pima, 'Outcome'),
]


def _collect_data_issues(named_frames):
    """Return a list of human-readable problems found in the datasets.

    For every `(name, frame, target)` triple, three conditions are checked:
    the frame contains no null cell, every column has a numeric or boolean
    dtype, and the target column only holds the values 0 and 1.

    Parameters
    ----------
    named_frames : iterable of (str, pandas.DataFrame, str)
        Display name, dataset and target column name.

    Returns
    -------
    list of str
        One message per failed check; empty when every check passes.
    """
    issues = []
    for name, frame, target in named_frames:
        n_missing = int(frame.isnull().sum().sum())
        if n_missing:
            issues.append(f"{name}: {n_missing} valeurs manquantes")
        non_numeric = frame.select_dtypes(exclude=['number', 'bool']).columns.tolist()
        if non_numeric:
            issues.append(f"{name}: colonnes non numériques {non_numeric}")
        # The positive-case count below is only meaningful for a 0/1 target.
        if not frame[target].isin([0, 1]).all():
            issues.append(f"{name}: la cible '{target}' n'est pas binaire (0/1)")
    return issues


datasets_info = pd.DataFrame({
    'Dataset': [name for name, frame, target in _summary_inputs],
    'Taille': [frame.shape[0] for name, frame, target in _summary_inputs],
    # Every column except the target is counted as a feature.
    'Features': [frame.shape[1] - 1 for name, frame, target in _summary_inputs],
    'Target_Column': [target for name, frame, target in _summary_inputs],
    # Count rows whose target equals 1 instead of summing the column, so a
    # non-binary or non-numeric target cannot silently produce a wrong count.
    'Diabétiques': [frame[target].eq(1).sum() for name, frame, target in _summary_inputs],
})

datasets_info['% Diabétiques'] = (
    datasets_info['Diabétiques'].div(datasets_info['Taille']).mul(100).round(2)
)

print("\n" + "="*60)
print("RÉSUMÉ DES DATASETS")
print("="*60)
print(datasets_info)

# Only announce success when the checks actually pass; the original printed
# the success line unconditionally.
_issues = _collect_data_issues(_summary_inputs)
if _issues:
    print("\n⚠️ PROBLÈMES DÉTECTÉS - À CORRIGER AVANT MODÉLISATION:")
    print("\n".join(f"  - {issue}" for issue in _issues))
else:
    print("\n✅ DONNÉES VÉRIFIÉES - PRÊT POUR MODÉLISATION!")

DATASET SYMPTÔMES
Shape: (251, 17)

Colonnes:
['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring', 'Itching', 'Irritability', 'delayed healing', 'partial paresis', 'muscle stiffness', 'Alopecia', 'Obesity', 'class']

Aperçu:
        Age  Gender  Polyuria  Polydipsia  sudden weight loss  weakness  \
0 -0.709103       1         0           1                   0         1   
1  0.730775       1         0           0                   0         1   
2 -0.629110       1         1           0                   0         1   
3 -0.309137       1         0           0                   1         1   
4  0.890761       1         1           1                   1         1   

   Polyphagia  Genital thrush  visual blurring  Itching  Irritability  \
0           0               0                0        1             0   
1           0               0                1        0             0   
2           1               0