In [7]:
import os
os.getwd()

AttributeError: module 'os' has no attribute 'getwd'

In [1]:
"""
Script pour t√©l√©charger automatiquement le dataset Kaggle
Dataset: Binary Classification with a Bank Churn Dataset
"""

import os
import zipfile
from pathlib import Path
import pandas as pd

class KaggleDataLoader:
    """
    Classe pour g√©rer le t√©l√©chargement automatique des donn√©es depuis Kaggle
    """
    
    def __init__(self, dataset_name="gauravtopre/bank-customer-churn-dataset"):
        """
        Initialise le loader avec le nom du dataset Kaggle
        
        Args:
            dataset_name (str): Nom du dataset au format 'username/dataset-name'
        """
        self.dataset_name = dataset_name
        self.data_dir = Path("data/raw")
        self.processed_dir = Path("data/processed")
        
        # Cr√©er les dossiers s'ils n'existent pas
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.processed_dir.mkdir(parents=True, exist_ok=True)
    
    def setup_kaggle_credentials(self):
        """
        V√©rifie que les credentials Kaggle sont configur√©s
        
        Returns:
            bool: True si les credentials sont pr√©sents, False sinon
        """
        kaggle_json_path = Path.home() / ".kaggle" / "kaggle.json"
        
        if not kaggle_json_path.exists():
            print("‚ùå Fichier kaggle.json non trouv√©!")
            print("\nüìã Instructions pour configurer l'API Kaggle:")
            print("1. Allez sur https://www.kaggle.com/settings/account")
            print("2. Descendez jusqu'√† 'API' et cliquez sur 'Create New Token'")
            print("3. Un fichier kaggle.json sera t√©l√©charg√©")
            print("4. Placez-le dans: ~/.kaggle/kaggle.json (Linux/Mac) ou C:\\Users\\<username>\\.kaggle\\kaggle.json (Windows)")
            print("5. Sur Linux/Mac, ex√©cutez: chmod 600 ~/.kaggle/kaggle.json")
            return False
        
        print("‚úÖ Credentials Kaggle trouv√©s!")
        return True
    
    def download_dataset(self, force_download=False):
        """
        T√©l√©charge le dataset depuis Kaggle
        
        Args:
            force_download (bool): Force le t√©l√©chargement m√™me si les fichiers existent
            
        Returns:
            bool: True si le t√©l√©chargement est r√©ussi, False sinon
        """
        # V√©rifier les credentials
        if not self.setup_kaggle_credentials():
            return False
        
        try:
            # Importer l'API Kaggle
            from kaggle.api.kaggle_api_extended import KaggleApi
            
            # Initialiser l'API
            api = KaggleApi()
            api.authenticate()
            
            # V√©rifier si les donn√©es existent d√©j√†
            csv_files = list(self.data_dir.glob("*.csv"))
            if csv_files and not force_download:
                print(f"‚úÖ Dataset d√©j√† t√©l√©charg√©: {len(csv_files)} fichier(s) trouv√©(s)")
                return True
            
            print(f"üì• T√©l√©chargement du dataset: {self.dataset_name}...")
            
            # T√©l√©charger le dataset
            api.dataset_download_files(
                self.dataset_name,
                path=self.data_dir,
                unzip=True
            )
            
            print("‚úÖ T√©l√©chargement termin√©!")
            
            # Lister les fichiers t√©l√©charg√©s
            files = list(self.data_dir.glob("*"))
            print(f"\nüìÅ Fichiers t√©l√©charg√©s ({len(files)}):")
            for file in files:
                print(f"  - {file.name}")
            
            return True
            
        except ImportError:
            print("‚ùå Le package 'kaggle' n'est pas install√©!")
            print("   Installez-le avec: pip install kaggle")
            return False
            
        except Exception as e:
            print(f"‚ùå Erreur lors du t√©l√©chargement: {str(e)}")
            return False
    
    def load_data(self, filename=None):
        """
        Charge les donn√©es dans un DataFrame pandas
        
        Args:
            filename (str): Nom du fichier √† charger (None pour auto-d√©tection)
            
        Returns:
            pd.DataFrame: DataFrame avec les donn√©es ou None si erreur
        """
        try:
            # Auto-d√©tection du fichier CSV
            if filename is None:
                csv_files = list(self.data_dir.glob("*.csv"))
                
                if not csv_files:
                    print("‚ùå Aucun fichier CSV trouv√©. T√©l√©chargez d'abord le dataset.")
                    return None
                
                # Prendre le premier fichier CSV ou celui qui contient 'churn' dans le nom
                churn_files = [f for f in csv_files if 'churn' in f.name.lower()]
                filename = churn_files[0].name if churn_files else csv_files[0].name
            
            filepath = self.data_dir / filename
            
            print(f"üìä Chargement des donn√©es depuis: {filepath.name}")
            df = pd.read_csv(filepath)
            
            print(f"‚úÖ Donn√©es charg√©es: {df.shape[0]} lignes √ó {df.shape[1]} colonnes")
            
            return df
            
        except Exception as e:
            print(f"‚ùå Erreur lors du chargement: {str(e)}")
            return None
    
    def get_data_info(self, df):
        """
        Affiche des informations basiques sur le dataset
        
        Args:
            df (pd.DataFrame): DataFrame √† analyser
        """
        if df is None:
            print("‚ùå Aucune donn√©e √† analyser")
            return
        
        print("\n" + "="*60)
        print("üìä INFORMATIONS SUR LE DATASET")
        print("="*60)
        
        print(f"\nüî¢ Dimensions: {df.shape[0]} lignes √ó {df.shape[1]} colonnes")
        
        print("\nüìã Colonnes:")
        for i, col in enumerate(df.columns, 1):
            print(f"  {i:2d}. {col:30s} ({df[col].dtype})")
        
        print("\n‚ùì Valeurs manquantes:")
        missing = df.isnull().sum()
        if missing.sum() == 0:
            print("  ‚úÖ Aucune valeur manquante")
        else:
            for col, count in missing[missing > 0].items():
                pct = (count / len(df)) * 100
                print(f"  - {col}: {count} ({pct:.2f}%)")
        
        # D√©tecter la colonne cible (churn)
        churn_cols = [col for col in df.columns if 'churn' in col.lower() or 'exited' in col.lower()]
        
        if churn_cols:
            target_col = churn_cols[0]
            print(f"\nüéØ Variable cible d√©tect√©e: '{target_col}'")
            print(f"   Distribution:")
            value_counts = df[target_col].value_counts()
            for val, count in value_counts.items():
                pct = (count / len(df)) * 100
                print(f"     - {val}: {count} ({pct:.2f}%)")
            
            # Calculer le taux de churn
            if df[target_col].dtype in ['int64', 'float64']:
                churn_rate = df[target_col].mean() * 100
                print(f"\n   üìà Taux de churn: {churn_rate:.2f}%")
        
        print("\n" + "="*60)


# ============================================================================
# FONCTION D'UTILISATION FACILE
# ============================================================================

def get_churn_data(force_download=False):
    """
    Fonction simplifi√©e pour obtenir les donn√©es de churn
    
    Args:
        force_download (bool): Force le re-t√©l√©chargement des donn√©es
        
    Returns:
        pd.DataFrame: DataFrame avec les donn√©es de churn
    
    Usage:
        >>> df = get_churn_data()
        >>> print(df.head())
    """
    loader = KaggleDataLoader()
    
    # T√©l√©charger si n√©cessaire
    if force_download or not list(loader.data_dir.glob("*.csv")):
        loader.download_dataset(force_download=force_download)
    
    # Charger les donn√©es
    df = loader.load_data()
    
    # Afficher les infos
    if df is not None:
        loader.get_data_info(df)
    
    return df


# ============================================================================
# EXEMPLE D'UTILISATION
# ============================================================================

if __name__ == "__main__":
    print("üöÄ Chargement automatique des donn√©es Kaggle\n")
    
    # M√©thode 1: Utilisation simple
    df = get_churn_data()
    
    if df is not None:
        print("\n‚úÖ Donn√©es pr√™tes √† l'emploi!")
        print("\nüîç Aper√ßu des premi√®res lignes:")
        print(df.head())
    else:
        print("\n‚ùå √âchec du chargement des donn√©es")
    
    # M√©thode 2: Utilisation avanc√©e
    # loader = KaggleDataLoader()
    # loader.download_dataset()
    # df = loader.load_data()


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Easy Services Pro\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Easy Services Pro\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Easy Services Pro\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Easy Services Pro\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Easy Services Pro\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Easy Services Pro\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Easy Services Pro\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Easy Services Pro\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Easy Services Pro\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Easy Services Pro\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Easy Services Pro\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Easy Services Pro\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



üöÄ Chargement automatique des donn√©es Kaggle

‚ùå Fichier kaggle.json non trouv√©!

üìã Instructions pour configurer l'API Kaggle:
1. Allez sur https://www.kaggle.com/settings/account
2. Descendez jusqu'√† 'API' et cliquez sur 'Create New Token'
3. Un fichier kaggle.json sera t√©l√©charg√©
4. Placez-le dans: ~/.kaggle/kaggle.json (Linux/Mac) ou C:\Users\<username>\.kaggle\kaggle.json (Windows)
5. Sur Linux/Mac, ex√©cutez: chmod 600 ~/.kaggle/kaggle.json
‚ùå Aucun fichier CSV trouv√©. T√©l√©chargez d'abord le dataset.

‚ùå √âchec du chargement des donn√©es
