# Les imports

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,lit,concat
import os, sys

In [3]:
# Set both PySpark interpreter environment variables to the interpreter
# that is executing this notebook.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [4]:
# Build (or reuse) a session on the "local[*]" master whose Python settings
# point at the interpreter executing this notebook.
builder = SparkSession.builder.master("local[*]")
for conf_key, conf_value in (
    ("spark.pyspark.python", sys.executable),
    ("spark.pyspark.driver.python", sys.executable),
    ("spark.python.worker.reuse", "true"),
    ("spark.python.worker.timeout", "300"),
):
    builder = builder.config(conf_key, conf_value)
spark = builder.getOrCreate()

# Restrict Spark's log output to the ERROR level.
spark.sparkContext.setLogLevel("ERROR")



# Les chemins

In [5]:
# Single root of the project: every other location is derived from it so the
# paths cannot drift apart when the project folder is moved.
project_root = r"C:\spark_project"

# Folder containing the "fold<N>" sub-folders with the audio files.
dataset = os.path.join(project_root, "Dataset")
# Metadata file listing each audio file with its fold and class.
csv_file = os.path.join(dataset, "UrbanSound8K.csv")
# Base folder under which one sub-folder per class is created.
out_base = os.path.join(project_root, "Dataset_Sorted_by_class")

# Lecture du fichier CSV

In [6]:
# Load the metadata CSV and keep only the three columns needed to locate each
# audio file and its class.
# Schema inference is intentionally left off: it requires an extra pass over
# the file, and reading everything as strings keeps file names and class
# labels verbatim. The only typed column used here ("fold") is cast explicitly.
df = (
    spark.read
    .option("header", True)
    .csv(csv_file)
    .select(
        col("slice_file_name").alias("file"),  # exposed under the shorter name "file"
        col("fold").cast("int"),
        col("class").alias("class_name"),
    )
)

# Construction des chemins source et destination

In [7]:
# Source path of each file: <dataset>/fold<fold>/<file>.
# pandas equivalent: dataset + "/fold" + df["fold"].astype(str) + "/" + df["file"]
src_path = concat(lit(dataset), lit("/fold"), col("fold"), lit("/"), col("file"))

# Destination folder of the file's class: <out_base>/<class_name>.
class_dir_path = concat(lit(out_base), lit("/"), col("class_name"))

# Destination path of the file inside its class folder.
dst_path = concat(class_dir_path, lit("/"), col("file"))

df_paths = df.select(
    src_path.alias("src"),
    class_dir_path.alias("dst_class_dir"),
    dst_path.alias("dst_file"),
)


# Export des chemins au format CSV

In [None]:
# Save the source/destination path table as CSV with a header row, in
# "overwrite" mode so output from a previous run is replaced.
(
    df_paths.write
    .mode("overwrite")
    .option("header", True)
    .csv(r"C:\spark_project\Dataset\df_paths_csv")
)


# Copie des fichiers audio dans les nouveaux dossiers (un par classe)

In [None]:
def process_partition(rows):
    """Copy the files described by one partition of rows.

    Args:
        rows: Iterable of row-like objects indexable by the keys ``"src"``
            (file to copy), ``"dst_class_dir"`` (folder to create) and
            ``"dst_file"`` (target path of the copy).

    Returns:
        A one-element list holding a dict with the counters ``"copied"``,
        ``"skipped"`` (destination file already present) and ``"errors"``
        (source file missing, or any exception raised while handling a row).
        A failing row never aborts the partition; it is only counted.
    """
    import os
    import shutil

    stats = {"copied": 0, "skipped": 0, "errors": 0}

    for row in rows:
        # Bound before the try block so the error message below can never hit
        # an unassigned name (failure on the first row) or report the path of
        # the previous row (failure while reading the current one).
        src = None
        try:
            src = row["src"]
            dst_class_dir = row["dst_class_dir"]
            dst_file = row["dst_file"]

            # Create the destination folder (no-op when it already exists).
            os.makedirs(dst_class_dir, exist_ok=True)

            # A missing source is counted as an error, not raised.
            if not os.path.exists(src):
                stats["errors"] += 1
                continue

            # Never overwrite a file that is already in place.
            if os.path.exists(dst_file):
                stats["skipped"] += 1
                continue

            shutil.copy2(src, dst_file)
            stats["copied"] += 1

        except Exception as e:
            # Best effort: count the failure and carry on with the next row.
            stats["errors"] += 1
            print(f"Erreur sur {src}: {e}")

    return [stats]

# Redistribute the path rows over two partitions before the copy step.
df_paths_optimized = df_paths.repartition(2)

# Run the copy on every partition and gather the per-partition counters.
results = (
    df_paths_optimized.rdd
    .mapPartitions(process_partition)
    .collect()
)

