# Nettoyage du fichier "olympic_result.csv"

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType
import os


In [2]:
# Définir les environnements Spark
os.environ['PYSPARK_PYTHON'] = 'C:/Users/yanni/AppData/Local/Programs/Python/Python312/python.exe'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'C:/Users/yanni/AppData/Local/Programs/Python/Python312/python.exe'

In [3]:
# Création d'une session Spark
spark = SparkSession.builder \
    .appName("Hackathon") \
    .config("spark.jars", "postgresql-42.6.0.jar") \
    .getOrCreate()

In [4]:
# Définir les paramètres de connexion
url = "jdbc:postgresql://postgresql-jo2024.alwaysdata.net:5432/jo2024_postgres"
properties = {
    "user": "jo2024_fatima",
    "password": "Ipssi2024!",  # Assurez-vous de protéger cette information sensible
    "driver": "org.postgresql.Driver"
}

In [5]:
# Chemin vers votre fichier CSV
csv_file_path = "./Datas/olympic_results.csv"

In [6]:
# Définir le schéma pour le DataFrame
schema = StructType([
    StructField("discipline_title", StringType(), True),
    StructField("event_title", StringType(), True),
    StructField("slug_game", StringType(), True),
    StructField("country_name", StringType(), True),
    StructField("medal_type", StringType(), True),
    StructField("rank_equal", BooleanType(), True),
    StructField("rank_position", IntegerType(), True),
    StructField("athlete_full_name", StringType(), True),
    StructField("value_unit", StringType(), True),
    StructField("value_type", StringType(), True)
])

In [7]:
# Lire le fichier CSV dans un DataFrame avec le schéma défini
df = spark.read.csv(csv_file_path, header=True)
# Supprimer les colonnes non nécessaires
df = df.drop('Unnamed: 0', 'athletes', 'athlete_url')


In [8]:
# Afficher le schéma du DataFrame pour vérifier les types
df.printSchema()


root
 |-- discipline_title: string (nullable = true)
 |-- event_title: string (nullable = true)
 |-- slug_game: string (nullable = true)
 |-- participant_type: string (nullable = true)
 |-- medal_type: string (nullable = true)
 |-- rank_equal: string (nullable = true)
 |-- rank_position: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- country_3_letter_code: string (nullable = true)
 |-- athlete_full_name: string (nullable = true)
 |-- value_unit: string (nullable = true)
 |-- value_type: string (nullable = true)



In [9]:
# Afficher les 5 premières lignes du DataFrame
df.show(5)

+----------------+-------------+------------+----------------+----------+----------+-------------+-------------+------------+---------------------+-----------------+----------+----------+
|discipline_title|  event_title|   slug_game|participant_type|medal_type|rank_equal|rank_position| country_name|country_code|country_3_letter_code|athlete_full_name|value_unit|value_type|
+----------------+-------------+------------+----------------+----------+----------+-------------+-------------+------------+---------------------+-----------------+----------+----------+
|         Curling|Mixed Doubles|beijing-2022|        GameTeam|      GOLD|     False|            1|        Italy|          IT|                  ITA|             NULL|      NULL|      NULL|
|         Curling|Mixed Doubles|beijing-2022|        GameTeam|    SILVER|     False|            2|       Norway|          NO|                  NOR|             NULL|      NULL|      NULL|
|         Curling|Mixed Doubles|beijing-2022|        GameTea

In [10]:
# Remplir les valeurs manquantes
df_cleaned = df.fillna({
    'rank_equal': False,
    'rank_position': -1,
    'athlete_full_name': 'Unknown',
    'value_unit': 'Unknown',
    'value_type': 'Unknown'
})

In [11]:
# Liste des disciplines aux JO 2024
olympic_disciplines_2024 = [
    "Archery", "Artistic Gymnastics", "Artistic Swimming", "Athletics", "Badminton",
    "Basketball", "Basketball 3x3", "Beach Volleyball", "Boxing", "Breaking",
    "Canoe Slalom", "Canoe Sprint", "Cycling BMX Freestyle", "Cycling BMX Racing",
    "Cycling Mountain Bike", "Cycling Road", "Cycling Track", "Diving", "Equestrian",
    "Fencing", "Football", "Golf", "Handball", "Hockey", "Judo", "Marathon Swimming",
    "Modern Pentathlon", "Rhythmic Gymnastics", "Rowing", "Rugby Sevens", "Sailing",
    "Shooting", "Skateboarding", "Sport Climbing", "Surfing", "Swimming", "Table Tennis",
    "Taekwondo", "Tennis", "Trampoline", "Triathlon", "Volleyball", "Water Polo",
    "Weightlifting", "Wrestling"
]

In [12]:
# Filtrer le DataFrame pour garder uniquement les disciplines des JO 2024
df_cleaned = df_cleaned.filter(df_cleaned.discipline_title.isin(olympic_disciplines_2024))

In [13]:
# Afficher le DataFrame filtré (les 10 premières lignes)
df_cleaned.show(10)

+----------------+---------------+----------+----------------+----------+----------+-------------+--------------------+------------+---------------------+-----------------+----------+----------+
|discipline_title|    event_title| slug_game|participant_type|medal_type|rank_equal|rank_position|        country_name|country_code|country_3_letter_code|athlete_full_name|value_unit|value_type|
+----------------+---------------+----------+----------------+----------+----------+-------------+--------------------+------------+---------------------+-----------------+----------+----------+
|        Shooting|Trap Mixed Team|tokyo-2020|        GameTeam|      GOLD|     False|            1|               Spain|          ES|                  ESP|          Unknown|   Unknown|   Unknown|
|        Shooting|Trap Mixed Team|tokyo-2020|        GameTeam|    SILVER|     False|            2|          San Marino|          SM|                  SMR|          Unknown|   Unknown|   Unknown|
|        Shooting|Trap Mi

In [14]:
# Afficher toutes les disciplines distinctes dans le DataFrame filtré
distinct_disciplines_filtered = df_cleaned.select('discipline_title').distinct()
distinct_disciplines_filtered.show(truncate=False)

+---------------------+
|discipline_title     |
+---------------------+
|Tennis               |
|Boxing               |
|Marathon Swimming    |
|Golf                 |
|Rowing               |
|Judo                 |
|Sailing              |
|Swimming             |
|Cycling BMX Freestyle|
|Basketball           |
|Handball             |
|Rhythmic Gymnastics  |
|Triathlon            |
|Badminton            |
|Canoe Sprint         |
|Athletics            |
|Cycling Track        |
|Beach Volleyball     |
|Skateboarding        |
|Equestrian           |
+---------------------+
only showing top 20 rows



In [None]:
from pyspark.sql import SparkSession

# Initialiser SparkSession
spark = SparkSession.builder \
    .appName("PostgreSQL Test Connection") \
    .config("spark.jars", "postgresql-42.7.3.jar") \
    .getOrCreate()

# Définir les paramètres de connexion
url = "jdbc:postgresql://postgresql-jo2024.alwaysdata.net:5432/jo2024_postgres"
properties = {
    "user": "jo2024_fatima",
    "password": "Ipssi2024!",  # Assurez-vous de protéger cette information sensible
    "driver": "org.postgresql.Driver"
}

# Test de la connexion en lisant une petite table ou en exécutant une requête simple
try:
    df_test = spark.read.jdbc(url=url, table="(SELECT 1) AS test_query", properties=properties)
    df_test.show()
    print("Connexion réussie à la base de données jo2024_postgres !")
except Exception as e:
    print(f"Erreur lors de la connexion : {e}")

In [None]:
from pyspark.sql import SparkSession

# Initialiser SparkSession
spark = SparkSession.builder \
    .appName("PostgreSQL Save DataFrame") \
    .config("spark.jars", "postgresql-42.7.3.jar") \
    .getOrCreate()

# Définir les paramètres de connexion
url = "jdbc:postgresql://postgresql-jo2024.alwaysdata.net:5432/jo2024_postgres"
properties = {
    "user": "jo2024_fatima",
    "password": "Ipssi2024!",  # Assurez-vous de protéger cette information sensible
    "driver": "org.postgresql.Driver"
}

# Assurez-vous que df_cleaned est déjà défini quelque part dans votre script
# Par exemple :
# df_cleaned = ...

# Sauvegarder le DataFrame dans PostgreSQL
try:
    df_cleaned.write \
        .jdbc(url=url, table="your_table_name", mode="overwrite", properties=properties)
    print("DataFrame sauvegardé avec succès dans la table 'your_table_name' de la base de données jo2024_postgres !")
except Exception as e:
    print(f"Erreur lors de l'enregistrement du DataFrame : {e}")
