In [13]:
# GCS bucket that stores the notebook assets, including the PostgreSQL JDBC driver JAR.
bucket_name = 'dataproc-staging-us-central1-291694249410-7gsa4pxg'
driver_path = f'gs://{bucket_name}/notebooks/jupyter/postgresql-42.6.2.jar'

# Build (or reuse, via getOrCreate) the session. The JDBC driver JAR is handed
# to Spark through "spark.jars" so the "jdbc" data source can load
# org.postgresql.Driver later in the notebook.
spark = (
    SparkSession.builder
    .appName("FifaProject")
    .master("local")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    .config("spark.jars", driver_path)
    .getOrCreate()
)

# Reaching this line only means getOrCreate() returned without raising.
print("Spark session started successfully with PostgreSQL driver")


Spark session started successfully with PostgreSQL driver


In [16]:
from pyspark.sql.functions import lit, when, col
from pyspark.sql.types import BooleanType, DoubleType, IntegerType

import os

# JDBC connection settings used by every read/write against PostgreSQL below.
#
# 'user', 'password' and 'url' can be overridden with the environment variables
# FIFA_DB_USER, FIFA_DB_PASSWORD and FIFA_DB_URL so that credentials do not have
# to be edited into (and committed with) the notebook.
#
# NOTE(review): the literal fallbacks are kept only so the notebook behaves
# exactly as before when the variables are unset. The password has been stored
# in plain text in this notebook; rotate it and drop the fallback once
# FIFA_DB_PASSWORD is provisioned in the runtime environment.
db_properties = {
    'user': os.environ.get('FIFA_DB_USER', 'fifaproject'),
    'password': os.environ.get('FIFA_DB_PASSWORD', '18763'),
    'url': os.environ.get('FIFA_DB_URL', 'jdbc:postgresql://127.0.0.1:5433/fifaproject'),
    'dbtable': 'fifa.players_data',
    'driver': 'org.postgresql.Driver'
}


# GCS bucket that holds the yearly player CSV exports.
bucket_name = 'dataproc-staging-us-central1-291694249410-7gsa4pxg'

# Two-digit season suffixes that appear in the CSV file names
# (range() excludes its stop value, so both series end at 22).
_male_seasons = range(15, 23)
_female_seasons = range(16, 23)

male_files = [
    f'gs://{bucket_name}/notebooks/jupyter/players_{year}.csv'
    for year in _male_seasons
]
female_files = [
    f'gs://{bucket_name}/notebooks/jupyter/female_players_{year}.csv'
    for year in _female_seasons
]

# The three lists below are index-aligned: female entries first, then male.
files = [*female_files, *male_files]
years = [2000 + season for season in (*_female_seasons, *_male_seasons)]
genders = ['Female' for _ in female_files] + ['Male' for _ in male_files]

# Explicit target types applied after schema inference. The mapping is the same
# for every file, so it is built once instead of on each loop iteration.
# NOTE(review): presumably these match the column types of the target table so
# that every yearly file is appended with a consistent schema - confirm against
# the table DDL.
columns_to_cast = {
    'value_eur': DoubleType(),
    'wage_eur': DoubleType(),
    'club_team_id': IntegerType(),
    'league_level': IntegerType(),
    'club_jersey_number': IntegerType(),
}

# One (file, year, gender, error message) entry per file that could not be
# loaded, so failures are reported together after the loop instead of being
# buried between the per-file success messages.
failed_loads = []

# Load each CSV, tag it with its season and sex, normalise it, and append it to
# the PostgreSQL table. A failure on one file is reported and the loop moves on
# to the next file (best-effort load).
for file, year, gender in zip(files, years, genders):
    try:
        df = spark.read.csv(file, header=True, inferSchema=True)
        df = df.withColumn("year", lit(year)).withColumn("sex", lit(gender))

        # Turn empty strings into NULL. Only string columns can contain "",
        # so typed columns (numeric, boolean, ...) are passed through untouched;
        # comparing them with "" forces an implicit string->type cast that is
        # at best a no-op and may fail when ANSI SQL mode is enabled.
        df = df.select([
            when(col(c) == "", None).otherwise(col(c)).alias(c) if dtype == 'string' else col(c)
            for c, dtype in df.dtypes
        ])

        # Not every yearly file necessarily has every column, hence the guard.
        for column, col_type in columns_to_cast.items():
            if column in df.columns:
                df = df.withColumn(column, col(column).cast(col_type))

        # Map "Yes"/"No" to a real boolean; any other value becomes NULL.
        if 'real_face' in df.columns:
            df = df.withColumn("real_face", when(col("real_face") == "Yes", True)
                                          .when(col("real_face") == "No", False)
                                          .otherwise(None).cast(BooleanType()))

        # "append" adds rows to the existing table; re-running this cell will
        # therefore insert the same rows again.
        (
            df.write.format("jdbc")
            .mode("append")
            .option("url", db_properties['url'])
            .option("dbtable", db_properties['dbtable'])
            .option("user", db_properties['user'])
            .option("password", db_properties['password'])
            .option("driver", db_properties['driver'])
            .save()
        )

        print(f"Data for year {year}, gender: {gender} written to PostgreSQL.")

    except Exception as e:
        print(f"An error occurred while processing year {year}, gender: {gender}: {e}")
        failed_loads.append((file, year, gender, str(e)))

# Make partial loads impossible to miss.
if failed_loads:
    print(
        f"WARNING: {len(failed_loads)} of {len(files)} files were not written to PostgreSQL: "
        + ", ".join(f"{failed_gender} {failed_year}" for _, failed_year, failed_gender, _ in failed_loads)
    )

# Sanity check: read five rows back from the target table and display them.
try:
    # The parenthesised subquery with an alias is passed as "dbtable", so the
    # LIMIT is part of the query sent over JDBC.
    sample_reader = (
        spark.read.format("jdbc")
        .option("url", db_properties['url'])
        .option("dbtable", "(SELECT * FROM fifa.players_data LIMIT 5) AS sample")
        .option("user", db_properties['user'])
        .option("password", db_properties['password'])
        .option("driver", db_properties['driver'])
    )
    sample_df = sample_reader.load()
    sample_df.show()

except Exception as exc:
    # Report the problem without stopping the notebook.
    print(f"An error occurred while reading sample data: {exc}")


'\nbucket_name = \'dataproc-staging-us-central1-291694249410-7gsa4pxg\'\nmale_files = [f\'gs://{bucket_name}/notebooks/jupyter/players_{year}.csv\' for year in range(15, 23)]\nfemale_files = [f\'gs://{bucket_name}/notebooks/jupyter/female_players_{year}.csv\' for year in range(16, 23)]\nyears = [2000 + year for year in range(16, 23)] + [2000 + year for year in range(15, 23)]\ngenders = [\'Female\'] * len(female_files) + [\'Male\'] * len(male_files)\nfiles = female_files + male_files\n\nfor file, year, gender in zip(files, years, genders):\n    try:\n        df = spark.read.csv(file, header=True, inferSchema=True)\n        df = df.withColumn("year", lit(year)).withColumn("sex", lit(gender))\n        df = df.select([when(col(c) == "", None).otherwise(col(c)).alias(c) for c in df.columns])\n\n        columns_to_cast = {\n            \'value_eur\': DoubleType(),\n            \'wage_eur\': DoubleType(),\n            \'club_team_id\': IntegerType(),\n            \'league_level\': IntegerType