In [68]:
from pyspark.sql import SparkSession

def create_spark_session() -> SparkSession:
    """Build (or reuse) the Spark session used by this notebook.

    The session is named "SilverLayer", has Hive support enabled, registers
    the Delta Lake SQL extension and catalog, and configures the S3A
    filesystem to talk to a MinIO endpoint (path-style access, SSL disabled).

    The MinIO connection settings can be overridden through environment
    variables; when a variable is unset the original development value is
    used, so existing setups keep working unchanged:

    * ``MINIO_ACCESS_KEY`` (default ``"admin"``)
    * ``MINIO_SECRET_KEY`` (default: the development password below)
    * ``MINIO_ENDPOINT``   (default ``"http://minio:9000"``)

    Returns:
        SparkSession: the session returned by ``getOrCreate()``.
    """
    # Local import keeps this cell self-contained on a fresh kernel.
    import os

    # NOTE(review): the fallback values are development credentials that were
    # hardcoded in the original cell. Set the environment variables in any
    # shared or production environment instead of relying on them.
    access_key = os.environ.get("MINIO_ACCESS_KEY", "admin")
    secret_key = os.environ.get("MINIO_SECRET_KEY", "senhasegura")
    endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")

    spark = (
        SparkSession.builder
        .appName("SilverLayer")
        .enableHiveSupport()
        # Delta Lake integration.
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        # Filesystem implementations for the s3a:// and s3minio:// schemes.
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3minio.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        # Hadoop options use the `spark.hadoop.` prefix, like the two `impl`
        # keys above, so Spark forwards them to the Hadoop configuration.
        .config("spark.hadoop.fs.s3a.access.key", access_key)
        .config("spark.hadoop.fs.s3a.secret.key", secret_key)
        .config("spark.hadoop.fs.s3a.endpoint", endpoint)
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
        .getOrCreate()
    )
    return spark

spark = create_spark_session()

 

In [54]:
# Load the three bronze-zone tables (stored as Parquet) into DataFrames.
df_team = spark.read.format("parquet").load("s3a://bucket-bronze-zone/team/")
df_team_attributes = (
    spark.read
    .format("parquet")
    .load("s3a://bucket-bronze-zone/team_attributes/")
)
df_match = spark.read.format("parquet").load("s3a://bucket-bronze-zone/match/")

# To preview rows, call .show() on any of the frames, e.g.:
#   df_team.show()
#   df_team_attributes.show()
#   df_match.show()

# Print the column names of each table, each followed by a blank line.
print(f'Team: {df_team.columns}\n')
print(f'Team_attributes: {df_team_attributes.columns}\n')
print(f'Match: {df_match.columns}\n')


Team: ['_airbyte_raw_id', '_airbyte_extracted_at', '_airbyte_meta', '_airbyte_generation_id', 'id', 'team_api_id', 'team_long_name', 'team_short_name', 'team_fifa_api_id']

Team_attributes: ['_airbyte_raw_id', '_airbyte_extracted_at', '_airbyte_meta', '_airbyte_generation_id', 'id', 'date', 'team_api_id', 'defencePressure', 'buildUpPlaySpeed', 'defenceTeamWidth', 'team_fifa_api_id', 'defenceAggression', 'buildUpPlayPassing', 'buildUpPlayDribbling', 'defencePressureClass', 'buildUpPlaySpeedClass', 'chanceCreationPassing', 'defenceTeamWidthClass', 'chanceCreationCrossing', 'chanceCreationShooting', 'defenceAggressionClass', 'buildUpPlayPassingClass', 'defenceDefenderLineClass', 'buildUpPlayDribblingClass', 'chanceCreationPassingClass', 'buildUpPlayPositioningClass', 'chanceCreationCrossingClass', 'chanceCreationShootingClass', 'chanceCreationPositioningClass']

Match: ['_airbyte_raw_id', '_airbyte_extracted_at', '_airbyte_meta', '_airbyte_generation_id', 'id', 'BSA', 'BSD', 'BSH', 'BWA',

In [55]:
# TODO: Excluir colunas
#df_team = df_team.drop("team_long_name", "team_fifa_api_id")
#print(df_team.columns)

# Após finalizar o projeto, excluir as colunas não utilizadas
# df_team_attributes = df_team_attributes.drop("_airbyte_raw_id", "_airbyte_extracted_at", "_airbyte_meta", "_airbyte_generation_id")
# print(df_team_attributes.columns) 

# df_match = df_match.drop("_airbyte_raw_id", "_airbyte_extracted_at", "_airbyte_meta", "_airbyte_generation_id")
# print(df_match.columns)


In [56]:
# Rename columns.
# Original author's note: make this change only after the columns have been
# used in the queries.
df_team_attributes = (
    df_team_attributes
    .withColumnRenamed("date", "data")
)
df_team = (
    df_team
    .withColumnRenamed("team_short_name", "time")
)
# Placeholder left by the author for the match table:
# df_match = df_match.withColumnRenamed("date", "data").withColumnRenamed("XXX", "xxxx")

# Last expression of the cell: display the resulting team column names.
df_team.columns

['_airbyte_raw_id',
 '_airbyte_extracted_at',
 '_airbyte_meta',
 '_airbyte_generation_id',
 'id',
 'team_api_id',
 'team_long_name',
 'time',
 'team_fifa_api_id']

In [57]:
# Check for missing data in the team table.
# NOTE: this import makes `sum` refer to the Spark aggregate instead of
# Python's built-in sum for the rest of the notebook.
from pyspark.sql.functions import col, sum, mean

# One aggregate per column: the null flag is cast to int (1/0) and summed,
# which yields the number of null values in that column.
null_count_exprs_team = [
    sum(col(name).isNull().cast("int")).alias(name)
    for name in df_team.columns
]
df_team.select(*null_count_exprs_team).show()

# Last expression of the cell: total number of rows, for comparison.
df_team.count()

+---------------+---------------------+-------------+----------------------+---+-----------+--------------+----+----------------+
|_airbyte_raw_id|_airbyte_extracted_at|_airbyte_meta|_airbyte_generation_id| id|team_api_id|team_long_name|time|team_fifa_api_id|
+---------------+---------------------+-------------+----------------------+---+-----------+--------------+----+----------------+
|              0|                    0|            0|                     0|  0|          0|             0|   0|              11|
+---------------+---------------------+-------------+----------------------+---+-----------+--------------+----+----------------+



299

In [58]:
from pyspark.sql.types import NumericType

# Identify the numeric columns by inspecting the schema directly.
numeric_cols_team = [
    field.name
    for field in df_team.schema.fields
    if isinstance(field.dataType, NumericType)
]

# Compute the mean of each numeric column in a single pass.
# Guard the no-numeric-column case: an empty select returns one row per input
# row, so `collect()[0]` would fail on an empty table.
if numeric_cols_team:
    mean_row_team = (
        df_team
        .select([mean(col(c)).alias(c) for c in numeric_cols_team])
        .collect()[0]
    )
    # A column that is entirely null has a mean of None, which `fillna` does
    # not accept as a replacement value; leave such columns untouched.
    mean_values_team = {
        name: value
        for name, value in mean_row_team.asDict().items()
        if value is not None
    }
else:
    mean_values_team = {}

# Replace nulls in the numeric columns with the corresponding column mean.
# NOTE(review): identifier-like numeric columns (e.g. team_fifa_api_id, which
# showed 11 nulls in the check above) are imputed with a mean as well, which
# produces ids that do not refer to a real team - confirm this is intended.
if mean_values_team:
    df_team = df_team.fillna(mean_values_team)

# Re-run the per-column null count to confirm the result, then preview rows.
df_team.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_team.columns]).show()
df_team.show(5)


+---------------+---------------------+-------------+----------------------+---+-----------+--------------+----+----------------+
|_airbyte_raw_id|_airbyte_extracted_at|_airbyte_meta|_airbyte_generation_id| id|team_api_id|team_long_name|time|team_fifa_api_id|
+---------------+---------------------+-------------+----------------------+---+-----------+--------------+----+----------------+
|              0|                    0|            0|                     0|  0|          0|             0|   0|               0|
+---------------+---------------------+-------------+----------------------+---+-----------+--------------+----+----------------+

+--------------------+---------------------+--------------+----------------------+---+-----------+-----------------+----+----------------+
|     _airbyte_raw_id|_airbyte_extracted_at| _airbyte_meta|_airbyte_generation_id| id|team_api_id|   team_long_name|time|team_fifa_api_id|
+--------------------+---------------------+--------------+------------

In [59]:
# Check for missing data in the team_attributes table.
# For every column, the null flag is cast to int (1/0) and summed, giving the
# number of null values in that column.
null_count_exprs_team_attributes = [
    sum(col(name).isNull().cast("int")).alias(name)
    for name in df_team_attributes.columns
]
df_team_attributes.select(*null_count_exprs_team_attributes).show()

# Last expression of the cell: total number of rows, for comparison.
df_team_attributes.count()

+---------------+---------------------+-------------+----------------------+---+----+-----------+---------------+----------------+----------------+----------------+-----------------+------------------+--------------------+--------------------+---------------------+---------------------+---------------------+----------------------+----------------------+----------------------+-----------------------+------------------------+-------------------------+--------------------------+---------------------------+---------------------------+---------------------------+------------------------------+
|_airbyte_raw_id|_airbyte_extracted_at|_airbyte_meta|_airbyte_generation_id| id|data|team_api_id|defencePressure|buildUpPlaySpeed|defenceTeamWidth|team_fifa_api_id|defenceAggression|buildUpPlayPassing|buildUpPlayDribbling|defencePressureClass|buildUpPlaySpeedClass|chanceCreationPassing|defenceTeamWidthClass|chanceCreationCrossing|chanceCreationShooting|defenceAggressionClass|buildUpPlayPassingClass|de

1458

In [60]:
# Identify the numeric columns by inspecting the schema directly.
numeric_cols_team_attributes = [
    field.name
    for field in df_team_attributes.schema.fields
    if isinstance(field.dataType, NumericType)
]

# Compute the mean of each numeric column in a single pass.
# Guard the no-numeric-column case: an empty select returns one row per input
# row, so `collect()[0]` would fail on an empty table.
if numeric_cols_team_attributes:
    mean_row_team_attributes = (
        df_team_attributes
        .select([mean(col(c)).alias(c) for c in numeric_cols_team_attributes])
        .collect()[0]
    )
    # A column that is entirely null has a mean of None, which `fillna` does
    # not accept as a replacement value; leave such columns untouched.
    mean_values_team_attributes = {
        name: value
        for name, value in mean_row_team_attributes.asDict().items()
        if value is not None
    }
else:
    mean_values_team_attributes = {}

# Replace nulls in the numeric columns with the corresponding column mean.
# NOTE(review): presumably this also covers identifier-like columns such as
# team_api_id / team_fifa_api_id if they are numeric - confirm that imputing
# a mean into key columns is intended.
if mean_values_team_attributes:
    df_team_attributes = df_team_attributes.fillna(mean_values_team_attributes)

# Re-run the per-column null count to confirm the result, then preview rows.
df_team_attributes.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_team_attributes.columns]).show()
df_team_attributes.show(5)

+---------------+---------------------+-------------+----------------------+---+----+-----------+---------------+----------------+----------------+----------------+-----------------+------------------+--------------------+--------------------+---------------------+---------------------+---------------------+----------------------+----------------------+----------------------+-----------------------+------------------------+-------------------------+--------------------------+---------------------------+---------------------------+---------------------------+------------------------------+
|_airbyte_raw_id|_airbyte_extracted_at|_airbyte_meta|_airbyte_generation_id| id|data|team_api_id|defencePressure|buildUpPlaySpeed|defenceTeamWidth|team_fifa_api_id|defenceAggression|buildUpPlayPassing|buildUpPlayDribbling|defencePressureClass|buildUpPlaySpeedClass|chanceCreationPassing|defenceTeamWidthClass|chanceCreationCrossing|chanceCreationShooting|defenceAggressionClass|buildUpPlayPassingClass|de

In [61]:
# Check for missing data in the match table.
# For every column, the null flag is cast to int (1/0) and summed, giving the
# number of null values in that column.
null_count_exprs_match = [
    sum(col(name).isNull().cast("int")).alias(name)
    for name in df_match.columns
]
df_match.select(*null_count_exprs_match).show()

# Last expression of the cell: total number of rows, for comparison.
df_match.count()

+---------------+---------------------+-------------+----------------------+---+-----+-----+-----+----+----+----+-----+-----+-----+----+----+----+----+----+----+-----+-----+-----+----+----+----+----+----+----+----+----+----+-----+----+-----+-----+-----+-----+-----+-----+------+------+------+-------+---------+----------+----------+----------+------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--

25979

In [62]:
# Identify the numeric columns by inspecting the schema directly.
numeric_cols_match = [
    field.name
    for field in df_match.schema.fields
    if isinstance(field.dataType, NumericType)
]

# Compute the mean of each numeric column in a single pass.
# Guard the no-numeric-column case: an empty select returns one row per input
# row, so `collect()[0]` would fail on an empty table.
if numeric_cols_match:
    mean_row_match = (
        df_match
        .select([mean(col(c)).alias(c) for c in numeric_cols_match])
        .collect()[0]
    )
    # A column that is entirely null has a mean of None, which `fillna` does
    # not accept as a replacement value; leave such columns untouched.
    mean_values_match = {
        name: value
        for name, value in mean_row_match.asDict().items()
        if value is not None
    }
else:
    mean_values_match = {}

# Replace nulls in the numeric columns with the corresponding column mean.
# NOTE(review): presumably this also covers identifier-like columns (ids,
# foreign keys) if they are numeric - confirm that imputing a mean into key
# columns is intended.
if mean_values_match:
    df_match = df_match.fillna(mean_values_match)

# Re-run the per-column null count to confirm the result, then preview rows.
df_match.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_match.columns]).show()
df_match.show(5)

+---------------+---------------------+-------------+----------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-----+----+-----+-----+-----+-----+-----+-----+------+------+------+-------+---------+----------+----------+----------+------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------

In [63]:
#Excluir colunas 
#df_country = df_country.drop("_airbyte_raw_id", "_airbyte_extracted_at", "_airbyte_meta", "_airbyte_generation_id")
#print(df_country.columns)

#df_league = df_league.drop("_airbyte_raw_id", "_airbyte_extracted_at", "_airbyte_meta", "_airbyte_generation_id")
#print(df_league.columns)

#df_player = df_player.drop("_airbyte_raw_id", "_airbyte_extracted_at", "_airbyte_meta", "_airbyte_generation_id")
#print(df_player.columns)



In [64]:
#Renomeando as colunas
#df_country = df_country.withColumnRenamed("name", "nome")
#df_league = df_league.withColumnRenamed("name", "nome").withColumnRenamed("country_id", "país")
#df_player = df_player.withColumnRenamed("height", "altura").withColumnRenamed("weight", "peso").withColumnRenamed("birthday", "nascimento").withColumnRenamed("player_name", "nome")

In [65]:
# Write the cleaned tables to the silver zone in MinIO as Delta tables.

# `minio_delta_options` is not defined in any visible cell of this notebook,
# so a fresh "Restart & Run All" would raise NameError here. Use it when the
# kernel provides it; otherwise write without per-write options.
# NOTE(review): the fallback assumes the S3A settings applied when the Spark
# session was built are sufficient for the Delta writer - confirm.
try:
    delta_write_options = minio_delta_options
except NameError:
    delta_write_options = {}

# (DataFrame, destination path) pairs, written in the original order.
silver_targets = (
    (df_match, "s3a://bucket-silver-zone/match/"),
    (df_team, "s3a://bucket-silver-zone/team/"),
    (df_team_attributes, "s3a://bucket-silver-zone/team_attributes/"),
)

for silver_df, silver_path in silver_targets:
    (
        silver_df.write
        .format("delta")
        .options(**delta_write_options)
        .mode("overwrite")  # replaces whatever is already stored at the path
        .save(silver_path)
    )
print("Tabelas Delta gravadas na zona silver com sucesso no MinIO!")

# Release the Spark resources; the session cannot be used after this point.
spark.stop()

Tabelas Delta gravadas na zona silver com sucesso no MinIO!
