## Bonus on Google Cloud

In [1]:
import pyspark
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
import pandas as pd

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("fifa") \
    .config("spark.jars", "/usr/lib/spark/jars/postgresql-42.2.20.jar") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/14 19:37:48 INFO SparkEnv: Registering MapOutputTracker
24/11/14 19:37:48 INFO SparkEnv: Registering BlockManagerMaster
24/11/14 19:37:49 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/11/14 19:37:49 INFO SparkEnv: Registering OutputCommitCoordinator


In [2]:
ls /usr/lib/spark/jars/postgresql-42.2.20.jar

/usr/lib/spark/jars/postgresql-42.2.20.jar


In [3]:
# Read one of data to get the headers
data_sample = pd.read_csv("gs://myspark-bucket/data/players_15.csv", nrows=2)
column_names = list(data_sample.columns)

In [4]:
def infer_sql_type(dtype):
    """Map pandas data type to PostgreSQL"""
    if pd.api.types.is_integer_dtype(dtype):
        return "INT"
    elif pd.api.types.is_float_dtype(dtype):
        return "FLOAT"
    elif pd.api.types.is_bool_dtype(dtype):
        return "BOOLEAN"
    else:
        return "VARCHAR"

# Infer data type for each column in Postgres
column_types = {col: infer_sql_type(dtype) for col, dtype in zip(data_sample.columns, data_sample.dtypes)}

# Create a new column of year
column_types['year'] = "INT"

# Create a new column of gender
column_types['gender'] = "VARCHAR"

# Print the crrent result
for col, col_type in column_types.items():
    print(f"{col}: {col_type}")

sofifa_id: INT
player_url: VARCHAR
short_name: VARCHAR
long_name: VARCHAR
player_positions: VARCHAR
overall: INT
potential: INT
value_eur: FLOAT
wage_eur: FLOAT
age: INT
dob: VARCHAR
height_cm: INT
weight_kg: INT
club_team_id: FLOAT
club_name: VARCHAR
league_name: VARCHAR
league_level: INT
club_position: VARCHAR
club_jersey_number: INT
club_loaned_from: FLOAT
club_joined: VARCHAR
club_contract_valid_until: INT
nationality_id: INT
nationality_name: VARCHAR
nation_team_id: FLOAT
nation_position: VARCHAR
nation_jersey_number: INT
preferred_foot: VARCHAR
weak_foot: INT
skill_moves: INT
international_reputation: INT
work_rate: VARCHAR
body_type: VARCHAR
real_face: VARCHAR
release_clause_eur: FLOAT
player_tags: VARCHAR
player_traits: VARCHAR
pace: INT
shooting: INT
passing: INT
dribbling: INT
defending: INT
physic: INT
attacking_crossing: INT
attacking_finishing: INT
attacking_heading_accuracy: INT
attacking_short_passing: INT
attacking_volleys: INT
skill_dribbling: INT
skill_curve: INT
skil

In [5]:
#Create a new table names "fifa"
table = "dataset" 
columns = ",\n    ".join([f'"{col}" {dtype}' for col, dtype in column_types.items()])

create_table = f"""
CREATE SCHEMA fifa;
CREATE TABLE fifa.{table} (
    id SERIAL PRIMARY KEY,
    {columns}
);
"""

print(create_table)


CREATE SCHEMA fifa;
CREATE TABLE fifa.dataset (
    id SERIAL PRIMARY KEY,
    "sofifa_id" INT,
    "player_url" VARCHAR,
    "short_name" VARCHAR,
    "long_name" VARCHAR,
    "player_positions" VARCHAR,
    "overall" INT,
    "potential" INT,
    "value_eur" FLOAT,
    "wage_eur" FLOAT,
    "age" INT,
    "dob" VARCHAR,
    "height_cm" INT,
    "weight_kg" INT,
    "club_team_id" FLOAT,
    "club_name" VARCHAR,
    "league_name" VARCHAR,
    "league_level" INT,
    "club_position" VARCHAR,
    "club_jersey_number" INT,
    "club_loaned_from" FLOAT,
    "club_joined" VARCHAR,
    "club_contract_valid_until" INT,
    "nationality_id" INT,
    "nationality_name" VARCHAR,
    "nation_team_id" FLOAT,
    "nation_position" VARCHAR,
    "nation_jersey_number" INT,
    "preferred_foot" VARCHAR,
    "weak_foot" INT,
    "skill_moves" INT,
    "international_reputation" INT,
    "work_rate" VARCHAR,
    "body_type" VARCHAR,
    "real_face" VARCHAR,
    "release_clause_eur" FLOAT,
    "player_

In [6]:
# Read male data
total_data = spark.read.csv('gs://myspark-bucket/data/players_15.csv', header=True, inferSchema=True)
total_data = total_data.withColumn("year", lit(2015))
total_data = total_data.withColumn("gender", lit("male"))

for i in range(16,17):
    data = spark.read.csv(f"gs://myspark-bucket/data/players_{i}.csv",header=True, inferSchema=True)
    data = data.toDF(*column_names)
    data = data.withColumn("year", lit(2000+i))
    data = data.withColumn("gender", lit("male"))
    total_data = total_data.union(data)
print("total male players: ",total_data.count())

#Read female data
for i in range(16,23):
    data = spark.read.csv(f'gs://myspark-bucket/data/players_{i}.csv',header=True, inferSchema=True)
    data = data.toDF(*column_names)
    data = data.withColumn("year", lit(2000+i))
    data = data.withColumn("gender", lit("female"))
    total_data = total_data.union(data)
    
print("total female players: ",total_data.count())   

# Ensure every record can be uniquely identified in the database table
print("total unique records: ",total_data.distinct().count())

                                                                                

total male players:  31778


                                                                                

total female players:  157702


24/11/14 19:38:56 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

total unique records:  157702


                                                                                

In [7]:
# Ingest data to PostgresDB
postgres_url = "jdbc:postgresql://172.25.240.4:5432/postgres"
postgres_properties = {
    "user": "postgres",
    "password": "Liujiayi",
    "driver": "org.postgresql.Driver"
}

# Write data to DB
total_data.write \
    .option("driver", "org.postgresql.Driver")\
    .jdbc(url=postgres_url, table="fifa", mode="overwrite", properties=postgres_properties)

# Read data from DB
data_postgres = spark.read \
    .jdbc(url=postgres_url, table="fifa", properties=postgres_properties)

data_postgres.printSchema()
data_postgres.show(5, vertical=True)

                                                                                

root
 |-- sofifa_id: integer (nullable = true)
 |-- player_url: string (nullable = true)
 |-- short_name: string (nullable = true)
 |-- long_name: string (nullable = true)
 |-- player_positions: string (nullable = true)
 |-- overall: integer (nullable = true)
 |-- potential: integer (nullable = true)
 |-- value_eur: double (nullable = true)
 |-- wage_eur: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- dob: date (nullable = true)
 |-- height_cm: integer (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- club_team_id: double (nullable = true)
 |-- club_name: string (nullable = true)
 |-- league_name: string (nullable = true)
 |-- league_level: integer (nullable = true)
 |-- club_position: string (nullable = true)
 |-- club_jersey_number: integer (nullable = true)
 |-- club_loaned_from: string (nullable = true)
 |-- club_joined: date (nullable = true)
 |-- club_contract_valid_until: integer (nullable = true)
 |-- nationality_id: integer (nullable = true)
 

                                                                                

-RECORD 0-------------------------------------------
 sofifa_id                   | 215085               
 player_url                  | https://sofifa.co... 
 short_name                  | R. Naranjo           
 long_name                   | Rodrigo Felipe Na... 
 player_positions            | GK                   
 overall                     | 66                   
 potential                   | 66                   
 value_eur                   | 475000.0             
 wage_eur                    | 4000.0               
 age                         | 34                   
 dob                         | 1979-08-30           
 height_cm                   | 192                  
 weight_kg                   | 89                   
 club_team_id                | 112531.0             
 club_name                   | Deportes Iquique     
 league_name                 | Chilian Campeonat... 
 league_level                | 1                    
 club_position               | GK             

In [8]:
data_postgres.tail(5)

                                                                                

[Row(sofifa_id=264232, player_url='https://sofifa.com/player/264232/abdulkareem-al-sultan/220002', short_name='A. Al Sultan', long_name='Abdulkareem Al Sultan', player_positions='CB', overall=52, potential=63, value_eur=170000.0, wage_eur=2000.0, age=21, dob=datetime.date(2000, 6, 24), height_cm=186, weight_kg=76, club_team_id=112572.0, club_name='Al Tai', league_name='Saudi Abdul L. Jameel League', league_level=1, club_position='RES', club_jersey_number=4, club_loaned_from=None, club_joined=datetime.date(2020, 8, 1), club_contract_valid_until=2025, nationality_id=183, nationality_name='Saudi Arabia', nation_team_id=None, nation_position=None, nation_jersey_number=None, preferred_foot='Right', weak_foot=3, skill_moves=2, international_reputation=1, work_rate='Medium/Medium', body_type='Lean (185+)', real_face='No', release_clause_eur='315000', player_tags=None, player_traits=None, pace=59, shooting=25, passing=38, dribbling=38, defending=53, physic=59, attacking_crossing=39, attacking_