<h2>Initialize SparkSQL Application - Create SQL Context</h2>

In [16]:
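# findspark makes the local Spark installation importable; this is typically needed on Windows.
# Comment out the three findspark lines below if pyspark already imports without them.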

import findspark
findspark.init()
findspark.find()

import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext

import os

# Connection settings for the PostgreSQL database used by the JDBC helpers below.
# Every environment-specific value can be overridden through an environment
# variable so that real credentials do not have to live in the notebook; the
# literals are fallbacks that keep an existing local setup working unchanged.
# NOTE(review): the fallback password is still plain text — drop it once
# PG_PASSWORD is set wherever this notebook runs.
properties = {
    'username': os.environ.get('PG_USERNAME', 'postgres'),
    'password': os.environ.get('PG_PASSWORD', '20020202'),
    'url': os.environ.get('PG_URL', "jdbc:postgresql://localhost:5432/postgres"),
    'table': os.environ.get('PG_TABLE', 'fifa.player_data'),
    'driver': 'org.postgresql.Driver'
}

def write_to_pgadmin(df, mode='overwrite'):
    """Save ``df`` through the JDBC data source to the table named in ``properties``.

    Args:
        df: object exposing ``.write`` (a Spark DataFrame in this notebook).
        mode: save mode handed to the writer; defaults to ``'overwrite'``.

    Returns:
        None. The call is made for its side effect of writing to the database.
    """
    writer = df.write.format('jdbc').mode(mode)
    # (JDBC option name, key in the module-level ``properties`` dict)
    jdbc_options = (
        ("url", 'url'),
        ("dbtable", 'table'),
        ("user", 'username'),
        ("password", 'password'),
        ("Driver", 'driver'),
    )
    for option_name, property_key in jdbc_options:
        writer = writer.option(option_name, properties[property_key])
    writer.save()

def read_from_pgadmin():
    """Load the table named in ``properties`` through the JDBC data source.

    Uses the module-level ``spark`` session, which must exist by the time this
    function is called.

    Returns:
        Whatever ``spark.read...load()`` yields for the configured table.
    """
    reader = spark.read.format("jdbc")
    # (JDBC option name, key in the module-level ``properties`` dict)
    jdbc_options = (
        ("url", 'url'),
        ("dbtable", 'table'),
        ("user", 'username'),
        ("password", 'password'),
        ("Driver", 'driver'),
    )
    for option_name, property_key in jdbc_options:
        reader = reader.option(option_name, properties[property_key])
    return reader.load()

# Name shown for this application and the master it runs against.
appName = "Big Data Analytics"
master = "local"

# Build the Spark configuration step by step: driver host, app name, master.
conf = pyspark.SparkConf()
conf = conf.set('spark.driver.host', '127.0.0.1')
conf = conf.setAppName(appName)
conf = conf.setMaster(master)

# Obtain a Spark context that uses the configuration above instead of the default one.
sc = SparkContext.getOrCreate(conf=conf)

# SQL context wrapped around that Spark context.
# NOTE(review): SQLContext looks like a legacy entry point in recent Spark
# releases — confirm whether later cells still need `sqlContext`.
sqlContext = SQLContext(sc)

# Session used by the rest of the notebook, obtained through the SQL context's builder.
spark = (
    sqlContext
    .sparkSession
    .builder
    .getOrCreate()
)

<h2>Read-in Dataset</h2>

In [10]:
# !pip install pandas
# import pandas as pd

# df_15 = pd.read_csv('data\players_15.csv')
# df_15

In [17]:
import os
from pyspark.sql.functions import lit
from pyspark.sql.functions import monotonically_increasing_id
import re

root = 'data'

# The 2015 file is read first with schema inference; that schema is then
# imposed on every other season's file so all frames can be unioned.
df = spark.read.csv(os.path.join(root, 'players_15.csv'), header=True, inferSchema=True)
schema = df.schema
soccer_data = df.withColumn('year', lit(2015))

# Accept only names shaped like 'players_16.csv' and capture the season digits.
# A full match skips stray files (no underscore, non-numeric suffix, other
# extensions) that the previous slicing-based parsing would have crashed on
# or read as CSV by mistake.
season_file = re.compile(r'player[^_]*_(\d+)\.csv')

for file in sorted(os.listdir(root)):
    season = season_file.fullmatch(file)
    # The 2015 season is already loaded above.
    if season is None or season.group(1) == '15':
        continue
    df = spark.read.csv(os.path.join(root, file), header=True, schema=schema)
    df = df.withColumn('year', lit(2000 + int(season.group(1))))
    soccer_data = soccer_data.union(df)

# Tag every row with an id column generated by monotonically_increasing_id().
soccer_data = soccer_data.withColumn('unique_id', monotonically_increasing_id())
soccer_data.show(5)
soccer_data.printSchema()

+---------+--------------------+-----------------+--------------------+----------------+-------+---------+---------+--------+---+----------+---------+---------+------------+-------------------+--------------------+------------+-------------+------------------+----------------+-----------+-------------------------+--------------+----------------+--------------+---------------+--------------------+--------------+---------+-----------+------------------------+-------------+----------------+---------+------------------+--------------------+--------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-

In [18]:
# Persist the combined frame; 'overwrite' is the helper's default mode, spelled out here.
write_to_pgadmin(soccer_data, mode='overwrite')