# Dataframes
**By Jorge S. Ruiz**
 - This is an introduction about how to use Dataframes in Spark.
 - Dataframes can be used as SQL tables.
 - Dataframes have better optimization because they use Catalyst as query optimization and Tugsten as execution engine.
 

## Creating a dataframe from csv file

In [3]:
# Libraries
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Libraries for Datatypes (dataframes)
from pyspark.sql.types import StructType, StructField 
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.sql.types import Row

# Library for SQL
from pyspark.sql import SQLContext


In [4]:
# Initializing Spark
spark = SparkContext(master='local', appName='Dataframes')
# Initializing SQL Context
sqlContext = SQLContext(spark)

In [5]:
!ls /home/lastorder/Documents/curso-apache-spark-platzi/files

deporte.csv	 deportistaError.csv  modelo_relacional.jpg
deportista2.csv  evento.csv	      paises.csv
deportista.csv	 juegos.csv	      resultados.csv


In [6]:
# Linux command to check the file content
!head -n 4 /home/lastorder/Documents/curso-apache-spark-platzi/files/juegos.csv

,nombre_juego,annio,temporada,ciudad
1,1896 Verano,1896,Verano,Athina
2,1900 Verano,1900,Verano,Paris
3,1904 Verano,1904,Verano,St. Louis


In [7]:
# Creating the PATH to csv files.
path = '/home/lastorder/Documents/curso-apache-spark-platzi/files/'


In [8]:
# First we need to create a schema with the information of the columns:
# Struct file helpus to indicate the parameters of the columns of the DF
# The fields are, name of the column, datatype and if the column is an optional field.
# False indicates that the column is a necessary field and true indicates that is optional.

gameSchema = StructType([
    StructField('game_id',IntegerType(),False),
    StructField('year',StringType(),False),
    StructField('season',StringType(),False),
    StructField('city',StringType(),False)
])

# Now we can create the dataframe using our previous Schema.

gameDF = sqlContext.read.schema(gameSchema).option('header','true') \
    .csv(path+'juegos.csv')

In [9]:
# In dataframes we can use "show" to obtain a better data visualization
gameDF.show(10)

+-------+-------------+------+--------+
|game_id|         year|season|    city|
+-------+-------------+------+--------+
|      1|  1896 Verano|  1896|  Verano|
|      2|  1900 Verano|  1900|  Verano|
|      3|  1904 Verano|  1904|  Verano|
|      4|  1906 Verano|  1906|  Verano|
|      5|  1908 Verano|  1908|  Verano|
|      6|  1912 Verano|  1912|  Verano|
|      7|  1920 Verano|  1920|  Verano|
|      8|1924 Invierno|  1924|Invierno|
|      9|  1924 Verano|  1924|  Verano|
|     10|1928 Invierno|  1928|Invierno|
+-------+-------------+------+--------+
only showing top 10 rows



In [10]:
# To access to Spark UI console, we use just "spark" command and click on Spark UI
spark

## Using a Extract from RDDs Notebook

In [11]:
# Exporting 2 RDDs, the first one contains the header and the second one contains the data.
OlimpicAthleteRDD = spark.textFile(path+'deportista.csv').map(lambda l : l.split(','))
OlimpicAthleteRDD2 = spark.textFile(path+'deportista2.csv').map(lambda l : l.split(','))

# To make a union between the RDDs we can use:
OlimpicAthleteRDD = OlimpicAthleteRDD.union(OlimpicAthleteRDD2)


In [12]:
# To make sure that the data is not corrupted, we can use count() to verify that spark is working correctly
# with that data
OlimpicAthleteRDD.count()

135572

In [13]:
# To see the first 10 rows of the RDD, we use top function (similar to SQL)
OlimpicAthleteRDD.take(5)

[['deportista_id', 'nombre', 'genero', 'edad', 'altura', 'peso', 'equipo_id'],
 ['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199'],
 ['3', 'Gunnar Nielsen Aaby', '1', '24', '0', '0', '273'],
 ['4', 'Edgar Lindenau Aabye', '1', '34', '0', '0', '278']]

In [14]:
# Removing the header of the RDD
# 'iter' function, returns all values of we process in the function

def removeHeader(index, iterator):
    """A fuction that removes the header from a dataset or RDD"""
    return iter(list(iterator)[1:])

In [15]:
# We map the RDD assigning a index (this maps rows and columns per index)
# Now we can use a function that will take action in all rows and columns on the RDD.
OlimpicAthleteRDD_clean = OlimpicAthleteRDD.mapPartitionsWithIndex(removeHeader)

In [16]:
# To see if the header is gone
OlimpicAthleteRDD_clean.take(5)

[['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199'],
 ['3', 'Gunnar Nielsen Aaby', '1', '24', '0', '0', '273'],
 ['4', 'Edgar Lindenau Aabye', '1', '34', '0', '0', '278'],
 ['5', 'Christine Jacoba Aaftink', '2', '21', '185', '82', '705']]

In [20]:
# To transform the values of the RDD
# For that we are goint to make a mapping

OlimpicAthleteRDD_clean = OlimpicAthleteRDD_clean.map(lambda l: (
    int(l[0]),
    l[1],
    int(l[2]),
    int(l[3]),
    int(l[4]),
    float(l[5]),
    int(l[6])    
))


In [17]:
# We ned to define a new Schema:

schema = StructType([
    StructField('athlete_id', IntegerType(),False),
    StructField('name', StringType(),False),
    StructField('gender', IntegerType(),False),
    StructField('age', IntegerType(),False),
    StructField('height', IntegerType(),False),
    StructField('weight', FloatType(),False),
    StructField('team_id', IntegerType(),False),
])

In [21]:
# Creating a dataframe with sqlContext, using an existing RDD and a Schema
AthleteDF = sqlContext.createDataFrame(OlimpicAthleteRDD_clean, schema)

In [22]:
# To prove everything is correct with our new DF, we can use show function
AthleteDF.show(10)

+----------+--------------------+------+---+------+------+-------+
|athlete_id|                name|gender|age|height|weight|team_id|
+----------+--------------------+------+---+------+------+-------+
|         1|           A Dijiang|     1| 24|   180|  80.0|    199|
|         2|            A Lamusi|     1| 23|   170|  60.0|    199|
|         3| Gunnar Nielsen Aaby|     1| 24|     0|   0.0|    273|
|         4|Edgar Lindenau Aabye|     1| 34|     0|   0.0|    278|
|         5|Christine Jacoba ...|     2| 21|   185|  82.0|    705|
|         6|     Per Knut Aaland|     1| 31|   188|  75.0|   1096|
|         7|        John Aalberg|     1| 31|   183|  72.0|   1096|
|         8|Cornelia Cor Aalt...|     2| 18|   168|   0.0|    705|
|         9|    Antti Sami Aalto|     1| 26|   186|  96.0|    350|
|        10|Einar Ferdinand E...|     1| 26|     0|   0.0|    350|
+----------+--------------------+------+---+------+------+-------+
only showing top 10 rows



In [34]:
# To see the explicit values without format we use "take" function
AthleteDF.take(10)

[Row(athlete_id=1, name='A Dijiang', gender=1, age=24, height=180, weight=80.0, team_id=199),
 Row(athlete_id=2, name='A Lamusi', gender=1, age=23, height=170, weight=60.0, team_id=199),
 Row(athlete_id=3, name='Gunnar Nielsen Aaby', gender=1, age=24, height=0, weight=0.0, team_id=273),
 Row(athlete_id=4, name='Edgar Lindenau Aabye', gender=1, age=34, height=0, weight=0.0, team_id=278),
 Row(athlete_id=5, name='Christine Jacoba Aaftink', gender=2, age=21, height=185, weight=82.0, team_id=705),
 Row(athlete_id=6, name='Per Knut Aaland', gender=1, age=31, height=188, weight=75.0, team_id=1096),
 Row(athlete_id=7, name='John Aalberg', gender=1, age=31, height=183, weight=72.0, team_id=1096),
 Row(athlete_id=8, name='Cornelia Cor Aalten Strannood ', gender=2, age=18, height=168, weight=0.0, team_id=705),
 Row(athlete_id=9, name='Antti Sami Aalto', gender=1, age=26, height=186, weight=96.0, team_id=350),
 Row(athlete_id=10, name='Einar Ferdinand Einari Aalto', gender=1, age=26, height=0, we

In [24]:
# Creating another Dataframe from RDD
CountriesRDD = spark.textFile(path+"paises.csv")\
    .map(lambda line : line.split(","))

CountriesRDD = CountriesRDD.mapPartitionsWithIndex(removeHeader)


CountriesRDD = CountriesRDD.map(lambda l: (
    int(l[0]),
    l[1],
    l[2]
))

CountriesSchema = StructType([
    StructField('team_id', IntegerType(),False),
    StructField('team_name', StringType(),False),
    StructField('country_name', StringType(),False)
])

CountriesDF = sqlContext.createDataFrame(CountriesRDD, CountriesSchema)

In [25]:
CountriesDF.show(10)

+-------+--------------------+------------+
|team_id|           team_name|country_name|
+-------+--------------------+------------+
|      1|         30. Februar|         AUT|
|      2|A North American ...|         MEX|
|      3|           Acipactli|         MEX|
|      4|             Acturus|         ARG|
|      5|         Afghanistan|         AFG|
|      6|            Akatonbo|         IRL|
|      7|            Alain IV|         SUI|
|      8|             Albania|         ALB|
|      9|              Alcaid|         POR|
|     10|            Alcyon-6|         FRA|
+-------+--------------------+------------+
only showing top 10 rows



In [26]:
# Creating a Dataframefrom csv with a schema (without using an RDD)
OlympicSportsRDDSchema = StructType([
    StructField('sport_id', IntegerType(),False),
    StructField('sport_name', StringType(),False)
])

sportsDF = sqlContext.read.schema(OlympicSportsRDDSchema).option('header','true') \
    .csv(path+'deporte.csv')

In [27]:
sportsDF.show(10)

+--------+--------------------+
|sport_id|          sport_name|
+--------+--------------------+
|       1|          Basketball|
|       2|                Judo|
|       3|            Football|
|       4|          Tug-Of-War|
|       5|       Speed Skating|
|       6|Cross Country Skiing|
|       7|           Athletics|
|       8|          Ice Hockey|
|       9|            Swimming|
|      10|           Badminton|
+--------+--------------------+
only showing top 10 rows



In [28]:
# In the same way, we can create as many datasets as we want.
# All we need is to define an schema and import the csv file.
EventsRDDSchema = StructType([
    StructField('event_id', IntegerType(),False),
    StructField('event_name', StringType(),False),
    StructField('sport_id', IntegerType(),False)
])

OlympicEventsDF = sqlContext.read.schema(EventsRDDSchema).option('header','true') \
    .csv(path+'evento.csv')

In [29]:
OlympicEventsDF.show(10)

+--------+--------------------+--------+
|event_id|          event_name|sport_id|
+--------+--------------------+--------+
|       1|Basketball Men's ...|       1|
|       2|Judo Men's Extra-...|       2|
|       3|Football Men's Fo...|       3|
|       4|Tug-Of-War Men's ...|       4|
|       5|Speed Skating Wom...|       5|
|       6|Speed Skating Wom...|       5|
|       7|Cross Country Ski...|       6|
|       8|Cross Country Ski...|       6|
|       9|Cross Country Ski...|       6|
|      10|Cross Country Ski...|       6|
+--------+--------------------+--------+
only showing top 10 rows



In [30]:
GamesRDDSchema = StructType([
    StructField('game_id', IntegerType(),False),
    StructField('year', StringType(),False),
    StructField('season', StringType(),False),
    StructField('city', StringType(),False)
])

GamesDF = sqlContext.read.schema(GamesRDDSchema).option('header','true') \
    .csv(path+'juegos.csv')

In [31]:
GamesDF.show(10)

+-------+-------------+------+--------+
|game_id|         year|season|    city|
+-------+-------------+------+--------+
|      1|  1896 Verano|  1896|  Verano|
|      2|  1900 Verano|  1900|  Verano|
|      3|  1904 Verano|  1904|  Verano|
|      4|  1906 Verano|  1906|  Verano|
|      5|  1908 Verano|  1908|  Verano|
|      6|  1912 Verano|  1912|  Verano|
|      7|  1920 Verano|  1920|  Verano|
|      8|1924 Invierno|  1924|Invierno|
|      9|  1924 Verano|  1924|  Verano|
|     10|1928 Invierno|  1928|Invierno|
+-------+-------------+------+--------+
only showing top 10 rows



In [32]:
ResultsRDDSchema = StructType([
    StructField('result_id', IntegerType(),False),
    StructField('medal', StringType(),False),
    StructField('athlete_id', IntegerType(),False),
    StructField('game_id', IntegerType(),False),
    StructField('event_id', IntegerType(),False)
])

ResultsDF = sqlContext.read.schema(ResultsRDDSchema).option('header','true') \
    .csv(path+'resultados.csv')

In [33]:
ResultsDF.take(10)

[Row(result_id=1, medal='NA', athlete_id=1, game_id=39, event_id=1),
 Row(result_id=2, medal='NA', athlete_id=2, game_id=49, event_id=2),
 Row(result_id=3, medal='NA', athlete_id=3, game_id=7, event_id=3),
 Row(result_id=4, medal='Gold', athlete_id=4, game_id=2, event_id=4),
 Row(result_id=5, medal='NA', athlete_id=5, game_id=36, event_id=5),
 Row(result_id=6, medal='NA', athlete_id=5, game_id=36, event_id=6),
 Row(result_id=7, medal='NA', athlete_id=5, game_id=38, event_id=5),
 Row(result_id=8, medal='NA', athlete_id=5, game_id=38, event_id=6),
 Row(result_id=9, medal='NA', athlete_id=5, game_id=40, event_id=5),
 Row(result_id=10, medal='NA', athlete_id=5, game_id=40, event_id=6)]