# Dataframes
**By Jorge S. Ruiz**
 - This is an introduction about how to use Dataframes in Spark.
 - Dataframes can be used as SQL tables.
 - Dataframes have better optimization because they use Catalyst as query optimization and Tugsten as execution engine.
 

## Creating a dataframe from csv file

In [2]:
# Libraries
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Libraries for Datatypes (dataframes)
from pyspark.sql.types import StructType, StructField 
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.sql.types import Row

# Library for SQL
from pyspark.sql import SQLContext


In [3]:
# Initializing Spark
spark = SparkContext(master='local', appName='Dataframes')
# Initializing SQL Context
sqlContext = SQLContext(spark)

In [4]:
!ls /home/lastorder/Documents/curso-apache-spark-platzi/files

deporte.csv	 deportistaError.csv  modelo_relacional.jpg
deportista2.csv  evento.csv	      paises.csv
deportista.csv	 juegos.csv	      resultados.csv


In [7]:
# Linux command to check the file content
!head -n 4 /home/lastorder/Documents/curso-apache-spark-platzi/files/juegos.csv

,nombre_juego,annio,temporada,ciudad
1,1896 Verano,1896,Verano,Athina
2,1900 Verano,1900,Verano,Paris
3,1904 Verano,1904,Verano,St. Louis


In [9]:
# Creating the PATH to csv files.
path = '/home/lastorder/Documents/curso-apache-spark-platzi/files/'


In [10]:
# First we need to create a schema with the information of the columns:
# Struct file helpus to indicate the parameters of the columns of the DF
# The fields are, name of the column, datatype and if the column is an optional field.
# False indicates that the column is a necessary field and true indicates that is optional.

gameSchema = StructType([
    StructField('game_id',IntegerType(),False),
    StructField('year',StringType(),False),
    StructField('season',StringType(),False),
    StructField('city',StringType(),False)
])

# Now we can create the dataframe using our previous Schema.

gameDF = sqlContext.read.schema(gameSchema).option('header','true') \
    .csv(path+'juegos.csv')

In [11]:
# In dataframes we can use "show" to obtain a better data visualization
gameDF.show(10)

+-------+-------------+------+--------+
|game_id|         year|season|    city|
+-------+-------------+------+--------+
|      1|  1896 Verano|  1896|  Verano|
|      2|  1900 Verano|  1900|  Verano|
|      3|  1904 Verano|  1904|  Verano|
|      4|  1906 Verano|  1906|  Verano|
|      5|  1908 Verano|  1908|  Verano|
|      6|  1912 Verano|  1912|  Verano|
|      7|  1920 Verano|  1920|  Verano|
|      8|1924 Invierno|  1924|Invierno|
|      9|  1924 Verano|  1924|  Verano|
|     10|1928 Invierno|  1928|Invierno|
+-------+-------------+------+--------+
only showing top 10 rows



In [13]:
# To access to Spark UI console, we use just "spark" command and click on Spark UI
spark

## Using a Extract from RDDs Notebook

In [15]:
# Exporting 2 RDDs, the first one contains the header and the second one contains the data.
OlimpicAthleteRDD = spark.textFile(path+'deportista.csv').map(lambda l : l.split(','))
OlimpicAthleteRDD2 = spark.textFile(path+'deportista2.csv').map(lambda l : l.split(','))

# To make a union between the RDDs we can use:
OlimpicAthleteRDD = OlimpicAthleteRDD.union(OlimpicAthleteRDD2)


In [16]:
# To make sure that the data is not corrupted, we can use count() to verify that spark is working correctly
# with that data
OlimpicAthleteRDD.count()

135572

In [22]:
# To see the first 10 rows of the RDD, we use top function (similar to SQL)
OlimpicAthleteRDD.take(5)

[['deportista_id', 'nombre', 'genero', 'edad', 'altura', 'peso', 'equipo_id'],
 ['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199'],
 ['3', 'Gunnar Nielsen Aaby', '1', '24', '0', '0', '273'],
 ['4', 'Edgar Lindenau Aabye', '1', '34', '0', '0', '278']]

In [23]:
# Removing the header of the RDD
# 'iter' function, returns all values of we process in the function

def removeHeader(index, iterator):
    """A fuction that removes the header from a dataset or RDD"""
    return iter(list(iterator)[1:])

In [25]:
# We map the RDD assigning a index (this maps rows and columns per index)
# Now we can use a function that will take action in all rows and columns on the RDD.
OlimpicAthleteRDD_clean = OlimpicAthleteRDD.mapPartitionsWithIndex(removeHeader)

In [26]:
# To see if the header is gone
OlimpicAthleteRDD_clean.take(5)

[['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199'],
 ['3', 'Gunnar Nielsen Aaby', '1', '24', '0', '0', '273'],
 ['4', 'Edgar Lindenau Aabye', '1', '34', '0', '0', '278'],
 ['5', 'Christine Jacoba Aaftink', '2', '21', '185', '82', '705']]

In [30]:
# To transform the values of the RDD
# For that we are goint to make a mapping

OlimpicAthleteRDD_clean = OlimpicAthleteRDD_clean.map(lambda l: (
    int(l[0]),
    l[1],
    int(l[2]),
    int(l[3]),
    int(l[4]),
    float(l[5]),
    int(l[6])    
))


In [31]:
# We ned to define a new Schema:

schema = StructType([
    StructField('athlete_id', IntegerType(),False),
    StructField('name', StringType(),False),
    StructField('gender', IntegerType(),False),
    StructField('age', IntegerType(),False),
    StructField('height', IntegerType(),False),
    StructField('weight', FloatType(),False),
    StructField('team_id', IntegerType(),False),
])

In [33]:
# Creating a dataframe with sqlContext, using an existing RDD and a Schema
AthleteDF = sqlContext.createDataFrame(OlimpicAthleteRDD_clean, schema)

In [35]:
# To prove everything is correcto with our new DF, we can use show function
AthleteDF.show(10)

+----------+--------------------+------+---+------+------+-------+
|athlete_id|                name|gender|age|height|weight|team_id|
+----------+--------------------+------+---+------+------+-------+
|         1|           A Dijiang|     1| 24|   180|  80.0|    199|
|         2|            A Lamusi|     1| 23|   170|  60.0|    199|
|         3| Gunnar Nielsen Aaby|     1| 24|     0|   0.0|    273|
|         4|Edgar Lindenau Aabye|     1| 34|     0|   0.0|    278|
|         5|Christine Jacoba ...|     2| 21|   185|  82.0|    705|
|         6|     Per Knut Aaland|     1| 31|   188|  75.0|   1096|
|         7|        John Aalberg|     1| 31|   183|  72.0|   1096|
|         8|Cornelia Cor Aalt...|     2| 18|   168|   0.0|    705|
|         9|    Antti Sami Aalto|     1| 26|   186|  96.0|    350|
|        10|Einar Ferdinand E...|     1| 26|     0|   0.0|    350|
+----------+--------------------+------+---+------+------+-------+
only showing top 10 rows

