In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.types import Row

from pyspark.sql import SQLContext

In [2]:
spark = SparkContext(master='local', appName='dataframes')
sqlContext = SQLContext(spark)

In [3]:
path = '/home/robert/Project/spark-2/transformations_actions/files/'

In [10]:
deportista_error = spark.textFile(path + 'deportistaError.csv') \
    .map(lambda l: l.split(','))

In [11]:
def remove_head(index, iterator):
    return iter(list(iterator)[1:])

In [12]:
deportista_error = deportista_error.mapPartitionsWithIndex(remove_head)

In [13]:
deportista_error.take(2)

[['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199']]

In [16]:
deportista_error = deportista_error.map(lambda l: 
                    (l[0],
                    l[1],
                    l[2],
                    l[3],
                    l[4],
                    l[5],
                    l[6]))
schema = StructType([
    StructField('deportista_id', StringType(), False),
    StructField('nombre', StringType(), False),
    StructField('genero', StringType(), False),
    StructField('edad', StringType(), False),
    StructField('altura', StringType(), False),
    StructField('peso', StringType(), False),
    StructField('equipo_id', StringType(), False)
])

deportista_error_df = sqlContext.createDataFrame(deportista_error, schema)

In [17]:
deportista_error_df.show()

+-------------+--------------------+------+----+------+----+---------+
|deportista_id|              nombre|genero|edad|altura|peso|equipo_id|
+-------------+--------------------+------+----+------+----+---------+
|            1|           A Dijiang|     1|  24|   180|  80|      199|
|            2|            A Lamusi|     1|  23|   170|  60|      199|
|            3| Gunnar Nielsen Aaby|     1|  24|      |    |      273|
|            4|Edgar Lindenau Aabye|     1|  34|      |    |      278|
|            5|Christine Jacoba ...|     2|  21|   185|  82|      705|
|            6|     Per Knut Aaland|     1|  31|   188|  75|     1096|
|            7|        John Aalberg|     1|  31|   183|  72|     1096|
|            8|"Cornelia ""Cor""...|     2|  18|   168|    |      705|
|            9|    Antti Sami Aalto|     1|  26|   186|  96|      350|
|           10|"Einar Ferdinand ...|     1|  26|      |    |      350|
|           11|  Jorma Ilmari Aalto|     1|  22|   182|76.5|      350|
|     

In [18]:
from pyspark.sql.functions import udf

def int_convert(value):
    return int(value) if len(value) > 0 else None

int_convert_udf = udf(lambda z: int_convert(z), IntegerType())
sqlContext.udf.register('int_convert_udf', int_convert_udf)

<function __main__.<lambda>(z)>

In [21]:
deportista_error_df.select(int_convert_udf('altura').alias('altura_udf')).show()

+----------+
|altura_udf|
+----------+
|       180|
|       170|
|      null|
|      null|
|       185|
|       188|
|       183|
|       168|
|       186|
|      null|
|       182|
|       172|
|       159|
|       171|
|      null|
|       184|
|       175|
|       189|
|      null|
|       176|
+----------+
only showing top 20 rows



In [22]:
from pyspark.storagelevel import StorageLevel