In [2]:
import numpy as np
import pandas as pd 
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

In [3]:
# Build (or reuse) a SparkSession with master "local[*]" and app name "ShortNSimple".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ShortNSimple")
    .getOrCreate()
)
spark

In [26]:
# Read the text file as an RDD with one string element per line.
# NOTE(review): the path is an absolute, machine-specific location — adjust it
# before re-running this notebook on another machine.
data = spark.sparkContext.textFile("C:\\iris.txt")
# take(10) fetches only the first ten elements instead of collecting the whole
# RDD to the driver and slicing it afterwards.
data.take(10)

['5.1,3.5,1.4,0.2,Iris-setosa',
 '4.9,3,1.4,0.2,Iris-setosa',
 '4.7,3.2,1.3,0.2,Iris-setosa',
 '4.6,3.1,1.5,0.2,Iris-setosa',
 '5,3.6,1.4,0.2,Iris-setosa',
 '5.4,3.9,1.7,0.4,Iris-setosa',
 '4.6,3.4,1.4,0.3,Iris-setosa',
 '5,3.4,1.5,0.2,Iris-setosa',
 '4.4,2.9,1.4,0.2,Iris-setosa',
 '4.9,3.1,1.5,0.1,Iris-setosa']

In [17]:
# Inspect the Python type of the object returned by textFile().
type(data)

pyspark.rdd.RDD

In [21]:
# take(1) returns a list; check that it holds exactly one element.
len(data.take(1))

1

In [22]:
# count() returns the number of elements in the RDD without first
# materialising every element as a Python list on the driver.
data.count()

150

In [20]:
# Each element of the raw RDD is a single line of text; first() returns the
# first element directly instead of indexing into a one-item list.
type(data.first())

str

## Clean up the data

In [34]:
# Hand-written sample record: four numeric measurements followed by the class label.
row = [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']
# Every value except the trailing label is converted to float.
list(map(float, row[:-1]))

[5.1, 3.5, 1.4, 0.2]

In [36]:
# The last item of the sample row is the class label.
print(row[-1])

Iris-setosa


In [37]:
# Wrap the label in a one-element list so it can be concatenated with another list.
[row[-1]]

['Iris-setosa']

In [38]:
# Separate the leading measurements from the trailing label, cast the
# measurements to float, and put the label back at the end.
*measurements, label = row
[float(m) for m in measurements] + [label]

[5.1, 3.5, 1.4, 0.2, 'Iris-setosa']

In [33]:
def _to_typed_row(line):
    """Split one comma-separated line and cast every field but the last to float.

    Returns a list whose leading items are floats and whose final item is the
    untouched last field of the line.
    """
    *measurements, label = line.split(',')
    return [float(m) for m in measurements] + [label]


# Parse every raw text line into a typed row, then preview the first two.
data_splitted = data.map(_to_typed_row)
data_splitted.take(2)

[[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], [4.9, 3.0, 1.4, 0.2, 'Iris-setosa']]

In [39]:
# After parsing, each RDD element is a Python list; first() returns the first
# element directly instead of indexing into a one-item list.
type(data_splitted.first())

list

## Create the schema

In [29]:
# Explicit schema: four nullable float measurement columns followed by a
# nullable string column holding the class label.
# NOTE(review): FloatType is single precision while the parsed values are
# Python floats — consider DoubleType if full precision matters; confirm.
measurement_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
schema = T.StructType(
    [T.StructField(name, T.FloatType(), True) for name in measurement_columns]
    + [T.StructField("class_label", T.StringType(), True)]
)

## Create the DataFrame

In [40]:
# Apply the explicit schema to the parsed RDD, then preview the first ten rows
# with column truncation disabled.
spark_df = spark.createDataFrame(data=data_splitted, schema=schema)
spark_df.show(n=10, truncate=False)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|class_label|
+------------+-----------+------------+-----------+-----------+
|5.1         |3.5        |1.4         |0.2        |Iris-setosa|
|4.9         |3.0        |1.4         |0.2        |Iris-setosa|
|4.7         |3.2        |1.3         |0.2        |Iris-setosa|
|4.6         |3.1        |1.5         |0.2        |Iris-setosa|
|5.0         |3.6        |1.4         |0.2        |Iris-setosa|
|5.4         |3.9        |1.7         |0.4        |Iris-setosa|
|4.6         |3.4        |1.4         |0.3        |Iris-setosa|
|5.0         |3.4        |1.5         |0.2        |Iris-setosa|
|4.4         |2.9        |1.4         |0.2        |Iris-setosa|
|4.9         |3.1        |1.5         |0.1        |Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 10 rows



In [41]:
# Print each column's name, type and nullability for the DataFrame.
spark_df.printSchema()

root
 |-- sepal_length: float (nullable = true)
 |-- sepal_width: float (nullable = true)
 |-- petal_length: float (nullable = true)
 |-- petal_width: float (nullable = true)
 |-- class_label: string (nullable = true)

