# Configuration

In [1]:
# Most straightforward method
import pyspark
import pandas as pd

In [2]:
spark = pyspark.sql.SparkSession.builder \
        .master('local') \
        .appName('Spark Datatype') \
        .getOrCreate()

# Import data

In [3]:
auto = spark.read.csv('auto-mpg.csv', header=True)

In [4]:
type(auto)

pyspark.sql.dataframe.DataFrame

In [5]:
auto.show()

+----+---+-----+---+------+-----+---+------+--------------------+-----+------------------+------+
| mpg|cyl|displ| hp|weight|accel| yr|origin|                name|color|              size|marker|
+----+---+-----+---+------+-----+---+------+--------------------+-----+------------------+------+
|18.0|  6|250.0| 88|  3139| 14.5| 71|    US|        ford mustang|  red|27.370336111111108|     o|
| 9.0|  8|304.0|193|  4732| 18.5| 70|    US|            hi 1200d|green|62.199511111111114|     o|
|36.1|  4| 91.0| 60|  1800| 16.4| 78|  Asia|    honda civic cvcc| blue|               9.0|     x|
|18.5|  6|250.0| 98|  3525| 19.0| 77|    US|        ford granada|  red|         34.515625|     o|
|34.3|  4| 97.0| 78|  2188| 15.8| 80|Europe|           audi 4000| blue|13.298177777777777|     s|
|32.9|  4|119.0|100|  2615| 14.8| 81|  Asia|        datsun 200sx| blue|18.995069444444447|     x|
|32.2|  4|108.0| 75|  2265| 15.2| 80|  Asia|      toyota corolla| blue|         14.250625|     x|
|22.0|  4|121.0| 76|

In [6]:
auto.toPandas()

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name,color,size,marker
0,18.0,6,250.0,88,3139,14.5,71,US,ford mustang,red,27.370336111111108,o
1,9.0,8,304.0,193,4732,18.5,70,US,hi 1200d,green,62.199511111111114,o
2,36.1,4,91.0,60,1800,16.4,78,Asia,honda civic cvcc,blue,9.0,x
3,18.5,6,250.0,98,3525,19.0,77,US,ford granada,red,34.515625,o
4,34.3,4,97.0,78,2188,15.8,80,Europe,audi 4000,blue,13.298177777777777,s
5,32.9,4,119.0,100,2615,14.8,81,Asia,datsun 200sx,blue,18.995069444444447,x
6,32.2,4,108.0,75,2265,15.2,80,Asia,toyota corolla,blue,14.250625,x
7,22.0,4,121.0,76,2511,18.0,72,Europe,volkswagen 411 (sw),blue,17.514224999999996,s
8,15.0,8,302.0,130,4295,14.9,77,US,mercury cougar brougham,green,51.24173611111111,o
9,17.0,8,302.0,140,3449,10.5,70,US,ford torino,green,33.04333611111111,o


# Data type
## Convert Spark Dataframe to RDD

In [7]:
auto_rdd = auto.rdd
auto_rdd

MapPartitionsRDD[16] at javaToPython at NativeMethodAccessorImpl.java:0

In [8]:
auto_rdd.collect()

[Row(mpg='18.0', cyl='6', displ='250.0', hp='88', weight='3139', accel='14.5', yr='71', origin='US', name='ford mustang', color='red', size='27.370336111111108', marker='o'),
 Row(mpg='9.0', cyl='8', displ='304.0', hp='193', weight='4732', accel='18.5', yr='70', origin='US', name='hi 1200d', color='green', size='62.199511111111114', marker='o'),
 Row(mpg='36.1', cyl='4', displ='91.0', hp='60', weight='1800', accel='16.4', yr='78', origin='Asia', name='honda civic cvcc', color='blue', size='9.0', marker='x'),
 Row(mpg='18.5', cyl='6', displ='250.0', hp='98', weight='3525', accel='19.0', yr='77', origin='US', name='ford granada', color='red', size='34.515625', marker='o'),
 Row(mpg='34.3', cyl='4', displ='97.0', hp='78', weight='2188', accel='15.8', yr='80', origin='Europe', name='audi 4000', color='blue', size='13.298177777777777', marker='s'),
 Row(mpg='32.9', cyl='4', displ='119.0', hp='100', weight='2615', accel='14.8', yr='81', origin='Asia', name='datsun 200sx', color='blue', size=

In [9]:
type(auto_rdd)

pyspark.rdd.RDD

## Dataframe sample will be List

In [10]:
auto_list = auto.take(1)
auto_list

[Row(mpg='18.0', cyl='6', displ='250.0', hp='88', weight='3139', accel='14.5', yr='71', origin='US', name='ford mustang', color='red', size='27.370336111111108', marker='o')]

In [11]:
type(auto_list)

list

## Convert List to Dataframe

In [12]:
auto_df = spark.sparkContext.parallelize(auto_list).toDF()
auto_df.show()

+----+---+-----+---+------+-----+---+------+------------+-----+------------------+------+
| mpg|cyl|displ| hp|weight|accel| yr|origin|        name|color|              size|marker|
+----+---+-----+---+------+-----+---+------+------------+-----+------------------+------+
|18.0|  6|250.0| 88|  3139| 14.5| 71|    US|ford mustang|  red|27.370336111111108|     o|
+----+---+-----+---+------+-----+---+------+------------+-----+------------------+------+



In [13]:
type(auto_df)

pyspark.sql.dataframe.DataFrame

## Convert List to RDD

In [14]:
auto_rdd = spark.sparkContext.parallelize(auto_list)
auto_rdd

ParallelCollectionRDD[28] at parallelize at PythonRDD.scala:489

In [15]:
auto_rdd.collect()

[Row(mpg='18.0', cyl='6', displ='250.0', hp='88', weight='3139', accel='14.5', yr='71', origin='US', name='ford mustang', color='red', size='27.370336111111108', marker='o')]

In [16]:
type(auto_rdd)

pyspark.rdd.RDD

## Convert RDD to Dataframe

In [17]:
auto_df = auto_rdd.toDF()
auto_df.show()

+----+---+-----+---+------+-----+---+------+------------+-----+------------------+------+
| mpg|cyl|displ| hp|weight|accel| yr|origin|        name|color|              size|marker|
+----+---+-----+---+------+-----+---+------+------------+-----+------------------+------+
|18.0|  6|250.0| 88|  3139| 14.5| 71|    US|ford mustang|  red|27.370336111111108|     o|
+----+---+-----+---+------+-----+---+------+------------+-----+------------------+------+



In [18]:
type(auto_df)

pyspark.sql.dataframe.DataFrame

## Column

In [19]:
auto['marker']

Column<b'marker'>

In [20]:
auto['marker'].getField('0')

Column<b"marker['0']">

In [21]:
auto['marker'].getItem(0)

Column<b'marker[0]'>