# 4.2 用 Pyspark 建立第一個RDD

In [1]:
from __future__ import print_function, division

## import pyspark

In [37]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

## 啟動 spark

In [38]:
spark = SparkSession.builder.master("local") \
   .appName("test") \
   .enableHiveSupport() \
   .getOrCreate()

sc = spark.sparkContext

## Part1. Create a RDD from sparkContext

In [4]:
wordsList = ['cat', 'elephant', 'rat', 'rat', 'cat']
wordsRDD = sc.parallelize(wordsList, 4)
# Print out the type of wordsRDD
print(type(wordsRDD))


<class 'pyspark.rdd.RDD'>


In [5]:
wordsRDD.collect()

['cat', 'elephant', 'rat', 'rat', 'cat']

## Part2 Create a Dataframe from hdfs

## put data into HDFS

In [7]:
!ls ../data

'ls' 不是内部或外部命令，也不是可运行的程序
或批处理文件。


In [5]:
!head ../data/NASA_access_log_Jul95_100

199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985
199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] "GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0" 200 4085
burger.letters.com - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/countdown/liftoff.html HTTP/1.0" 304 0
199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0" 200 4179
burger.letters.com - - [01/Jul/1995:00:00:12 -0400] "GET /images/NASA-logosmall.gif HTTP/1.0" 304 0
burger.letters.com - - [01/Jul/1995:00:00:12 -0400] "GET /shuttle/countdown/video/livevideo.gif HTTP/1.0" 200 0
205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] "GET /shuttle/countdown/countdown.html HTTP/1.0" 200 3985
d104.aa.net - - [01/Jul/1995:00:00:13 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985
129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] "GET / H

In [6]:
!hadoop fs -ls /

Found 2 items
drwx-wx-wx   - vagrant supergroup          0 2017-08-25 08:22 /tmp
drwxr-xr-x   - vagrant supergroup          0 2017-08-25 08:22 /user


In [9]:
!hadoop fs -put ../data/NASA_access_log_Jul95_100 /tmp

put: `/tmp/NASA_access_log_Jul95_100': File exists


In [16]:
!hadoop fs -ls /tmp

Found 2 items
-rw-r--r--   3 vagrant supergroup      10851 2017-09-25 03:44 /tmp/NASA_access_log_Jul95_100
drwx-wx-wx   - vagrant supergroup          0 2017-08-25 08:22 /tmp/hive


### 從 HDFS 中讀取資料

In [96]:
textFromHDFS = spark.read.text("hdfs:///tmp/NASA_access_log_Jul95_100")

In [95]:
print(type(textFromHDFS))

<class 'pyspark.sql.dataframe.DataFrame'>


In [97]:
textFromHDFS.head()

Row(value=u'199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245')

## Part3 Read csv format

In [26]:
!hadoop fs -tail /tmp/ratings.csv

userid,movieid,rating,ts
3,6539,5,1133571238
3,7153,4,1133571171
3,7155,3.5,1164885564
3,8529,4,1136075616
3,8533,4.5,1136418593
3,8783,5,1136075857
3,27821,4.5,1136418616
3,33750,3.5,1164885688
4,21,3,844416980
4,34,5,844416936
4,39,3,844417037
4,110,5,844416866
4,150,5,844416656
4,153,5,844416699
4,161,5,844416835
4,165,5,844416699
4,208,3,844416866
4,231,1,844416742
4,253,3,844416834
4,266,5,844417070
4,292,3,844416796
4,316,5,844416742
4,317,5,844417037
4,329,5,844416796
4,344,2,844416699
4,349,3,844416699


In [98]:
path = "hdfs:///tmp/ratings.csv"
schema = None 
sep = None
header = True

In [13]:
import os 
print(os.getcwd())

C:\Users\Administrator\Documents\jupyter notebook\Learning_spark\spark_core


In [6]:
path = r"C:\Users\Administrator\Documents\jupyter notebook\Learning_spark\data\ratings.csv"
schema = None 
sep = None
header = True

In [7]:
csvDF = spark.read.csv(path = path, schema = schema, sep = sep, header = header)

In [8]:
print(type(csvDF))

<class 'pyspark.sql.dataframe.DataFrame'>


In [9]:
print(csvDF)

DataFrame[userid: string, movieid: string, rating: string, ts: string]


In [10]:
csvDF.head()

Row(userid='3', movieid='6539', rating='5', ts='1133571238')

In [11]:
csvDF.take(4)

[Row(userid='3', movieid='6539', rating='5', ts='1133571238'),
 Row(userid='3', movieid='7153', rating='4', ts='1133571171'),
 Row(userid='3', movieid='7155', rating='3.5', ts='1164885564'),
 Row(userid='3', movieid='8529', rating='4', ts='1136075616')]

### comapre with read.text

In [12]:
textDF = spark.read.text(paths = path)

In [13]:
textDF

DataFrame[value: string]

In [14]:
textDF.head()

Row(value='userid,movieid,rating,ts')

In [15]:
textDF.take(5)

[Row(value='userid,movieid,rating,ts'),
 Row(value='3,6539,5,1133571238'),
 Row(value='3,7153,4,1133571171'),
 Row(value='3,7155,3.5,1164885564'),
 Row(value='3,8529,4,1136075616')]

## Part4. Read Json file

In [16]:
jsonDF = spark.read.json(r'C:\Users\Administrator\Documents\jupyter notebook\Learning_spark\data\json_example.json')

In [17]:
jsonDF

DataFrame[movieid: string, rating: bigint, userid: string]

In [22]:
jsonDF.head()

Row(movieid='001', rating=4, userid='1')

In [23]:
type(jsonDF)

pyspark.sql.dataframe.DataFrame

## Part5. RDD 與 DataFrame 的轉換

In [24]:
jsonRDD = jsonDF.rdd

In [27]:
print(type(jsonDF))
print(type(jsonRDD))

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pyspark.rdd.RDD'>


In [28]:
jsonRDD.collect()

[Row(movieid='001', rating=4, userid='1'),
 Row(movieid='002', rating=3, userid='1'),
 Row(movieid='001', rating=4, userid='2'),
 Row(movieid='003', rating=2, userid='2')]

In [30]:
jsonRDD.count()

Py4JError: An error occurred while calling None.org.apache.spark.api.python.PythonRDD. Trace:
py4j.Py4JException: Constructor org.apache.spark.api.python.PythonRDD([class org.apache.spark.rdd.MapPartitionsRDD, class org.apache.spark.api.python.PythonFunction, class java.lang.Boolean]) does not exist
	at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:179)
	at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:196)
	at py4j.Gateway.invoke(Gateway.java:237)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



In [31]:
jsonRDD.take(1)

Py4JError: An error occurred while calling None.org.apache.spark.api.python.PythonRDD. Trace:
py4j.Py4JException: Constructor org.apache.spark.api.python.PythonRDD([class org.apache.spark.rdd.MapPartitionsRDD, class org.apache.spark.api.python.PythonFunction, class java.lang.Boolean]) does not exist
	at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:179)
	at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:196)
	at py4j.Gateway.invoke(Gateway.java:237)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



In [32]:
jsonDF2 = spark.createDataFrame(jsonRDD)

Py4JError: An error occurred while calling None.org.apache.spark.api.python.PythonRDD. Trace:
py4j.Py4JException: Constructor org.apache.spark.api.python.PythonRDD([class org.apache.spark.rdd.MapPartitionsRDD, class org.apache.spark.api.python.PythonFunction, class java.lang.Boolean]) does not exist
	at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:179)
	at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:196)
	at py4j.Gateway.invoke(Gateway.java:237)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



In [123]:
type(jsonDF2)

pyspark.sql.dataframe.DataFrame

In [130]:
jsonDF2.head()

Row(movieid=u'001', rating=4, userid=u'1')

In [39]:
sc.stop()