In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the session (or reuse one that is already running); every later
# cell reaches Spark through this `spark` handle.
spark = (
    SparkSession.builder
    .appName('RW opn')
    .getOrCreate()
)

In [3]:
# Sample rows: one [name, major, score] triple per student.
data = [
    ["Sunil", "Maths", 85],
    ["Sravan", "Science", 89],
    ["Sravanika", "English", 78],
    ["Ramesh", "History", 85],
    ["Sushil", "Maths", 75],
    ["Matthew", "Science", 96],
]

columns = ["Name", "Major", "Score"]

# Column types are inferred from the values; the names come from `columns`.
df = spark.createDataFrame(data, schema=columns)
df.show()
df.printSchema()
print("The total number of partions is: ", df.rdd.getNumPartitions())

# Repartition to 3 so that the later writes are spread over 3 part files in HDFS.
df = df.repartition(3)
# Last expression of the cell: displays the new partition count.
df.rdd.getNumPartitions()


                                                                                

+---------+-------+-----+
|     Name|  Major|Score|
+---------+-------+-----+
|    Sunil|  Maths|   85|
|   Sravan|Science|   89|
|Sravanika|English|   78|
|   Ramesh|History|   85|
|   Sushil|  Maths|   75|
|  Matthew|Science|   96|
+---------+-------+-----+

root
 |-- Name: string (nullable = true)
 |-- Major: string (nullable = true)
 |-- Score: long (nullable = true)

The total number of partions is:  12


3

In [4]:
# Write the same DataFrame to HDFS once per format.
#
# Every write below replaces any previous output. Without an explicit save
# mode Spark uses "errorifexists", so re-running this cell would abort on the
# first directory that is already there.

# CSV, with the column names as a header line in every part file.
df.write.mode("overwrite").option("header", True).csv("hdfs://localhost:9000/spark/csv/")

# Plain text. A DataFrame must first be reduced to a single string column or
# converted to an RDD; here the RDD route is used. saveAsTextFile has no
# overwrite mode and fails if the directory exists, so a leftover directory
# from an earlier run is removed through the Hadoop FileSystem API first.
text_dir = "hdfs://localhost:9000/spark/text/"
text_path = spark.sparkContext._jvm.org.apache.hadoop.fs.Path(text_dir)
text_fs = text_path.getFileSystem(spark.sparkContext._jsc.hadoopConfiguration())
if text_fs.exists(text_path):
    text_fs.delete(text_path, True)  # True = delete recursively
df.rdd.saveAsTextFile(text_dir)

# JSON (one JSON object per line).
df.write.mode("overwrite").json(path='hdfs://localhost:9000/spark/json/')

# Parquet. The directory name "paquet" is kept as-is because the read cell
# loads the data back from the same path.
df.write.mode("overwrite").parquet("hdfs://localhost:9000/spark/paquet/")

# Avro. NOTE(review): the "avro" source is provided by the external
# spark-avro package - confirm it is on the session's classpath.
df.write.format("avro").mode("overwrite").save("hdfs://localhost:9000/spark/avro/")


                                                                                

In [11]:
# Read every format back in: the copies stored in HDFS, plus local sample
# files for JSON, Parquet and Avro.

# CSV part files from HDFS; the first line of each file is the header.
print("csv--")
new_df = spark.read.csv("hdfs://localhost:9000/spark/csv/*.csv", header=True)
new_df.show()

# Text files come back as an RDD of lines, not a DataFrame.
# Note: this glob matches *.txt files directly under /spark/, not the part
# files that the write cell saved under /spark/text/.
print("text--")
new_df = spark.sparkContext.textFile("hdfs://localhost:9000/spark/*.txt")
print(new_df.collect())

# JSON part files from HDFS.
print("json---")
new_df = spark.read.load("hdfs://localhost:9000/spark/json/*.json", format="json")
new_df.show()
new_df.printSchema()

# JSON sample from the local path; multiLine lets one record span several lines.
new_df = spark.read.json("/home/hdoop/Testing/StudentJson.json", multiLine=True)
new_df.show(15)
new_df.printSchema()

# Parquet part files from HDFS.
print("parquet--")
new_df = spark.read.load("hdfs://localhost:9000/spark/paquet/*.parquet", format="parquet")
new_df.show()
new_df.printSchema()

# Parquet sample from the local path.
new_df = spark.read.load("/home/hdoop/Testing/ParquetSample.parquet", format="parquet")
new_df.show(15)
new_df.printSchema()


# Avro part files from HDFS.
print("avro----")
new_df = spark.read.load("hdfs://localhost:9000/spark/avro/*.avro", format="avro")
new_df.show()
new_df.printSchema()


# Avro sample from the local path.
new_df = spark.read.load("/home/hdoop/Testing/AvroSample.avro", format="avro")
new_df.show(15)
new_df.printSchema()


csv--
+---------+-------+-----+
|     Name|  Major|Score|
+---------+-------+-----+
|   Sravan|Science|   89|
|Sravanika|English|   78|
|    Sunil|  Maths|   85|
|  Matthew|Science|   96|
|   Ramesh|History|   85|
|   Sushil|  Maths|   75|
+---------+-------+-----+

text--
['this is just a simple text file generated to check the working of importing', 'from hdfs using pyspark']
json---
+-------+---------+-----+
|  Major|     Name|Score|
+-------+---------+-----+
|Science|   Sravan|   89|
|English|Sravanika|   78|
|  Maths|    Sunil|   85|
|Science|  Matthew|   96|
|History|   Ramesh|   85|
|  Maths|   Sushil|   75|
+-------+---------+-----+

root
 |-- Major: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Score: long (nullable = true)

+---+--------------+--------+---------+------+------+------+---+--------+--------+----+--------------+-------------+
|Age|          City| Country|FirstName|Gender|Grade |Height| ID|LastName|   Major| SAT|         State|StudentStatus|
+-

In [4]:
# Load the local Parquet sample and preview the first 15 rows and its schema.
new_df = spark.read.parquet("/home/hdoop/Testing/ParquetSample.parquet")
new_df.show(15)
new_df.printSchema()



# Load the local Avro sample the same way; `new_df` now refers to the Avro data.
new_df = spark.read.format("avro").load("/home/hdoop/Testing/AvroSample.avro")
new_df.show(15)
new_df.printSchema()


# Expose the Avro DataFrame to Spark SQL under the view name `avro`.
new_df.createOrReplaceTempView('avro')

# Query the view just registered. The statement previously ended at "from"
# with no table name, which Spark SQL rejects with a parse error.
avrodf = spark.sql('select * from avro')


                                                                                

+-------------------+---+----------+----------+--------------------+------+---------------+-------------------+---------+----------+---------+--------------------+--------------------+
|  registration_dttm| id|first_name| last_name|               email|gender|     ip_address|                 cc|  country| birthdate|   salary|               title|            comments|
+-------------------+---+----------+----------+--------------------+------+---------------+-------------------+---------+----------+---------+--------------------+--------------------+
|2016-02-03 21:37:46|  1|    Ernest|    Fuller|efuller0@examiner...|  Male|   106.72.28.74|   5610608195667267|   Israel|          |140639.36|                    |                    |
|2016-02-04 03:22:07|  2|   Anthony|    Foster|  afoster1@weibo.com|  Male|156.243.130.166|   4508242795214771|Indonesia| 1/16/1998|172843.61|        Developer II|👾 🙇 💁 🙅 🙆 🙋...|
|2016-02-03 07:52:19|  3|      Ryan|Montgomery|rmontgomery2@mozi...|  Male|  28.5