## Import Libraries

In [11]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Reuse the running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()
# Obtain the session through the public builder API instead of calling the
# SparkSession constructor directly: getOrCreate() returns the already
# active session when one exists, so re-running this cell is safe.
spark = SparkSession.builder.getOrCreate()

## Raw Read without Schema

In [3]:
# Plain read with no options: columns come back as _c0, _c1, ... and are
# all typed as string.
df = spark.read.csv(
    'file:/home/jovyan/work/learning-apache-spark-main/data/bookcontents.csv'
)

In [4]:
# Show the column names and types Spark assigned to the raw read.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [5]:
# Show the rows; with no header option set, the file's header line
# (Chapter, Name, Page) appears as the first data row.
df.show()

+-------+--------------------+----+
|    _c0|                 _c1| _c2|
+-------+--------------------+----+
|Chapter|                Name|Page|
|      1|        Introduction|  11|
|      2|Basic Engineering...|  19|
|      3|Advanced Engineer...|  28|
|      4|     Hands On Course|  60|
|      5|        Case Studies|  62|
|      6|Best Practices Cl...|  73|
|      7|130+ Data Sources...|  77|
|      8|1001 Interview Qu...|  82|
|      9|Recommended Books...|  87|
+-------+--------------------+----+



## Infer a Schema with a Header Row

In [8]:
# Use the first line of the file as column names and let Spark infer
# each column's type from the data.
headerDf = (
    spark.read
    .option('inferSchema', 'true')
    .option('header', 'true')
    .csv('file:/home/jovyan/work/learning-apache-spark-main/data/bookcontents.csv')
)

In [9]:
# Check the column names taken from the header and the inferred types.
headerDf.printSchema()

root
 |-- Chapter: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Page: integer (nullable = true)



In [10]:
# Show the rows; the header line now supplies the column names instead of
# appearing as data.
headerDf.show()

+-------+--------------------+----+
|Chapter|                Name|Page|
+-------+--------------------+----+
|      1|        Introduction|  11|
|      2|Basic Engineering...|  19|
|      3|Advanced Engineer...|  28|
|      4|     Hands On Course|  60|
|      5|        Case Studies|  62|
|      6|Best Practices Cl...|  73|
|      7|130+ Data Sources...|  77|
|      8|1001 Interview Qu...|  82|
|      9|Recommended Books...|  87|
+-------+--------------------+----+



## Create a Manual Schema 

In [12]:
# Wildcard import of Spark's SQL types; the cells below use StructType,
# StructField, IntegerType and StringType from it.
from pyspark.sql.types import *

In [14]:
# Check which column types Spark infers for the headerless file, as a
# reference for the schema declared by hand below.
(
    spark.read
    .option('inferSchema', 'true')
    .csv('file:/home/jovyan/work/learning-apache-spark-main/data/bookcontentsNoHeader.csv')
    .printSchema()
)

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: integer (nullable = true)



In [15]:
# One StructField per CSV column, in file order: name, type, nullability.
# nullable=True is StructField's default, spelled out here for clarity.
columns = [
    StructField("Chapter", IntegerType(), nullable=True),
    StructField("Name", StringType(), nullable=True),
    StructField("Page", IntegerType(), nullable=True),
]
# Wrap the field list in the StructType that the reader accepts as a schema.
csvschema = StructType(columns)

In [None]:
# Read the headerless file with the hand-written schema instead of
# asking Spark to infer one.
manual_schema_df = (
    spark.read
    .schema(csvschema)
    .csv('file:/home/jovyan/work/learning-apache-spark-main/data/bookcontentsNoHeader.csv')
)