In [1]:

# SparkSession is the entry point used by every cell below.
from pyspark.sql import SparkSession

# Start (or reuse) a session that runs locally with a single worker thread
# under the application name "practice 2".
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("practice 2")
    .getOrCreate()
)


In [2]:
# Evaluate the session object so the notebook renders it as this cell's output.
spark

### 1) Create Empty DataFrame with Schema

In [3]:
from pyspark.sql.types import StringType, StructField, StructType

# Schema with three nullable string columns, one StructField per name.
schema1 = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ('firstname', 'middlename', 'lastname')
])


In [4]:
# Build a DataFrame that carries the schema1 columns but zero rows, then
# print its structure to confirm the columns exist.
df2 = spark.createDataFrame(data=[], schema=schema1)
df2.printSchema()


root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)



### 2) Create empty DataFrame with no schema (no columns)

In [5]:

df3 = spark.createDataFrame([], StructType([]))
df3.printSchema()

# With an empty StructType there are no columns, so printSchema() prints
# only the header line:
# root


root



### 3) DataFrame from RDD

#### RDD

In [6]:
from datetime import date

# Sample records; each tuple holds an int, a float, a name string and a date.
a = [
    (1, 1.0, 'shafi', date(2022, 1, 19)),
    (3, 4.0, 'ravi', date(2022, 12, 31)),
    (4, 5.0, 'karan', date(2022, 11, 1)),
    (2, 14.78, 'nadeem', date(2022, 11, 4)),
    (22, 13.0, 'mir', date(2022, 7, 9)),
]

# Turn the local Python list into an RDD via the session's SparkContext.
rdd2 = spark.sparkContext.parallelize(a)

# Last expression of the cell: show the Python type of the RDD object.
type(rdd2)

pyspark.rdd.RDD

#### a. using toDF()

In [7]:
# Column names only; the column types are inferred from the RDD's values.
cols = ['Sno', 'GPA', 'Name', 'DOJ']

# Convert the RDD to a DataFrame, naming the columns from `cols`.
df = rdd2.toDF(schema=cols)

# Inspect the inferred structure first, then the rows themselves.
df.printSchema()
df.show()


root
 |-- Sno: long (nullable = true)
 |-- GPA: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- DOJ: date (nullable = true)

+---+-----+------+----------+
|Sno|  GPA|  Name|       DOJ|
+---+-----+------+----------+
|  1|  1.0| shafi|2022-01-19|
|  3|  4.0|  ravi|2022-12-31|
|  4|  5.0| karan|2022-11-01|
|  2|14.78|nadeem|2022-11-04|
| 22| 13.0|   mir|2022-07-09|
+---+-----+------+----------+



#### b. using createDataFrame(rdd,schema)

In [8]:
# Same result as toDF(cols), this time going through the session's
# createDataFrame with the list of column names as the schema.
df = spark.createDataFrame(data=rdd2, schema=cols)

df.show()


+---+-----+------+----------+
|Sno|  GPA|  Name|       DOJ|
+---+-----+------+----------+
|  1|  1.0| shafi|2022-01-19|
|  3|  4.0|  ravi|2022-12-31|
|  4|  5.0| karan|2022-11-01|
|  2|14.78|nadeem|2022-11-04|
| 22| 13.0|   mir|2022-07-09|
+---+-----+------+----------+



#### c. using createDataFrame along with StructType

In [9]:
from pyspark.sql.types import StructField, StructType, IntegerType, DateType

# Explicit schema assembled one column at a time: (name, type, nullable).
# StringType is not imported in this cell; it comes from the earlier
# pyspark.sql.types import.
# NOTE(review): CGPA is declared as a string although rdd2 holds Python
# floats in that position -- confirm a string column is really intended.
deptSchema = (
    StructType()
    .add('S no', IntegerType(), True)
    .add('CGPA', StringType(), True)
    .add('Name', StringType(), True)
    .add('DOJ', DateType(), False)
)

# Apply the explicit schema to the RDD instead of letting Spark infer types.
df = spark.createDataFrame(data=rdd2, schema=deptSchema)


In [10]:
# Render the rows of the DataFrame built with deptSchema.
df.show()

+----+-----+------+----------+
|S no| CGPA|  Name|       DOJ|
+----+-----+------+----------+
|   1|  1.0| shafi|2022-01-19|
|   3|  4.0|  ravi|2022-12-31|
|   4|  5.0| karan|2022-11-01|
|   2|14.78|nadeem|2022-11-04|
|  22| 13.0|   mir|2022-07-09|
+----+-----+------+----------+



In [11]:
# Print the column names, types and nullability that deptSchema imposed.
df.printSchema()

root
 |-- S no: integer (nullable = true)
 |-- CGPA: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- DOJ: date (nullable = false)



### 4) DataFrame with Ad Hoc Data (on the fly)

In [12]:

# Ad hoc sample rows, one tuple per person; empty strings mark missing
# name parts.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

# Only names are supplied, so Spark infers every column type from the values.
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

df = spark.createDataFrame(data, columns)


In [13]:
# Print the schema Spark inferred from the ad hoc tuples.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



### 5) DataFrame from external data sources

In [14]:
import os

# Location of the Titanic CSV. The original machine-specific path stays the
# default; set the TITANIC_CSV_PATH environment variable to run this cell on
# another machine without editing the notebook.
titanic_csv_path = os.environ.get("TITANIC_CSV_PATH", "S:/Datasets/csv/titanic.csv")

# header=True takes the column names from the first row; inferSchema=True
# asks Spark to derive each column's type from the data.
df = spark.read.csv(titanic_csv_path, header=True, inferSchema=True)
df.printSchema()


root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [15]:
# Preview only the first 4 rows of the loaded CSV.
df.show(4)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 4 rows



# THE END