In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Optional: findspark is used so that a local Spark installation can be found when working in Jupyter.
# NOTE(review): findspark.init() is normally called *before* `import pyspark`; here pyspark was
# already imported in the previous cell, so this call presumably has no effect — consider moving it first.
import findspark
findspark.init()

In [3]:
# Build the SparkSession used by the rest of the notebook:
# master "local", application name "MySchoolApp".
spark = (
    SparkSession.builder
    .master("local")
    .appName("MySchoolApp")
    .getOrCreate()
)

In [4]:
# Evaluate the session object so the notebook renders it as the cell output
spark

In [5]:
# Check the Python type of the session object
type(spark)

pyspark.sql.session.SparkSession

In [6]:
# Spark version reported by the session
spark.version

'3.3.0'

In [7]:
# The same version string, read from the session's underlying SparkContext
spark.sparkContext.version

'3.3.0'

In [8]:
# Master setting of the context; matches the "local" passed to builder.master()
spark.sparkContext.master

'local'

In [9]:
# Application name; matches the value passed to appName()
spark.sparkContext.appName

'MySchoolApp'

In [10]:
# Read the CSV file with default options (no header handling).
# As the show() output below reveals, the header line is then treated as a data row
# and the columns get generated names (_c0, _c1, ...).
schlDF1 = spark.read.csv(path="MySchool_v1.csv") 

In [11]:
# read.csv() returns a Spark DataFrame
type(schlDF1)

pyspark.sql.dataframe.DataFrame

In [12]:
# Print the leading rows of the DataFrame as a text table
schlDF1.show() 

+------+-----+-----+------+--------+-------------+-----------+---------------+
|   _c0|  _c1|  _c2|   _c3|     _c4|          _c5|        _c6|            _c7|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|
|   9_2| Jaki|    9|     M|  912350|           92|         82|             94|
|   9_3| Maki|    9|     F|  912351|           93|         83|             88|
|  10_1| Paki|   10|     M|  912352|           82|         94|             84|
|  10_2|Bakhi|   10|     M|  912353|           87|  

In [13]:
# Limit the preview to 5 rows; note that the header line still counts as a data row here
schlDF1.show(5) 

+------+-----+-----+------+--------+-------------+-----------+---------------+
|   _c0|  _c1|  _c2|   _c3|     _c4|          _c5|        _c6|            _c7|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|
+------+-----+-----+------+--------+-------------+-----------+---------------+
only showing top 5 rows



The output above shows that the first row of the file was not treated as a header: it appears as a data row, and the columns are named _c0 … _c7. To use the first row as the column names, pass header=True to read.csv(), e.g. spark.read.csv(path, header=True).

In [14]:
# Read the CSV file again, this time using its first line as the column names (header=True).
# NOTE(review): every column is still read as a string (see the dtypes / printSchema output
# further down); consider inferSchema=True or an explicit schema if numeric types are needed.
schlDF2 = spark.read.csv(path="MySchool_v1.csv",header=True) 

In [15]:
# Preview the first 5 rows; the header line is now used for the column names
schlDF2.show(5)

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|
+------+-----+-----+------+--------+-------------+-----------+---------------+
only showing top 5 rows



In [16]:
schlDF2.show(5,truncate=False) # by default truncate=True
# truncate=False prints the full text of every cell (and left-aligns it, as the output shows)

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo|Name |Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|8_1   |Nikhi|8    |M     |912345  |95           |100        |90             |
|8_2   |Akhi |8    |M     |912346  |96           |100        |85             |
|8_3   |Sakhi|8    |F     |912347  |79           |100        |80             |
|8_4   |Rakhi|8    |M     |912348  |66           |95         |98             |
|9_1   |Kaki |9    |F     |912349  |88           |85         |93             |
+------+-----+-----+------+--------+-------------+-----------+---------------+
only showing top 5 rows



In [17]:
schlDF2.show(5,truncate=1) # truncate=1 cuts every cell down to its first character

+------+----+-----+------+--------+-------------+-----------+---------------+
|RollNo|Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+----+-----+------+--------+-------------+-----------+---------------+
|     8|   N|    8|     M|       9|            9|          1|              9|
|     8|   A|    8|     M|       9|            9|          1|              8|
|     8|   S|    8|     F|       9|            7|          1|              8|
|     8|   R|    8|     M|       9|            6|          9|              9|
|     9|   K|    9|     F|       9|            8|          8|              9|
+------+----+-----+------+--------+-------------+-----------+---------------+
only showing top 5 rows



In [18]:
# head() without an argument returns the first row as a single Row object
schlDF2.head()

Row(RollNo='8_1', Name='Nikhi', Class='8', Gender='M', Phone No='912345', English Marks='95', Maths Marks='100', Computers Marks='90')

In [19]:
# Confirm that head() with no argument yields a Row (not a list)
type(schlDF2.head())

pyspark.sql.types.Row

In [20]:
# head(n) returns the first n rows as a list of Row objects
schlDF2.head(2)

[Row(RollNo='8_1', Name='Nikhi', Class='8', Gender='M', Phone No='912345', English Marks='95', Maths Marks='100', Computers Marks='90'),
 Row(RollNo='8_2', Name='Akhi', Class='8', Gender='M', Phone No='912346', English Marks='96', Maths Marks='100', Computers Marks='85')]

In [21]:
# tail(n) returns the last n rows as a list of Row objects
schlDF2.tail(2) 

[Row(RollNo='10_4', Name='Laki', Class='10', Gender='F', Phone No='912355', English Marks='98', Maths Marks='96', Computers Marks='96'),
 Row(RollNo='10_5', Name='Gakhi', Class='10', Gender='F', Phone No='912356', English Marks='95', Maths Marks='95', Computers Marks='95')]

In [22]:
# Unlike head(), tail() has no default row count: calling it without `num`
# raises TypeError. The error is caught and printed so that this demonstration
# does not abort a "Restart & Run All" of the notebook.
try:
    schlDF2.tail()
except TypeError as exc:
    print(f"TypeError: {exc}")

TypeError: tail() missing 1 required positional argument: 'num'

In [23]:
# take(n) behaves like head(n), but the row count is mandatory: omitting it raises an error
schlDF2.take(2) 

[Row(RollNo='8_1', Name='Nikhi', Class='8', Gender='M', Phone No='912345', English Marks='95', Maths Marks='100', Computers Marks='90'),
 Row(RollNo='8_2', Name='Akhi', Class='8', Gender='M', Phone No='912346', English Marks='96', Maths Marks='100', Computers Marks='85')]

In [24]:
# take() requires the `num` argument; calling it without one raises TypeError.
# The error is caught and printed so that this demonstration does not abort
# a "Restart & Run All" of the notebook.
try:
    schlDF2.take()
except TypeError as exc:
    print(f"TypeError: {exc}")

TypeError: take() missing 1 required positional argument: 'num'

In [25]:
schlDF2.collect() # collect() returns every row of the dataset as a list of Row objects (fine for this 12-row file)

[Row(RollNo='8_1', Name='Nikhi', Class='8', Gender='M', Phone No='912345', English Marks='95', Maths Marks='100', Computers Marks='90'),
 Row(RollNo='8_2', Name='Akhi', Class='8', Gender='M', Phone No='912346', English Marks='96', Maths Marks='100', Computers Marks='85'),
 Row(RollNo='8_3', Name='Sakhi', Class='8', Gender='F', Phone No='912347', English Marks='79', Maths Marks='100', Computers Marks='80'),
 Row(RollNo='8_4', Name='Rakhi', Class='8', Gender='M', Phone No='912348', English Marks='66', Maths Marks='95', Computers Marks='98'),
 Row(RollNo='9_1', Name='Kaki', Class='9', Gender='F', Phone No='912349', English Marks='88', Maths Marks='85', Computers Marks='93'),
 Row(RollNo='9_2', Name='Jaki', Class='9', Gender='M', Phone No='912350', English Marks='92', Maths Marks='82', Computers Marks='94'),
 Row(RollNo='9_3', Name='Maki', Class='9', Gender='F', Phone No='912351', English Marks='93', Maths Marks='83', Computers Marks='88'),
 Row(RollNo='10_1', Name='Paki', Class='10', Gend

In [26]:
# List the column names of the dataset
schlDF2.columns

['RollNo',
 'Name',
 'Class',
 'Gender',
 'Phone No',
 'English Marks',
 'Maths Marks',
 'Computers Marks']

In [27]:
# .columns is a plain Python list
type(schlDF2.columns)

list

In [28]:
# Number of columns in the dataset
len(schlDF2.columns)

8

In [29]:
# Check the columns and their data types (returned as (name, type) pairs).
# Every column is 'string' because the file was read without schema inference.
schlDF2.dtypes

[('RollNo', 'string'),
 ('Name', 'string'),
 ('Class', 'string'),
 ('Gender', 'string'),
 ('Phone No', 'string'),
 ('English Marks', 'string'),
 ('Maths Marks', 'string'),
 ('Computers Marks', 'string')]

In [30]:
# Print the schema as a tree: column name, type and nullability
schlDF2.printSchema()

root
 |-- RollNo: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Class: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Phone No: string (nullable = true)
 |-- English Marks: string (nullable = true)
 |-- Maths Marks: string (nullable = true)
 |-- Computers Marks: string (nullable = true)



In [31]:
# Count the number of rows in the dataset
schlDF2.count()

12

In [32]:
# Preview the data again before selecting individual columns
schlDF2.show(5)

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|
+------+-----+-----+------+--------+-------------+-----------+---------------+
only showing top 5 rows



In [33]:
# Show only the required columns, passing the column names as separate string arguments
schlDF2.select("RollNo","Name").show(5)

+------+-----+
|RollNo| Name|
+------+-----+
|   8_1|Nikhi|
|   8_2| Akhi|
|   8_3|Sakhi|
|   8_4|Rakhi|
|   9_1| Kaki|
+------+-----+
only showing top 5 rows



In [34]:
# select() also accepts a single list of column names; the result is the same as above
schlDF2.select(["RollNo","Name"]).show(5)

+------+-----+
|RollNo| Name|
+------+-----+
|   8_1|Nikhi|
|   8_2| Akhi|
|   8_3|Sakhi|
|   8_4|Rakhi|
|   9_1| Kaki|
+------+-----+
only showing top 5 rows



In [35]:
# Columns can also be referenced as attributes of the DataFrame (schlDF2.<column name>)
schlDF2.select(schlDF2.RollNo,schlDF2.Name,schlDF2.Class).show(5)

+------+-----+-----+
|RollNo| Name|Class|
+------+-----+-----+
|   8_1|Nikhi|    8|
|   8_2| Akhi|    8|
|   8_3|Sakhi|    8|
|   8_4|Rakhi|    8|
|   9_1| Kaki|    9|
+------+-----+-----+
only showing top 5 rows



In [36]:
# Attribute access (schlDF2.<name>) only works for column names that are valid
# Python identifiers. "Phone No" contains a space, so the statement below cannot
# even be parsed. It is kept as a string and handed to compile() so that the
# SyntaxError is displayed without aborting a "Restart & Run All".
# Use a string name or col() for such columns instead.
invalid_select = 'schlDF2.select(schlDF2.RollNo,schlDF2.Name,schlDF2.Phone No).show(5)'
try:
    compile(invalid_select, "<cell>", "exec")
except SyntaxError as exc:
    syntax_error_text = f"SyntaxError: {exc}"
    print(syntax_error_text)

SyntaxError: invalid syntax (<ipython-input-36-61f69aa9eac2>, line 1)

In [37]:
# A column whose name contains a space can be selected by its string name;
# attribute references and string names can be mixed in one select()
schlDF2.select(schlDF2.RollNo,schlDF2.Name,"Phone No").show(5)

+------+-----+--------+
|RollNo| Name|Phone No|
+------+-----+--------+
|   8_1|Nikhi|  912345|
|   8_2| Akhi|  912346|
|   8_3|Sakhi|  912347|
|   8_4|Rakhi|  912348|
|   9_1| Kaki|  912349|
+------+-----+--------+
only showing top 5 rows



In [38]:
from pyspark.sql.functions import col

# col() refers to a column by name, so it also works for names containing a space
schlDF2.select(schlDF2.RollNo,schlDF2.Name,col("Phone No")).show(5)

+------+-----+--------+
|RollNo| Name|Phone No|
+------+-----+--------+
|   8_1|Nikhi|  912345|
|   8_2| Akhi|  912346|
|   8_3|Sakhi|  912347|
|   8_4|Rakhi|  912348|
|   9_1| Kaki|  912349|
+------+-----+--------+
only showing top 5 rows



In [39]:
# describe() displays summary statistics per column: count, mean, stddev, min, max.
# NOTE: all columns are strings here, so min/max are compared as text
# (e.g. Class shows min '10' and max '9' in the output), while mean/stddev
# are reported only for columns whose values look numeric.
schlDF2.describe().show()

+-------+------+----+-----------------+------+-----------------+-----------------+-----------------+-----------------+
|summary|RollNo|Name|            Class|Gender|         Phone No|    English Marks|      Maths Marks|  Computers Marks|
+-------+------+----+-----------------+------+-----------------+-----------------+-----------------+-----------------+
|  count|    12|  12|               12|    12|               12|               12|               12|               12|
|   mean|  null|null|9.083333333333334|  null|         912350.5|87.66666666666667|             92.0|89.08333333333333|
| stddev|  null|null|0.900336637378521|  null|3.605551275463989|9.306237725636532|6.822422923379536| 6.74817596005608|
|    min|  10_1|Akhi|               10|     F|           912345|               66|              100|               76|
|    max|   9_3|Taki|                9|     M|           912356|               98|               96|               98|
+-------+------+----+-----------------+------+--

In [40]:
# summary() displays the same statistics as describe() (count, mean, stddev, min, max)
# plus the 25%, 50% and 75% percentiles of each column

schlDF2.summary().show()

+-------+------+----+-----------------+------+-----------------+-----------------+-----------------+-----------------+
|summary|RollNo|Name|            Class|Gender|         Phone No|    English Marks|      Maths Marks|  Computers Marks|
+-------+------+----+-----------------+------+-----------------+-----------------+-----------------+-----------------+
|  count|    12|  12|               12|    12|               12|               12|               12|               12|
|   mean|  null|null|9.083333333333334|  null|         912350.5|87.66666666666667|             92.0|89.08333333333333|
| stddev|  null|null|0.900336637378521|  null|3.605551275463989|9.306237725636532|6.822422923379536| 6.74817596005608|
|    min|  10_1|Akhi|               10|     F|           912345|               66|              100|               76|
|    25%|  null|null|              8.0|  null|         912347.0|             81.0|             85.0|             84.0|
|    50%|  null|null|              9.0|  null|  

In [41]:
# summary() also accepts the statistics to compute; here a custom set of percentiles is requested
schlDF2.summary("0%","1%","10%","20%","25%","30%").show()

+-------+------+----+-----+------+--------+-------------+-----------+---------------+
|summary|RollNo|Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+-------+------+----+-----+------+--------+-------------+-----------+---------------+
|     0%|  null|null|  8.0|  null|912345.0|         66.0|       82.0|           76.0|
|     1%|  null|null|  8.0|  null|912345.0|         66.0|       82.0|           76.0|
|    10%|  null|null|  8.0|  null|912346.0|         79.0|       83.0|           80.0|
|    20%|  null|null|  8.0|  null|912347.0|         81.0|       85.0|           84.0|
|    25%|  null|null|  8.0|  null|912347.0|         81.0|       85.0|           84.0|
|    30%|  null|null|  8.0|  null|912348.0|         82.0|       86.0|           85.0|
+-------+------+----+-----+------+--------+-------------+-----------+---------------+



In [42]:
# Preview the data again before renaming / adding columns
schlDF2.show(5)

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|
+------+-----+-----+------+--------+-------------+-----------+---------------+
only showing top 5 rows



In [43]:
# Rename a column inside a select(): alias() gives the selected column a new name.
# Only the selected column appears in the result (see the one-column output).

# Equivalent form using col():
#schlDF2.select(col("Name").alias("Student Name")).show(2)

schlDF2.select(schlDF2.Name.alias("Student Name")).show(2)

+------------+
|Student Name|
+------------+
|       Nikhi|
|        Akhi|
+------------+
only showing top 2 rows



In [44]:
# Rename a column while keeping all the other columns: withColumnRenamed(old, new).
# The result is not assigned, so schlDF2 itself keeps the original column name.
schlDF2.withColumnRenamed("Name","NewName").show(3)

+------+-------+-----+------+--------+-------------+-----------+---------------+
|RollNo|NewName|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-------+-----+------+--------+-------------+-----------+---------------+
|   8_1|  Nikhi|    8|     M|  912345|           95|        100|             90|
|   8_2|   Akhi|    8|     M|  912346|           96|        100|             85|
|   8_3|  Sakhi|    8|     F|  912347|           79|        100|             80|
+------+-------+-----+------+--------+-------------+-----------+---------------+
only showing top 3 rows



In [46]:
# Add a column that holds the same constant value in every row.
# lit() turns the Python literal into a Column; note the value here is the string "100".

from pyspark.sql.functions import lit

schlDF2.withColumn("ConstantColumn",lit("100")).show(5)

+------+-----+-----+------+--------+-------------+-----------+---------------+--------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|ConstantColumn|
+------+-----+-----+------+--------+-------------+-----------+---------------+--------------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|           100|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|           100|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|           100|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|           100|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|           100|
+------+-----+-----+------+--------+-------------+-----------+---------------+--------------+
only showing top 5 rows



In [47]:
# Add a new column computed from existing ones (sum of the three marks columns).
# NOTE: the marks columns are strings, yet the sum is shown as a decimal (285.0) in the
# output — the addition relies on an implicit string-to-number conversion.

schlDF2.withColumn("Total",col("English Marks")+col("Maths Marks")+col("Computers Marks")).show(2)



+------+-----+-----+------+--------+-------------+-----------+---------------+-----+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|Total|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|285.0|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|281.0|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----+
only showing top 2 rows



In [48]:
# schlDF2 is unchanged: the withColumn() result above was displayed but not assigned
schlDF2.show(2)

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|
+------+-----+-----+------+--------+-------------+-----------+---------------+
only showing top 2 rows



In [49]:
# Add multiple columns by chaining withColumn() calls: first the sum of the
# three marks columns, then a constant text column.
total_marks = col("English Marks") + col("Maths Marks") + col("Computers Marks")

(
    schlDF2
    .withColumn("Total", total_marks)
    .withColumn("constantColumn", lit("Contant"))
    .show(2)
)

+------+-----+-----+------+--------+-------------+-----------+---------------+-----+--------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|Total|constantColumn|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----+--------------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|285.0|       Contant|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|281.0|       Contant|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----+--------------+
only showing top 2 rows



In [62]:
# Add multiple columns in a single call with withColumns(), which takes a
# mapping of new column name -> column expression.
D = dict(
    Total=col("English Marks") + col("Maths Marks") + col("Computers Marks"),
    constantColumn=lit("Contant"),
)
schlDF2.withColumns(D).show(2)

+------+-----+-----+------+--------+-------------+-----------+---------------+-----+--------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|Total|constantColumn|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----+--------------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|285.0|       Contant|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|281.0|       Contant|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----+--------------+
only showing top 2 rows



In [63]:
# Keep the result in a new variable: withColumns() returns a new DataFrame,
# while schlDF2 keeps its original eight columns
schlDF3 = schlDF2.withColumns(D)
schlDF3.show(2)

+------+-----+-----+------+--------+-------------+-----------+---------------+-----+--------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|Total|constantColumn|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----+--------------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|285.0|       Contant|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|281.0|       Contant|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----+--------------+
only showing top 2 rows



In [65]:
# Drop unwanted columns: drop() accepts one or several column names.
# The result is only displayed, not assigned, so schlDF3 keeps both columns.

# Dropping a single column:
#schlDF3.drop("constantColumn").show(2)

schlDF3.drop("Total","constantColumn").show(2)


+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|
+------+-----+-----+------+--------+-------------+-----------+---------------+
only showing top 2 rows



In [66]:
# Sort the data by one or more columns; here by Total, highest first


# Ascending variant (the default order when none is specified):
#schlDF3.sort(col("Total")).show() #by default it takes ascending order
schlDF3.sort(col("Total").desc()).show()

# Alternatives: attribute-style column references, and sorting by several columns
#schlDF3.sort(schlDF3.Total.desc()).show()
#schlDF3.sort(schlDF3.Total.asc()).show()
#schlDF3.sort(col("Name").asc(),col("Total").desc()).show()

+------+-----+-----+------+--------+-------------+-----------+---------------+-----+--------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|Total|constantColumn|
+------+-----+-----+------+--------+-------------+-----------+---------------+-----+--------------+
|  10_4| Laki|   10|     F|  912355|           98|         96|             96|290.0|       Contant|
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|285.0|       Contant|
|  10_5|Gakhi|   10|     F|  912356|           95|         95|             95|285.0|       Contant|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|281.0|       Contant|
|   9_2| Jaki|    9|     M|  912350|           92|         82|             94|268.0|       Contant|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|266.0|       Contant|
|   9_3| Maki|    9|     F|  912351|           93|         83|             88|264.0|       Contant|


In [67]:
# Show schlDF2 again: it still has only the original columns (no Total / constantColumn)
schlDF2.show()

+------+-----+-----+------+--------+-------------+-----------+---------------+
|RollNo| Name|Class|Gender|Phone No|English Marks|Maths Marks|Computers Marks|
+------+-----+-----+------+--------+-------------+-----------+---------------+
|   8_1|Nikhi|    8|     M|  912345|           95|        100|             90|
|   8_2| Akhi|    8|     M|  912346|           96|        100|             85|
|   8_3|Sakhi|    8|     F|  912347|           79|        100|             80|
|   8_4|Rakhi|    8|     M|  912348|           66|         95|             98|
|   9_1| Kaki|    9|     F|  912349|           88|         85|             93|
|   9_2| Jaki|    9|     M|  912350|           92|         82|             94|
|   9_3| Maki|    9|     F|  912351|           93|         83|             88|
|  10_1| Paki|   10|     M|  912352|           82|         94|             84|
|  10_2|Bakhi|   10|     M|  912353|           87|         86|             90|
|  10_3| Taki|   10|     F|  912354|           81|  

In [68]:
# Display the unique values of a column with select(...).distinct()

# To get only the number of unique values:
# schlDF2.select("Gender").distinct().count()

schlDF2.select("Gender").distinct().show()

+------+
|Gender|
+------+
|     F|
|     M|
+------+



In [69]:
# Convert the Spark DataFrame to a pandas DataFrame and confirm the resulting type
pandas_df = schlDF2.toPandas()
type(pandas_df)

pandas.core.frame.DataFrame

In [70]:
# (rows, columns) of the pandas DataFrame; matches count() and len(columns) from above
pandas_df.shape

(12, 8)

In [71]:
# First two rows of the pandas DataFrame
pandas_df.head(2)

Unnamed: 0,RollNo,Name,Class,Gender,Phone No,English Marks,Maths Marks,Computers Marks
0,8_1,Nikhi,8,M,912345,95,100,90
1,8_2,Akhi,8,M,912346,96,100,85


In [72]:
# The columns hold strings, so pandas reports count / unique / top / freq
# instead of numeric statistics (see the output)
pandas_df.describe()

Unnamed: 0,RollNo,Name,Class,Gender,Phone No,English Marks,Maths Marks,Computers Marks
count,12,12,12,12,12,12,12,12
unique,12,12,3,2,12,11,9,11
top,10_1,Kaki,10,F,912352,95,100,90
freq,1,1,5,6,1,2,3,2


In [73]:
# Create a DataFrame from in-memory Python tuples, one tuple per row.
# No schema is supplied, so column names and types are inferred.
c1, c2 = ("India", "Asia", 91), ("Nepal", "Asia", 977)  # row 1, row 2

conDF1 = spark.createDataFrame([c1, c2])

In [74]:
# createDataFrame() returns a Spark DataFrame
type(conDF1)

pyspark.sql.dataframe.DataFrame

In [75]:
# Without a schema the columns get generated names (_1, _2, _3)
conDF1.show()

+-----+----+---+
|   _1|  _2| _3|
+-----+----+---+
|India|Asia| 91|
|Nepal|Asia|977|
+-----+----+---+



In [76]:
# Inferred types: the two text columns are 'string', the integer column is 'bigint'
conDF1.dtypes

[('_1', 'string'), ('_2', 'string'), ('_3', 'bigint')]

In [77]:
# Define an explicit schema (column name, type, nullability) to use when
# creating DataFrames, instead of relying on inferred names and types.
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

schema_fields = [
    # nullable=True (the default) means the column may contain empty cells
    StructField("Country Name", StringType(), nullable=True),
    StructField("Continent Name", StringType(), nullable=True),
    StructField("Country Ph Code", IntegerType(), nullable=False),
]
conSchema = StructType(schema_fields)

In [78]:

# Same rows as conDF1, but with the explicit schema: columns now carry the declared names
conDF2 = spark.createDataFrame([c1,c2],schema=conSchema)
conDF2.show()

+------------+--------------+---------------+
|Country Name|Continent Name|Country Ph Code|
+------------+--------------+---------------+
|       India|          Asia|             91|
|       Nepal|          Asia|            977|
+------------+--------------+---------------+



In [79]:
# Types now follow conSchema: the phone code is 'int' rather than the inferred 'bigint'
conDF2.dtypes

[('Country Name', 'string'),
 ('Continent Name', 'string'),
 ('Country Ph Code', 'int')]

In [80]:
# Create a second DataFrame with the same schema, passing the row tuples inline
NewConDF = spark.createDataFrame([("SriLanka","Asia",94),("China","Asia",86)],schema=conSchema)

NewConDF.show()

+------------+--------------+---------------+
|Country Name|Continent Name|Country Ph Code|
+------------+--------------+---------------+
|    SriLanka|          Asia|             94|
|       China|          Asia|             86|
+------------+--------------+---------------+



In [81]:
# Append one DataFrame to another with union(); the result holds the rows of both.
# NOTE(review): per the PySpark docs union() pairs columns by position, not by name —
# fine here because both frames were built with conSchema; confirm before reusing elsewhere.

finalDF = conDF2.union(NewConDF)

finalDF.show()

+------------+--------------+---------------+
|Country Name|Continent Name|Country Ph Code|
+------------+--------------+---------------+
|       India|          Asia|             91|
|       Nepal|          Asia|            977|
|    SriLanka|          Asia|             94|
|       China|          Asia|             86|
+------------+--------------+---------------+



In [82]:
# Fetch all rows of the combined DataFrame as Row objects
finalDF.collect()

[Row(Country Name='India', Continent Name='Asia', Country Ph Code=91),
 Row(Country Name='Nepal', Continent Name='Asia', Country Ph Code=977),
 Row(Country Name='SriLanka', Continent Name='Asia', Country Ph Code=94),
 Row(Country Name='China', Continent Name='Asia', Country Ph Code=86)]

In [83]:
# collect() returns a plain Python list, so ordinary list methods can be used on it
type(finalDF.collect())

list

In [86]:
# Another way to append: collect the rows into a Python list, add one more
# record as a plain dict keyed by column name, and build a new DataFrame
# from the combined list.
bhutan_row = {
    "Country Name": "Bhutan",
    "Continent Name": "Asia",
    "Country Ph Code": 975,
}

L = finalDF.collect()
L.append(bhutan_row)
spark.createDataFrame(L).show()

+------------+--------------+---------------+
|Country Name|Continent Name|Country Ph Code|
+------------+--------------+---------------+
|       India|          Asia|             91|
|       Nepal|          Asia|            977|
|    SriLanka|          Asia|             94|
|       China|          Asia|             86|
|      Bhutan|          Asia|            975|
+------------+--------------+---------------+



In [87]:
# Add two more copies of the Bhutan record so that the list contains duplicate rows.
# NOTE: L is extended in place, so re-running this cell adds two further copies each time.
bhutan_duplicate = {
    "Country Name": "Bhutan",
    "Continent Name": "Asia",
    "Country Ph Code": 975,
}

L.extend(dict(bhutan_duplicate) for _ in range(2))

In [88]:
# Inspect the list: Row objects from collect() followed by the appended dict records
L

[Row(Country Name='India', Continent Name='Asia', Country Ph Code=91),
 Row(Country Name='Nepal', Continent Name='Asia', Country Ph Code=977),
 Row(Country Name='SriLanka', Continent Name='Asia', Country Ph Code=94),
 Row(Country Name='China', Continent Name='Asia', Country Ph Code=86),
 {'Country Name': 'Bhutan', 'Continent Name': 'Asia', 'Country Ph Code': 975},
 {'Country Name': 'Bhutan', 'Continent Name': 'Asia', 'Country Ph Code': 975},
 {'Country Name': 'Bhutan', 'Continent Name': 'Asia', 'Country Ph Code': 975}]

In [89]:
# Build a DataFrame from the list; the duplicate Bhutan rows are all present
spark.createDataFrame(L).show()

+------------+--------------+---------------+
|Country Name|Continent Name|Country Ph Code|
+------------+--------------+---------------+
|       India|          Asia|             91|
|       Nepal|          Asia|            977|
|    SriLanka|          Asia|             94|
|       China|          Asia|             86|
|      Bhutan|          Asia|            975|
|      Bhutan|          Asia|            975|
|      Bhutan|          Asia|            975|
+------------+--------------+---------------+



In [91]:
# Drop duplicate rows. With no argument, rows are compared on all columns;
# a list of column names restricts the comparison to those columns.
# Note that the row order in the output differs from the input order.

spark.createDataFrame(L).dropDuplicates().show()

# Compare on a single column only:
# spark.createDataFrame(L).dropDuplicates(["Country Name"]).show()

# Compare on a subset of columns:
# spark.createDataFrame(L).dropDuplicates(["Country Name","Continent Name"]).show()

+------------+--------------+---------------+
|Country Name|Continent Name|Country Ph Code|
+------------+--------------+---------------+
|       India|          Asia|             91|
|    SriLanka|          Asia|             94|
|       China|          Asia|             86|
|       Nepal|          Asia|            977|
|      Bhutan|          Asia|            975|
+------------+--------------+---------------+

