In [None]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Reuse the running SparkContext when there is one, otherwise start it.
sc = SparkContext.getOrCreate()

# Build (or fetch) the SparkSession that the DataFrame API hangs off.
session_builder = SparkSession.builder.appName("PySpark Dataframe using external and rdd")
spark = session_builder.getOrCreate()

# Small in-memory sample: one (name, age) pair per person.
people_rows = [
    ("Swathi", 22),
    ("Chamm", 21),
    ("Amma", 51),
    ("Ekkoo", 27),
    ("James", 4),
    ("Katherina", 16),
]
people_columns = ["Name", "Age"]

df = spark.createDataFrame(people_rows, people_columns)
df.show()

+---------+---+
|     Name|Age|
+---------+---+
|   Swathi| 22|
|    Chamm| 21|
|     Amma| 51|
|    Ekkoo| 27|
|    James|  4|
|Katherina| 16|
+---------+---+



In [None]:
# Col function and when function: derive a life-stage label from Age
from pyspark.sql.functions import col, when

age = col("Age")

# The branches are evaluated in order; between() is inclusive on both ends.
# Rows matching neither condition (including a null Age) fall through to
# otherwise() and are labelled "Adult".
stage_by_age = (
    when(age < 13, "Child")
    .when(age.between(13, 19), "Teenager")
    .otherwise("Adult")
)

df1 = df.withColumn("Stage_by_age", stage_by_age)
df1.show()

+---------+---+------------+
|     Name|Age|Stage_by_age|
+---------+---+------------+
|   Swathi| 22|       Adult|
|    Chamm| 21|       Adult|
|     Amma| 51|       Adult|
|    Ekkoo| 27|       Adult|
|    James|  4|       Child|
|Katherina| 16|    Teenager|
+---------+---+------------+



In [None]:
# Filtering Data: keep only the rows labelled as one of the non-adult stages
non_adult_stages = ["Teenager", "Child"]
is_non_adult = df1["Stage_by_age"].isin(non_adult_stages)
df1.filter(is_non_adult).show()

+---------+---+------------+
|     Name|Age|Stage_by_age|
+---------+---+------------+
|    James|  4|       Child|
|Katherina| 16|    Teenager|
+---------+---+------------+



In [None]:
# Group By Function: average of every numeric column within each life stage
stage_groups = df1.groupBy("Stage_by_age")
stage_averages = stage_groups.avg()
stage_averages.show()

+------------+--------+
|Stage_by_age|avg(Age)|
+------------+--------+
|       Adult|   30.25|
|    Teenager|    16.0|
|       Child|     4.0|
+------------+--------+



In [None]:
# Import the functions module under an alias instead of pulling in `round`
# by name: a bare `from ... import round` would rebind Python's built-in
# round() for every cell that runs afterwards.
from pyspark.sql import functions as F

# Finding the average of every numeric column per life-stage group
avg_df = df1.groupBy("Stage_by_age").avg()

# Rounding Off: round each aggregated column and store it as an integer,
# leaving the grouping key untouched.
for column in avg_df.columns:
  if column != "Stage_by_age":
    avg_df = avg_df.withColumn(column, F.round(F.col(column)).cast("int"))

avg_df.show()

+------------+--------+
|Stage_by_age|avg(Age)|
+------------+--------+
|       Adult|      30|
|    Teenager|      16|
|       Child|       4|
+------------+--------+



In [None]:
# Reading from csv file with the reader defaults (no header handling, no
# schema inference)
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Practice for importing data").getOrCreate()

marks_csv_path = "/content/drive/MyDrive/Marks_data.csv"
mark_df = spark.read.format("csv").load(marks_csv_path)

print(mark_df)          # one-line summary of column names and types
mark_df.show()          # the file's header line shows up as an ordinary data row
mark_df.printSchema()   # columns get generated names (_c0, _c1, ...) and are all strings
print(type(mark_df))

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string]
+----+--------+--------+---+
| _c0|     _c1|     _c2|_c3|
+----+--------+--------+---+
|Name|M1 Score|M2 Score|age|
|Alex|      62|      80| 20|
|Brad|      45|      56| 19|
|Joey|      85|      98| 21|
|NULL|      54|      79| 20|
|abhi|    NULL|    NULL| 20|
+----+--------+--------+---+

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)

<class 'pyspark.sql.dataframe.DataFrame'>


In [None]:
# Reading data: use the first line of the file as column names and let Spark
# infer each column's type from the values
mark_df1 = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/Marks_data.csv")
)
mark_df1.show()

+----+--------+--------+---+
|Name|M1 Score|M2 Score|age|
+----+--------+--------+---+
|Alex|      62|      80| 20|
|Brad|      45|      56| 19|
|Joey|      85|      98| 21|
|NULL|      54|      79| 20|
|abhi|    NULL|    NULL| 20|
+----+--------+--------+---+



In [None]:
# Filtering Function: keep the rows whose M1 score is above 60
# (a row with a NULL M1 score does not satisfy the comparison and is dropped)
m1_above_60 = mark_df1["M1 Score"] > 60
mark_df1.where(m1_above_60).show()

+----+--------+--------+---+
|Name|M1 Score|M2 Score|age|
+----+--------+--------+---+
|Alex|      62|      80| 20|
|Joey|      85|      98| 21|
+----+--------+--------+---+



In [None]:
# Average Function: mean of each score column, reported as a single row
from pyspark.sql.functions import avg

avg_m1 = avg("M1 Score").alias("AVG_M1")
avg_m2 = avg("M2 Score").alias("AVG_M2")
mark_df1.select(avg_m1, avg_m2).show()

+------+------+
|AVG_M1|AVG_M2|
+------+------+
|  61.5| 78.25|
+------+------+



In [None]:
# Fetching the first row of the data: head() with no argument returns a
# single Row object (not a list of rows and not the column names)
mark_df1.head()

Row(Name='Alex', M1 Score=62, M2 Score=80, age=20)

In [None]:
# Listing the column names of the data as a plain Python list of strings
mark_df1.columns

['Name', 'M1 Score', 'M2 Score', 'age']

In [None]:
# Text Dataframe: each line of the file becomes one row in a single
# string column named "value"
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Practice for importing text").getOrCreate()

text_path = "/content/drive/MyDrive/practice.txt"
text_df = spark.read.format("text").load(text_path)

print(type(text_df))
text_df.show(truncate=False)   # truncate=False prints the full line instead of clipping it
text_df.printSchema()

<class 'pyspark.sql.dataframe.DataFrame'>
+----------------------------------------------------------------+
|value                                                           |
+----------------------------------------------------------------+
|Hey! I am Swathi Baskaran.                                      |
|I am currently part of a Data Engineering training at Hexaware. |
+----------------------------------------------------------------+

root
 |-- value: string (nullable = true)



In [None]:
# Json dataframe: load the JSON records and let Spark infer the schema
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Practice for importing json").getOrCreate()

json_path = "/content/drive/MyDrive/practice.json"
json_df = spark.read.format("json").load(json_path)

print(type(json_df))
json_df.show(truncate=False)   # truncate=False prints full cell values
json_df.printSchema()

<class 'pyspark.sql.dataframe.DataFrame'>
+---+--------------+-------------+
|id |product_name  |product_price|
+---+--------------+-------------+
|1  |Pencil        |5            |
|2  |Ball Pen      |10           |
|3  |Eraser        |5            |
|4  |Parker Ink Pen|50           |
+---+--------------+-------------+

root
 |-- id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_price: long (nullable = true)



In [None]:
# Converting to Pandas
files = [
    '/content/drive/MyDrive/salary.csv',
    '/content/drive/MyDrive/orders (1).csv',
]

# NOTE(review): both files are loaded into ONE DataFrame; this presumably
# relies on them sharing the same columns — confirm against the actual files.
csv_reader = spark.read.options(sep=',', inferSchema=True, header=True)
df = csv_reader.csv(files)

# toPandas() collects the whole DataFrame onto the driver as a pandas frame.
pandasdf = df.toPandas()

print(type(pandasdf))
print(type(df))
print(pandasdf.head())
df.show(5)

<class 'pandas.core.frame.DataFrame'>
<class 'pyspark.sql.dataframe.DataFrame'>
  cust_id cust_fname cust_lname cust_order cust_status
0       1       john        doe          5      active
1       2       jane      smith          8      active
2       3    micheal    jhonson          3    inactive
3       4       abhi    wiliams          1      active
4       5        ram      brown          4    inactive
+-------+----------+----------+----------+-----------+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|
+-------+----------+----------+----------+-----------+
|      1|      john|       doe|         5|     active|
|      2|      jane|     smith|         8|     active|
|      3|   micheal|   jhonson|         3|   inactive|
|      4|      abhi|   wiliams|         1|     active|
|      5|       ram|     brown|         4|   inactive|
+-------+----------+----------+----------+-----------+
only showing top 5 rows



In [None]:
# Using Lit: attach a constant-valued column to every row
import pyspark
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('sparkdf').getOrCreate()

columns = ['ID', 'NAME', 'Company']
data = [
    ["1", "sravan", "company 1"],
    ["2", "ojaswi", "company 1"],
    ["3", "rohith", "company 2"],
    ["4", "sridevi", "company 1"],
    ["5", "bobby", "company 1"],
]
df = spark.createDataFrame(data, schema=columns)

# lit() wraps a Python value as a column expression; the result is only
# displayed here, `df` itself keeps its three original columns.
constant_salary = lit(30000).alias("salary")
df.select("*", constant_salary).show()

+---+-------+---------+------+
| ID|   NAME|  Company|salary|
+---+-------+---------+------+
|  1| sravan|company 1| 30000|
|  2| ojaswi|company 1| 30000|
|  3| rohith|company 2| 30000|
|  4|sridevi|company 1| 30000|
|  5|  bobby|company 1| 30000|
+---+-------+---------+------+



In [None]:
# Using Concat: glue NAME and Company together with a " - " separator
from pyspark.sql.functions import concat

separator = lit(" - ")
name_company = concat(df["NAME"], separator, df["Company"])
df.withColumn("Name_Company", name_company).show()

+---+-------+---------+-------------------+
| ID|   NAME|  Company|       Name_Company|
+---+-------+---------+-------------------+
|  1| sravan|company 1| sravan - company 1|
|  2| ojaswi|company 1| ojaswi - company 1|
|  3| rohith|company 2| rohith - company 2|
|  4|sridevi|company 1|sridevi - company 1|
|  5|  bobby|company 1|  bobby - company 1|
+---+-------+---------+-------------------+



In [None]:
# Finding the count: distribute a Python list as an RDD and count its elements
import pyspark
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

numbers = [2, 3, 4, 5, 6, 7, 8, 9]
count_rdd = sc.parallelize(numbers)

element_count = count_rdd.count()
print(element_count)

8


In [None]:
# Finding the first element of the RDD
first_element = count_rdd.first()
print(first_element)

2


In [None]:
# Taking a certain number of elements from the front of the RDD
leading_elements = count_rdd.take(4)
print(leading_elements)

[2, 3, 4, 5]


In [None]:
# Reduce - collapses the RDD to a single value by repeatedly combining two
# elements with the supplied function (here: multiplying them)
def multiply(left, right):
    """Return the product of the two operands."""
    return left * right

reduce_rdd = sc.parallelize([1, 2, 6, 8, 3])
product = reduce_rdd.reduce(multiply)
print(product)

288


In [None]:
# Saving as Text file
# saveAsTextFile() writes a *directory* with the given name holding one
# part-file per partition, and it raises if that path already exists.
# Remove the output left by a previous run so the cell can be re-executed.
# NOTE(review): the cleanup assumes the relative path resolves on the local
# filesystem (local-mode Spark, e.g. Colab) — confirm before pointing this at
# HDFS or object storage.
import os
import shutil

output_dir = "numbers.txt"
if os.path.isdir(output_dir):
    shutil.rmtree(output_dir)

count_rdd.saveAsTextFile(output_dir)