In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook-wide SparkSession under the app name "MySparkApp".
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .getOrCreate()
)

In [None]:
# content = spark.read.text('words.txt')

# Column names first, then the (id, name) records they describe.
columns = ['id', 'name']
data = (
    (1, 'Udit'),
    (2, 'Ekanth'),
    (3, 'Raghu'),
)

# Only the column names are supplied; the column types come from the values.
df = spark.createDataFrame(data, schema=columns)

df.show()
df.printSchema()

+---+------+
| id|  name|
+---+------+
|  1|  Udit|
|  2|Ekanth|
|  3| Raghu|
+---+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [None]:
!pip install pyspark



In [None]:
from pyspark import SparkContext

# Build an RDD of (id, name) tuples, then turn it into a DataFrame by naming
# its columns with toDF().
data1 = ((1,'Udit'),(2,'Ekanth'),(3,'Raghu'))
rdd = spark.sparkContext.parallelize(data1)
columns1 = ['id','name']
print(type(rdd))
df2 = rdd.toDF(columns1)
print(type(df2))
# Show the DataFrame built from the RDD in this cell (df2), not the earlier `df`.
df2.show()



<class 'pyspark.rdd.RDD'>
<class 'pyspark.sql.dataframe.DataFrame'>
+---+------+
| id|  name|
+---+------+
|  1|  Udit|
|  2|Ekanth|
|  3| Raghu|
+---+------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Spark session
# spark = SparkSession.builder.appName("rdd-to-df").getOrCreate()
# sc = spark.sparkContext

In [None]:
# Sample RDD.
# Uses the session's SparkContext directly: the only visible assignment of `sc`
# is commented out, so referring to `sc` here fails on a fresh kernel.
rdd = spark.sparkContext.parallelize([
    (1, "Alice"),
    (2, "Bob"),
    (3, "Charlie")
])

# Define schema: a non-nullable integer id and a nullable string name.
schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), True)
])

In [None]:
# Convert RDD to DataFrame, applying the explicit schema defined above
# rather than letting Spark infer the column types.
df = spark.createDataFrame(rdd, schema=schema)

df.show()
df.printSchema()

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)



### Task 2: Perform aggregation

In [None]:
# Task 2

# (name, amount) pairs; some names appear more than once.
data = [
    ("charan", 1000),
    ("bharat", 1000),
    ("charan", 500),
    ("bharat", 2000),
    ("hari", 9000),
    ("vikram", 1500)
]

# Use the session's SparkContext directly: the only visible assignment of `sc`
# is commented out, so referring to `sc` here fails on a fresh kernel.
rdd = spark.sparkContext.parallelize(data)

# Sum the amounts for each name, then collect the totals back to the driver.
result = rdd.reduceByKey(lambda a, b: a + b).collect()

print(result)

[('charan', 1500), ('bharat', 3000), ('hari', 9000), ('vikram', 1500)]


### Read Files

In [None]:
# Load the Parquet file through the generic reader entry point.
# Shortcut form: par_df = spark.read.parquet('data.parquet')
par_df = spark.read.load('data.parquet', format='parquet')
par_df.show()
par_df.printSchema()

+--------+----------+--------+----------------+
|      id|     tdate|category|         product|
+--------+----------+--------+----------------+
|00000000|06-26-2011|Exercise|Gymnastics Rings|
|00000002|06-01-2011|Exercise|Gymnastics Rings|
+--------+----------+--------+----------------+

root
 |-- id: string (nullable = true)
 |-- tdate: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)



In [None]:
# Load the ORC file through the generic reader entry point.
# Shortcut form: orc_df = spark.read.orc('data.orc')
orc_df = spark.read.load('data.orc', format='orc')
orc_df.show()
orc_df.printSchema()

+----------+---------+--------------------+------------------+-----------+----------+-----+-----+---+------------+------------+--------------------+--------------------+
|first_name|last_name|        company_name|           address|       city|    county|state|  zip|age|      phone1|      phone2|               email|                 web|
+----------+---------+--------------------+------------------+-----------+----------+-----+-----+---+------------+------------+--------------------+--------------------+
|   Solange|   Shinko|   Mosocco, Ronald A|       426 Wolf St|   Metairie| Jefferson|   LA|70002| 21|504-979-9175|504-265-8174|  solange@shinko.com|http://www.mosocc...|
|    Arlene|  Klusman|Beck Horizon Buil...|        3 Secor Rd|New Orleans|   Orleans|   LA|70112| 20|504-710-5840|504-946-1807|arlene_klusman@gm...|http://www.beckho...|
|     Larae|   Gudroe|Lehigh Furn Divsn...| 6651 Municipal Rd|      Houma|Terrebonne|   LA|70360| 33|985-890-7262|985-261-5783|larae_gudroe@gmai...|ht

In [None]:
# NOTE: importing min/max by name rebinds Python's built-in min/max in this notebook.
from pyspark.sql.functions import min, max, avg, col

orc_df = spark.read.orc('data.orc')
# orc_df = spark.read.format('orc').load('data.orc')

# Cast age to an integer so the aggregates below run on numeric values.
# NOTE(review): the raw column type is not visible here; the cast suggests it
# is not already an integer - confirm against the ORC schema.
df1 = orc_df.withColumn('age', col('age').cast(IntegerType()))
df1.printSchema()

# Aggregate over the cast DataFrame (df1). Aggregating orc_df instead would
# ignore the cast performed above.
df = df1.agg(min("age").alias('min_age'),
             max("age").alias('max_age'),
             avg('age').alias('avg_age'))

df.show()

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- phone1: string (nullable = true)
 |-- phone2: string (nullable = true)
 |-- email: string (nullable = true)
 |-- web: string (nullable = true)

+-------+-------+-------+
|min_age|max_age|avg_age|
+-------+-------+-------+
|     11|     33| 21.375|
+-------+-------+-------+



In [None]:


# Reading Avro requires the external spark-avro module to be deployed with the
# application; without it this load raises
# "AnalysisException: Failed to find data source: avro".
avro_df = spark.read.load("data.avro", format="avro")

avro_df.show()

AnalysisException: Failed to find data source: avro. Avro is built-in but external data source module since Spark 2.4. Please deploy the application as per the deployment section of Apache Avro Data Source Guide.

In [None]:
# Display the ORC DataFrame again as a reference point before new columns are derived from it.
orc_df.show()

+----------+---------+--------------------+------------------+-----------+----------+-----+-----+---+------------+------------+--------------------+--------------------+
|first_name|last_name|        company_name|           address|       city|    county|state|  zip|age|      phone1|      phone2|               email|                 web|
+----------+---------+--------------------+------------------+-----------+----------+-----+-----+---+------------+------------+--------------------+--------------------+
|   Solange|   Shinko|   Mosocco, Ronald A|       426 Wolf St|   Metairie| Jefferson|   LA|70002| 21|504-979-9175|504-265-8174|  solange@shinko.com|http://www.mosocc...|
|    Arlene|  Klusman|Beck Horizon Buil...|        3 Secor Rd|New Orleans|   Orleans|   LA|70112| 20|504-710-5840|504-946-1807|arlene_klusman@gm...|http://www.beckho...|
|     Larae|   Gudroe|Lehigh Furn Divsn...| 6651 Municipal Rd|      Houma|Terrebonne|   LA|70360| 33|985-890-7262|985-261-5783|larae_gudroe@gmai...|ht

In [None]:
from pyspark.sql.functions import *

In [None]:
# SQL CASE expression that labels each row by age: up to 17 -> 'minor',
# 18 through 60 -> 'major', anything else -> 'sr.citizen'.
status_case = """ case
                                              when age <= 17 then 'minor'
                                              when age >= 18 and age <= 60 then 'major'
                                              else 'sr.citizen'
                                             end """

# Attach the label as a 'status' column (orc_df is rebound to the widened frame).
orc_df = orc_df.withColumn('status', expr(status_case))
orc_df.show()

+----------+---------+--------------------+------------------+-----------+----------+-----+-----+---+------------+------------+--------------------+--------------------+------+
|first_name|last_name|        company_name|           address|       city|    county|state|  zip|age|      phone1|      phone2|               email|                 web|status|
+----------+---------+--------------------+------------------+-----------+----------+-----+-----+---+------------+------------+--------------------+--------------------+------+
|   Solange|   Shinko|   Mosocco, Ronald A|       426 Wolf St|   Metairie| Jefferson|   LA|70002| 21|504-979-9175|504-265-8174|  solange@shinko.com|http://www.mosocc...| major|
|    Arlene|  Klusman|Beck Horizon Buil...|        3 Secor Rd|New Orleans|   Orleans|   LA|70112| 20|504-710-5840|504-946-1807|arlene_klusman@gm...|http://www.beckho...| major|
|     Larae|   Gudroe|Lehigh Furn Divsn...| 6651 Municipal Rd|      Houma|Terrebonne|   LA|70360| 33|985-890-7262|9

In [None]:
# Label rows by age with chained when/otherwise on the Column API.
age_col = orc_df['age']
results = orc_df.withColumn(
    'new_status',
    when(age_col <= 17, 'minor')
    .when((age_col >= 18) & (age_col <= 60), 'major')
    .otherwise('Sr.Citizen'),
)

results.show()

+----------+---------+--------------------+------------------+-----------+----------+-----+-----+---+------------+------------+--------------------+--------------------+------+----------+
|first_name|last_name|        company_name|           address|       city|    county|state|  zip|age|      phone1|      phone2|               email|                 web|status|new_status|
+----------+---------+--------------------+------------------+-----------+----------+-----+-----+---+------------+------------+--------------------+--------------------+------+----------+
|   Solange|   Shinko|   Mosocco, Ronald A|       426 Wolf St|   Metairie| Jefferson|   LA|70002| 21|504-979-9175|504-265-8174|  solange@shinko.com|http://www.mosocc...| major|     major|
|    Arlene|  Klusman|Beck Horizon Buil...|        3 Secor Rd|New Orleans|   Orleans|   LA|70112| 20|504-710-5840|504-946-1807|arlene_klusman@gm...|http://www.beckho...| major|     major|
|     Larae|   Gudroe|Lehigh Furn Divsn...| 6651 Municipal R

In [None]:
# TODO Scenario 3: find the customers who have placed at least 3 orders.
# TODO Scenario 4: produce the 6 matches in the output without repeating any match.