#Apply Transformations & actions in pyspark

In [5]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName('Coding_Challenge').getOrCreate()
df=spark.read.csv("/content/LoanData (1).csv",header=True,inferSchema=True)
df.show()
df.count()

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|LP001002|  Male|     No|         0|    Graduate|           No|           5849|              0.0|      NULL|             360|             1|        Urban|          Y|
|LP001003|  Male|    Yes|         1|    Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N|
|LP001005|  Male|    Yes|         0|    Graduate|          Yes|           3000|              0.0|        66|             360|             1|        Urban|          Y

614

#Transformations

##Filter

In [41]:
from pyspark.sql.functions import col
df.filter(col("Self_Employed")=="Yes").show()

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|LP001005|  Male|    Yes|         0|    Graduate|          Yes|           3000|              0.0|        66|             360|             1|        Urban|          Y|
|LP001011|  Male|    Yes|         2|    Graduate|          Yes|           5417|           4196.0|       267|             360|             1|        Urban|          Y|
|LP001066|  Male|    Yes|         0|    Graduate|          Yes|           9560|              0.0|       191|             360|             1|    Semiurban|          Y

##Join

In [10]:
ProductDf=spark.read.csv("/content/products.csv",header=True,inferSchema=True)
salesDf=spark.read.csv("/content/sales.csv",header=True,inferSchema=True)
ProductDf.show()
salesDf.show()
ProductDf.join(salesDf,ProductDf.product_id==salesDf.product_id).show()

+----------+------------+-----------+
|product_id|product_name|   category|
+----------+------------+-----------+
|       101|      Laptop|Electronics|
|       102|    Keyboard|Electronics|
|       103|       Mouse|Electronics|
|       104|        Desk|  Furniture|
|       105|       Chair|  Furniture|
|       106|     Monitor|Electronics|
|       107|    Notebook| Stationery|
+----------+------------+-----------+

+-------+----------+--------+----------+
|sale_id|product_id|quantity| sale_date|
+-------+----------+--------+----------+
|      1|       101|       2|2023-01-01|
|      2|       102|       1|2023-01-02|
|      3|       103|       4|2023-01-03|
|      4|       108|       1|2023-01-04|
|      5|       109|       2|2023-01-05|
|      6|       101|       1|2023-01-06|
|      7|       104|       3|2023-01-07|
+-------+----------+--------+----------+

+----------+------------+-----------+-------+----------+--------+----------+
|product_id|product_name|   category|sale_id|product

##Simple Aggregations

In [16]:
from pyspark.sql.functions import sum,max,min,avg
df.agg(sum("ApplicantIncome").alias ("Total Income")).show()
df.agg(max("ApplicantIncome").alias ("Max Income")).show()
df.agg(min("ApplicantIncome").alias ("Min Income")).show()
df.agg(avg("ApplicantIncome").alias ("Avg Income")).show()

+------------+
|Total Income|
+------------+
|     3317724|
+------------+

+----------+
|Max Income|
+----------+
|     81000|
+----------+

+----------+
|Min Income|
+----------+
|       150|
+----------+

+-----------------+
|       Avg Income|
+-----------------+
|5403.459283387622|
+-----------------+



##GroupBy

In [17]:
df.groupBy("Education").agg(sum("ApplicantIncome").alias ("Total Income")).show()

+------------+------------+
|   Education|Total Income|
+------------+------------+
|Not Graduate|      506156|
|    Graduate|     2811568|
+------------+------------+



##Window Function

In [24]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
windowSpec = Window.partitionBy("Property_Area").orderBy(col("ApplicantIncome").desc())
df_ranked = df.withColumn("Income_Rank", rank().over(windowSpec))
df_ranked.select("Loan_ID", "Property_Area", "ApplicantIncome", "Income_Rank").show()

+--------+-------------+---------------+-----------+
| Loan_ID|Property_Area|ApplicantIncome|Income_Rank|
+--------+-------------+---------------+-----------+
|LP002317|        Rural|          81000|          1|
|LP001448|        Rural|          23803|          2|
|LP001922|        Rural|          20667|          3|
|LP001996|        Rural|          20233|          4|
|LP002191|        Rural|          19730|          5|
|LP002699|        Rural|          17500|          6|
|LP002527|        Rural|          16525|          7|
|LP002065|        Rural|          15000|          8|
|LP001859|        Rural|          14683|          9|
|LP001401|        Rural|          14583|         10|
|LP001100|        Rural|          12500|         11|
|LP001519|        Rural|          10000|         12|
|LP002050|        Rural|          10000|         12|
|LP002945|        Rural|           9963|         14|
|LP001935|        Rural|           9508|         15|
|LP002262|        Rural|           9504|      

##sortBy()

In [28]:
rdd=df.rdd
sorted_rdd = rdd.sortBy(lambda row: row["ApplicantIncome"], ascending=False)
for row in sorted_rdd.take(5):
    print(row["Loan_ID"], row["ApplicantIncome"])


LP002317 81000
LP002101 63337
LP001585 51763
LP001536 39999
LP001640 39147


#Actions

##collect()



In [30]:
print(df.collect())

[Row(Loan_ID='LP001002', Gender='Male', Married='No', Dependents='0', Education='Graduate', Self_Employed='No', ApplicantIncome=5849, CoapplicantIncome=0.0, LoanAmount=None, Loan_Amount_Term=360, Credit_History=1, Property_Area='Urban', Loan_Status='Y'), Row(Loan_ID='LP001003', Gender='Male', Married='Yes', Dependents='1', Education='Graduate', Self_Employed='No', ApplicantIncome=4583, CoapplicantIncome=1508.0, LoanAmount=128, Loan_Amount_Term=360, Credit_History=1, Property_Area='Rural', Loan_Status='N'), Row(Loan_ID='LP001005', Gender='Male', Married='Yes', Dependents='0', Education='Graduate', Self_Employed='Yes', ApplicantIncome=3000, CoapplicantIncome=0.0, LoanAmount=66, Loan_Amount_Term=360, Credit_History=1, Property_Area='Urban', Loan_Status='Y'), Row(Loan_ID='LP001006', Gender='Male', Married='Yes', Dependents='0', Education='Not Graduate', Self_Employed='No', ApplicantIncome=2583, CoapplicantIncome=2358.0, LoanAmount=120, Loan_Amount_Term=360, Credit_History=1, Property_Area=

##count()

In [31]:
df.count()

614

##first()

In [35]:
df.first()

Row(Loan_ID='LP001002', Gender='Male', Married='No', Dependents='0', Education='Graduate', Self_Employed='No', ApplicantIncome=5849, CoapplicantIncome=0.0, LoanAmount=None, Loan_Amount_Term=360, Credit_History=1, Property_Area='Urban', Loan_Status='Y')

##saveAsTextFile()

In [39]:
rdd.saveAsTextFile('file.txt')