In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [6]:
# Start (or reuse) the Spark session that every DataFrame/RDD cell below relies on.
spark = (
    SparkSession.builder
    .appName("Distinct")
    .getOrCreate()
)

In [22]:
# SparkContext behind the session; the RDD cells further down call sc.parallelize.
sc = spark.sparkContext

In [9]:
# Sample employee rows as (employee_name, department, salary).
# The James row is intentionally repeated so the de-duplication cells have
# something to remove.
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("James", "Sales", 3000),
]

In [10]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = [
    "employee_name",
    "department",
    "salary",
]

In [11]:
# Build the employee DataFrame; column types are inferred from the tuples.
df = spark.createDataFrame(data=data, schema=columns)

In [12]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)



In [13]:
# New DataFrame with fully identical rows collapsed; `df` itself is not modified.
distinct_df = df.distinct()

In [14]:
# Display the de-duplicated rows (the repeated James row appears once).
distinct_df.show()

                                                                                

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
+-------------+----------+------+



In [15]:
# Expose the original (non-deduplicated) DataFrame to spark.sql as the view "Employee".
df.createOrReplaceTempView("Employee")

In [16]:
# Query the temp view through SQL; all four rows come back, duplicate included.
spark.sql(""" select * from Employee """).show()

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        James|     Sales|  3000|
+-------------+----------+------+



In [34]:
spark.sql(""" select  employee_name,max(salary) as average  from Employee group by employee_name order by employee_name asc """).show()

+-------------+-------+
|employee_name|average|
+-------------+-------+
|        James|   3000|
|      Michael|   4600|
|       Robert|   4100|
+-------------+-------+



In [20]:
# show() only prints and returns None, so bind the de-duplicated DataFrame to
# dropdf first and display it in a separate call; otherwise dropdf would be None.
dropdf = df.dropDuplicates()
dropdf.show()

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
+-------------+----------+------+



In [21]:
# Bare expression: the notebook renders the SparkSession object as the cell output.
spark

In [23]:
# Distribute a local list as an RDD. The result is not bound to a name, so the
# cell only shows the RDD's repr; no elements are computed or returned here.
sc.parallelize([12,13,14,15])

ParallelCollectionRDD[27] at readRDDFromFile at PythonRDD.scala:289

In [24]:
# Same RDD as in the previous cell, now kept in `rdd` for the actions below.
rdd = sc.parallelize([12,13,14,15])

In [25]:
# Action: bring every element back to the driver as a Python list.
rdd.collect()

[12, 13, 14, 15]

In [26]:
# Number of partitions backing `rdd`. parallelize() was called without an
# explicit partition count, so this reflects the default Spark chose.
rdd.getNumPartitions()

12

In [27]:
# Action: return only the first element of the RDD.
rdd.first()

12

In [28]:
# Action: return the first two elements as a list.
rdd.take(2)

[12, 13]

In [40]:
# Count how often each value occurs, using an RDD split into 2 partitions.
# countByValue() is an action that returns a dict-like result to the driver,
# so rdd_1 ends up holding a dict_items view of (value, count) pairs -- it is
# not an RDD, despite its name.
rdd_1 = (
    sc.parallelize([1, 2, 1, 2, 2], 2)
    .countByValue()
    .items()
)

In [41]:
# Show the (value, count) pairs; this is a local dict_items view, not an RDD.
rdd_1

dict_items([(1, 2), (2, 3)])

In [43]:
# Each item is a (value, occurrence count) pair: `key` is the element that was
# counted and `value` is how many times it appeared.
for key,value in rdd_1:
    print(f"key {key} value {value}")

key 1 value 2
key 2 value 3


In [45]:
# The original DataFrame is unchanged by the earlier distinct()/dropDuplicates()
# calls: all four rows, including the duplicate, are still present.
df.show()

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        James|     Sales|  3000|
+-------------+----------+------+



In [46]:
# (fruit, count) pairs for the key/value RDD examples.
# NOTE: this rebinds `data`, replacing the employee rows defined earlier in the
# notebook; re-running the employee createDataFrame cell after this one would
# pick up the fruit data instead.
data = [
    ("Apple", 1),
    ("Apple", 2),
    ("Banana", 1),
    ("Banana", 2),
]

In [60]:
# Build a two-column DataFrame and keep only its underlying RDD of Row objects.
# Despite the _df suffix, test_df is an RDD, which is why the pair-RDD methods
# (reduceByKey / groupByKey) are available on it below.
test_df = (
    spark.createDataFrame(data=data, schema=["Fruits", "count"])
    .rdd
)

In [61]:
# Bring the RDD's contents to the driver: one Row(Fruits, count) per input tuple.
test_df.collect()

[Row(Fruits='Apple', count=1),
 Row(Fruits='Apple', count=2),
 Row(Fruits='Banana', count=1),
 Row(Fruits='Banana', count=2)]

In [57]:
# Sum the counts per fruit. Each two-field Row acts as a (key, value) pair, so
# the reducer only ever sees the count values for a single fruit.
test_df.reduceByKey(lambda total, count: total + count).collect()

                                                                                

[('Apple', 3), ('Banana', 3)]

In [63]:
# groupByKey() yields a ResultIterable per key, whose repr is only a memory
# address. Materialise each group (sorted, so the display is stable across
# runs) to see the grouped counts themselves.
test_df.groupByKey().mapValues(sorted).collect()

[('Apple', [1, 2]), ('Banana', [1, 2])]