In [0]:
# Build a small in-memory DataFrame from literal rows and an explicit schema.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# One tuple per row, in schema order:
# (firstname, middlename, lastname, id, gender, salary).
data = [
    ("Harry", "", "Potter", "36636", "M", 30000),
    ("Michael", "Rose", "", "36650", "M", 40000),
    ("Rob", "", "Williams", "42101", "M", 40000),
    ("Marianna", "Anne", "Jones", "39192", "F", 40000),
    ("Jennifer", "Mary", "Harrison", "", "F", -1),
]

# Every column is declared nullable. All columns are strings (including
# `id`) except `salary`, which is an integer.
schema = StructType(
    [
        StructField("firstname", StringType(), nullable=True),
        StructField("middlename", StringType(), nullable=True),
        StructField("lastname", StringType(), nullable=True),
        StructField("id", StringType(), nullable=True),
        StructField("gender", StringType(), nullable=True),
        StructField("salary", IntegerType(), nullable=True),
    ]
)

df = spark.createDataFrame(data, schema)
# Uncomment to inspect the resulting schema:
# df.printSchema()
df.show(truncate=False)


In [0]:
# Load a DataFrame from the registered table "online_retail_csv".
# Dataset source: https://archive.ics.uci.edu/ml/datasets/online+retail

df2 = spark.read.table(tableName="online_retail_csv")

In [0]:
# Peek at the first 10 rows; the value is the cell's displayed output.
df2.head(10)

In [0]:
# Project two columns and print the first 20 rows without truncating values.
df2.select("StockCode", "Description").show(n=20, truncate=False)

In [0]:
# Select every column via the "*" wildcard and print the first 20 rows
# without truncating values.
df2.select("*").show(n=20, truncate=False)

In [0]:
# Total number of rows in df2, duplicates included. This is NOT a distinct
# count (df2.count() counts every row), so it is labelled accordingly; the
# distinct count is printed separately from df2.distinct().
print("Total row count: "+str(df2.count()))

In [0]:
# Remove fully duplicated rows with distinct(), then report how many rows
# remain and print the first 20 of them untruncated.
distinctDF = df2.distinct()
print("Distinct count: {}".format(distinctDF.count()))
distinctDF.show(n=20, truncate=False)


In [0]:
# Same de-duplication as above, this time via dropDuplicates() with no
# column subset (all columns are compared).
df3 = df2.dropDuplicates()
print("Distinct count: {}".format(df3.count()))
df3.show(n=20, truncate=False)

In [0]:
# Keep only rows whose Country equals "United Kingdom" (equality filter).
df3_filter = df3.filter(df3["Country"] == "United Kingdom")


In [0]:
from pyspark.sql.functions import countDistinct

# Count distinct rows (across all columns) among the filtered rows.
# NOTE(review): Spark's count(DISTINCT ...) only counts rows where every
# counted column is non-null, so this may be lower than df3_filter.count()
# if any column contains NULLs -- confirm that is the intended number.
distinct_all_columns = countDistinct("*")
df4 = df3_filter.select(distinct_all_columns)
df4.show()

In [0]:
# Row count per Country; print up to 100 groups without truncating values.
(
    df3.groupBy("Country")
    .count()
    .show(n=100, truncate=False)
)

In [0]:
# NOTE: this import rebinds the Python built-in names `sum` and `max` to the
# Spark column functions from this cell onward.
from pyspark.sql.functions import col, sum, avg, max

# Per-Country aggregates: total and maximum Quantity, total and mean UnitPrice.
(
    df3.groupBy("Country")
    .agg(
        sum("Quantity").alias("sum_Quantity"),
        max("Quantity").alias("max_Quantity"),
        sum("UnitPrice").alias("sum_UnitPrice"),
        avg("UnitPrice").alias("avg_UnitPrice"),
    )
    .show(truncate=False)
)

In [0]:
from pyspark.sql.functions import col, asc, desc

# Sort by Country, then Quantity (both in the default ascending order) and
# print the first 20 rows untruncated.
(
    df3.sort("Country", "Quantity")
    .show(n=20, truncate=False)
)

In [0]:
# Order by Country descending, breaking ties by Quantity ascending, and
# print the first 20 rows untruncated.
(
    df3.orderBy(df3["Country"].desc(), df3["Quantity"].asc())
    .show(n=20, truncate=False)
)