In [1]:
# Bootstrap PySpark for this notebook and obtain the SparkSession used below.
# findspark.init() is called before `import pyspark` on purpose: it locates the
# local Spark installation and makes the pyspark package importable.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession, Row
import sys
from operator import add
# getOrCreate() returns the currently active session if there is one and
# builds a new one otherwise; later cells use `spark` to create DataFrames.
spark = SparkSession.builder.getOrCreate()

#import pandas as pd


In [2]:


# Build a small in-memory DataFrame: one (name, age, country) tuple per person.
data = [
    ("John", 25, "USA"),
    ("Emma", 28, "Canada"),
    ("Mike", 22, "UK"),
]
# Only column names are supplied, so Spark infers each column's type from the data.
df = spark.createDataFrame(data, schema=["Name", "Age", "Country"])

# Display the rows, then the inferred schema.
df.show()
df.printSchema()


+----+---+-------+
|Name|Age|Country|
+----+---+-------+
|John| 25|    USA|
|Emma| 28| Canada|
|Mike| 22|     UK|
+----+---+-------+

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Country: string (nullable = true)



In [3]:



# Project the DataFrame down to the name and country columns only.
selected_df = df.select(["Name", "Country"])
selected_df.show()


+----+-------+
|Name|Country|
+----+-------+
|John|    USA|
|Emma| Canada|
|Mike|     UK|
+----+-------+



In [4]:

# Keep only the rows whose Age is strictly greater than 25.
# where() is an alias of filter(); the predicate is a Column expression.
filtered_df = df.where(df["Age"] > 25)
filtered_df.show()


+----+---+-------+
|Name|Age|Country|
+----+---+-------+
|Emma| 28| Canada|
+----+---+-------+



In [5]:

# Average age per country; the aggregate column is named "avg(Age)".
grouped_df = df.groupBy("Country").avg("Age")
grouped_df.show()


+-------+--------+
|Country|avg(Age)|
+-------+--------+
|    USA|    25.0|
| Canada|    28.0|
|     UK|    22.0|
+-------+--------+



In [6]:

# Join two DataFrames
# Department lookup keyed by the same "Name" values used in `df`.
data2 = [("John", "Sales"),
         ("Emma", "Marketing"),
         ("Mike", "Finance")]

df2 = spark.createDataFrame(data2, ["Name", "Department"])

# Inner join on the shared "Name" column: only names present in both frames
# are kept, and "Name" appears once in the result.
joined_df = df.join(df2, on="Name", how="inner")
joined_df.show()

# NOTE: the SparkSession is deliberately NOT stopped in this cell. A later
# cell still calls spark.createDataFrame(), which fails on a stopped session
# and would break "Restart Kernel -> Run All".


+----+---+-------+----------+
|Name|Age|Country|Department|
+----+---+-------+----------+
|Emma| 28| Canada| Marketing|
|John| 25|    USA|     Sales|
|Mike| 22|     UK|   Finance|
+----+---+-------+----------+



In [None]:


# Word count over the text column of a small DataFrame, using the RDD API.

# Re-acquire the session: getOrCreate() returns the active session, or starts
# a fresh one if it has been stopped, so this cell runs regardless of the
# session's prior state.
spark = SparkSession.builder.getOrCreate()

# Distinct names are used here so the people DataFrame `df` built by the
# earlier cells is not overwritten with unrelated data.
text_rows = [
    Row(col1='pyspark and spark', col2=1),
    Row(col1='pyspark', col2=2),
    Row(col1='spark vs hadoop', col2=2),
    Row(col1='spark', col2=2),
    Row(col1='hadoop', col2=2),
]
text_df = spark.createDataFrame(text_rows)

# Keep only the text column (col1) of every row.
lines = text_df.rdd.map(lambda row: row.col1)

# Split each line on single spaces, emit (word, 1) pairs, and sum per word.
counters = (
    lines.flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(add)
)

output = counters.collect()

# Highest count first. Ties are broken alphabetically so the printed order is
# the same on every run instead of depending on the order collect() returns.
sortedCollection = sorted(output, key=lambda pair: (-pair[1], pair[0]))

for (word, count) in sortedCollection:
    print("%s: %i" % (word, count))

# Stop the SparkSession now that the notebook's last computation has finished.
spark.stop()