# In [None]:
from pyspark.sql import SparkSession

# Start the Spark session for this lab. getOrCreate() hands back an
# already-running session when there is one instead of building another.
spark = (
    SparkSession.builder
    .appName("BDA Lab 3")
    .getOrCreate()
)


# Load the CSV file into a DataFrame. The first line supplies the column
# names, and Spark scans the data to infer each column's type.
# NOTE(review): if the tweet text can contain embedded newlines, the reader
# may need multiLine=True to keep such records on a single row — confirm
# against the data.
df = spark.read.csv("Amazon_Responded_Oct05.csv", header=True, inferSchema=True)

# Quick look at the data: schema, summary statistics and a few sample rows.
df.printSchema()
df.describe().show()
df.show(5)

# Report the dataset dimensions. A bare expression is only echoed when it is
# the last statement of a notebook cell, so print the values explicitly.
print("Number of rows:", df.count())
print("Number of columns:", len(df.columns))
# Narrow the DataFrame to the three columns used by the rest of the analysis.
df_selected = df.select(
    ["user_id_str", "user_followers_count", "text_"]
)
df_selected.show(5)

from pyspark.sql.functions import col, max

# First pass on the unfiltered selection: one row per user_id_str holding
# the maximum user_followers_count seen for that user.
# (`max` here is pyspark.sql.functions.max from the import above, not the
# Python builtin.)
df_user_max_followers = (
    df_selected
    .groupBy("user_id_str")
    .agg(max("user_followers_count").alias("max_followers"))
)

df_user_max_followers.show(5)

# Keep only rows whose 'user_followers_count' consists solely of digits, then
# convert the column to a 64-bit integer.
#
# Why the cast matters: the filter alone validates the value but leaves the
# column's type untouched. If schema inference typed the column as a string
# (which is what happens when it contains non-numeric values), max() would
# compare lexicographically, so "999" would rank above "10000". Casting after
# validation makes the aggregation below a true numeric maximum.
#
# NOTE(review): a digit string too large for a 64-bit integer casts to NULL
# when ANSI mode is off (and raises when it is on); tighten the regex if such
# values can occur in the data.
df_clean = (
    df_selected
    .filter(col("user_followers_count").cast("string").rlike("^[0-9]+$"))
    .withColumn("user_followers_count", col("user_followers_count").cast("long"))
)

df_clean.show(5)

# Group by user_id_str and get the (numeric) max number of followers for each
# user. `max` is pyspark.sql.functions.max from the import above, not the
# Python builtin.
df_user_max_followers = df_clean.groupBy("user_id_str").agg(
    max("user_followers_count").alias("max_followers")
)

df_user_max_followers.show(5)

# Number of rows (tweets) per user_id_str, computed on the unfiltered
# selection; the result has the columns 'user_id_str' and 'count'.
df_tweets_per_user = (
    df_selected
    .groupBy("user_id_str")
    .count()
)

# Preview the per-user counts.
df_tweets_per_user.show(5)

# "Popular" users: those whose maximum follower count is above 5000.
df_popular_users = df_user_max_followers.where(col("max_followers") > 5000)
df_popular_users.show(5)

# Inner-join the popular users back onto the selected columns so that only
# rows belonging to those users remain; each row also gains max_followers.
df_popular_tweets = df_selected.join(
    df_popular_users,
    on="user_id_str",
    how="inner",
)

# Preview the joined rows to confirm the result.
df_popular_tweets.show(5)

from pyspark.sql.functions import explode, split, lower

# Build one row per word of each popular user's tweet:
#   1. split 'text_' on runs of whitespace and explode the resulting array,
#   2. keep only tokens made up entirely of ASCII letters,
#   3. lower-case the surviving tokens.
tokens = explode(split(col("text_"), r"\s+"))
words_df = (
    df_popular_tweets
    .withColumn("word", tokens)
    .where(col("word").rlike("^[a-zA-Z]+$"))
    .withColumn("word", lower(col("word")))
)
words_df.show(5)

# Count the frequency of each word; the result has columns 'word' and 'count'.
word_counts = words_df.groupBy("word").count()

# Get the top 10 most popular words by frequency. Sorting by count alone
# leaves words with equal counts in an arbitrary order, so which of them make
# the cut can differ between runs; the secondary alphabetical key makes the
# result deterministic.
top_10_words = word_counts.orderBy(
    col("count").desc(),
    col("word").asc(),
).limit(10)

top_10_words.show()
