In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, lower, col
from pyspark.ml.feature import StopWordsRemover
from collections import Counter

In [12]:
# Spark session & context
spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

In [14]:
%ls -la /sparkdata

total 7496
drwxrwxrwx 1 root root     512 Apr 14 17:28 [0m[34;42m.[0m/
drwxr-xr-x 1 root root    4096 Apr 14 17:17 [01;34m..[0m/
-rwxrwxrwx 1 root root 7669276 Apr 14 17:28 [01;32mgoogleplaystore_user_reviews.csv[0m*


In [16]:
csv_path = "/sparkdata/googleplaystore_user_reviews.csv"

In [17]:
df = spark.read.csv(csv_path, header=True, inferSchema=True)
df.show()

+--------------------+--------------------+--------------------+-------------------+----------------------+
|                 App|   Translated_Review|           Sentiment| Sentiment_Polarity|Sentiment_Subjectivity|
+--------------------+--------------------+--------------------+-------------------+----------------------+
|10 Best Foods for...|"I like eat delic...| also ""Best Befo...|           Positive|                   1.0|
|10 Best Foods for...|This help eating ...|            Positive|               0.25|   0.28846153846153844|
|10 Best Foods for...|                 nan|                 nan|                nan|                   nan|
|10 Best Foods for...|Works great espec...|            Positive|                0.4|                 0.875|
|10 Best Foods for...|        Best idea us|            Positive|                1.0|                   0.3|
|10 Best Foods for...|            Best way|            Positive|                1.0|                   0.3|
|10 Best Foods for...|      

In [18]:
print("Number of rows:", df.count())

Number of rows: 64295


In [19]:
# Tokenize the reviews
df_words = df.withColumn("word", explode(split(lower(col("Translated_Review")), "\\W+")))

# Remove stop words
stop_words = set(StopWordsRemover.loadDefaultStopWords("english"))  # Default English stop words
df_filtered = df_words.filter(~df_words.word.isin(stop_words))

# Count word frequency
word_frequency = df_filtered.groupBy("word").count().orderBy(col("count").desc())

# Display the top keywords
word_frequency.show()

# If you want to get the results in Python
keywords_list = word_frequency.rdd.map(lambda row: (row.word, row.count)).collect()


+------+-----+
|  word|count|
+------+-----+
|   nan|26863|
|      |22489|
|  game| 9326|
|  like| 5427|
|  good| 5243|
|   app| 4941|
| great| 4775|
|  love| 4672|
|   get| 4644|
|  time| 4480|
|really| 3067|
|  even| 2828|
|   ads| 2630|
|     m| 2522|
|update| 2498|
|  play| 2442|
| phone| 2419|
|please| 2362|
|  work| 2316|
|  also| 2292|
+------+-----+
only showing top 20 rows

