In [6]:
%pip install -q python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import configparser
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import split, explode, lower, col
from pyspark.ml.feature import StopWordsRemover
from dotenv import load_dotenv

In [3]:
load_dotenv()

True

In [4]:
KEYSPACE = os.environ["CASSANDRA_KEYSPACE"]
IP_ADDRESS = os.environ["CASSANDRA_IP_ADDRESS"]
PORT = os.environ["CASSANDRA_PORT"]
TABLE = "user_reviews"
spark_config_apth = 'conf/spark.ini'
IP_ADDRESS, PORT, KEYSPACE, TABLE

('cassandra', '9042', 'mykeyspace', 'user_reviews')

In [5]:
config = configparser.ConfigParser()
config.optionxform=str
config.read(spark_config_apth)
list(config['spark'].items())

[('spark.master', 'local[*]'),
 ('spark.driver.memory', '1g'),
 ('spark.executor.memory', '1g'),
 ('spark.executor.instances', '4'),
 ('spark.sql.execution.arrow.pyspark.enabled', 'true'),
 ('spark.dynamicAllocation.enabled', 'true'),
 ('spark.dynamicAllocation.minExecutors', '1'),
 ('spark.dynamicAllocation.maxExecutors', '6'),
 ('spark.jars.packages',
  'com.datastax.spark:spark-cassandra-connector_2.12:3.5.1'),
 ('spark.cassandra.input.split.sizeInMB', '32'),
 ('spark.cassandra.input.fetch.sizeInRows', '1000'),
 ('spark.cassandra.query.retry.count', '-1')]

In [7]:
conf = SparkConf()
conf.setAll(list(config['spark'].items()))

<pyspark.conf.SparkConf at 0x7f4afb68ddd0>

In [8]:
spark = SparkSession.builder.config(conf=conf) \
    .config("spark.cassandra.connection.host", IP_ADDRESS) \
    .config("spark.cassandra.connection.port", PORT) \
    .getOrCreate()
    
sc = spark.sparkContext

In [9]:
df = spark.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(table=TABLE, keyspace=KEYSPACE) \
    .load()
    
df.cache().show(5, False)

+-----------+-------------------------------+---------+-------------------+----------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
|indexcolumn|app                            |sentiment|sentiment_polarity |sentiment_subjectivity|translated_review                                                                                                                        |
+-----------+-------------------------------+---------+-------------------+----------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
|1810       |95Live -SG#1 Live Streaming App|Neutral  |0.0                |0.4                   |Why aren't women naked? . Give some IDs. . .                                                                                             |
|12567      |Barbie Life™                   |Positiv

In [13]:
df.unpersist()

DataFrame[indexcolumn: string, app: string, sentiment: string, sentiment_polarity: string, sentiment_subjectivity: string, translated_review: string]

In [10]:
df.rdd.getNumPartitions()

10

In [11]:
print("Number of rows:", df.count())

Number of rows: 64295


In [12]:
# Tokenize the reviews
df_words = df.withColumn("word", explode(split(lower(col("Translated_Review")), "\\W+")))

# Remove stop words
stop_words = set(StopWordsRemover.loadDefaultStopWords("english"))  # Default English stop words
df_filtered = df_words.filter(~df_words.word.isin(stop_words))

# Count word frequency
word_frequency = df_filtered.groupBy("word").count().orderBy(col("count").desc())

# Display the top keywords
word_frequency.show()

# If you want to get the results in Python
keywords_list = word_frequency.rdd.map(lambda row: (row.word, row.count)).collect()


+------+-----+
|  word|count|
+------+-----+
|      |23013|
|  game| 9384|
|  like| 5498|
|  good| 5271|
|   app| 4998|
| great| 4810|
|   get| 4726|
|  love| 4681|
|  time| 4538|
|really| 3096|
|  even| 2883|
|   ads| 2661|
|     m| 2564|
|update| 2536|
|  play| 2479|
| phone| 2439|
|please| 2397|
|  work| 2349|
|  also| 2339|
|  much| 2305|
+------+-----+
only showing top 20 rows



In [13]:
spark.stop()