In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
#loading the required sql functions for the further processing
from pyspark.sql import functions
from pyspark.sql.functions import col, trim, lower, regexp_replace

In [4]:
spark = SparkSession.builder \
    .appName("BookReviewEDA") \
    .getOrCreate()

25/06/26 08:26:01 WARN Utils: Your hostname, vaibhavi-HP-Laptop-15-fd0xxx resolves to a loopback address: 127.0.1.1; using 192.168.1.9 instead (on interface wlo1)
25/06/26 08:26:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/26 08:26:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
#loading the ratings dataset
df = spark.read.csv("/home/vaibhavi/spark-ml-venv/ml_project/data/Books_rating.csv", header=True, inferSchema=True)
df.printSchema()
df.show(5)

                                                                                

root
 |-- Id: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- User_id: string (nullable = true)
 |-- profileName: string (nullable = true)
 |-- review/helpfulness: string (nullable = true)
 |-- review/score: string (nullable = true)
 |-- review/time: string (nullable = true)
 |-- review/summary: string (nullable = true)
 |-- review/text: string (nullable = true)

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|        Id|               Title|Price|       User_id|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|1882931173|Its Only Art If I...| NULL| AVCGYZL8FQQTD|"Jim of Oz ""jim-...|               7/

In [6]:
batches = df.randomSplit([0.1]*10, seed=42) 

- We have created batches for efficient handling, so further processing would go with the batches.

In [6]:
#taking the required columns, text and score, as required for the sentiment analysis

df_clean = df.select(("review/text"),("review/score"))


In [7]:
df_clean.head()

Row(review/text="This is only for Julie Strain fans. It's a collection of her photos -- about 80 pages worth with a nice section of paintings by Olivia.If you're looking for heavy literary content, this isn't the place to find it -- there's only about 2 pages with text and everything else is photos.Bottom line: if you only want one book, the Six Foot One ... is probably a better choice, however, if you like Julie like I like Julie, you won't go wrong on this one either.", review/score='4.0')

In [8]:
df_clean.show(n=5)

+--------------------+------------+
|         review/text|review/score|
+--------------------+------------+
|This is only for ...|         4.0|
|I don't care much...|         5.0|
|"If people become...|         5.0|
|Theodore Seuss Ge...|         4.0|
|"Philip Nel - Dr....|         4.0|
+--------------------+------------+
only showing top 5 rows



In [None]:
#review?scoew was refferd as string, converting it to int
df_clean=df_clean.withColumn("review_score", col("review/score").cast("int"))

In [12]:
df_clean.show()

+--------------------+------------+------------+
|         review/text|review/score|review_score|
+--------------------+------------+------------+
|This is only for ...|         4.0|           4|
|I don't care much...|         5.0|           5|
|"If people become...|         5.0|           5|
|Theodore Seuss Ge...|         4.0|           4|
|"Philip Nel - Dr....|         4.0|           4|
|"""Dr. Seuss: Ame...|         4.0|           4|
|Theodor Seuss Gie...|         5.0|           5|
|"When I recieved ...|         5.0|           5|
|"Trams (or any pu...|         5.0|           5|
|As far as I am aw...|         4.0|           4|
|I just finished t...|         5.0|           5|
|"Many small churc...|         5.0|           5|
|I just finished r...|         5.0|           5|
|"I hadn't been a ...|         5.0|           5|
|I bought this boo...|         1.0|           1|
|"I have to admit,...|         4.0|           4|
|"This is a self-p...|         1.0|           1|
|When I first read..

In [13]:
df_clean.groupBy("review_score").count().show()



+------------+-------+
|review_score|  count|
+------------+-------+
|        NULL|  18064|
|           1| 201000|
|           3| 252940|
|           5|1795795|
|           4| 581728|
|           2| 150449|
|         327|      5|
|          19|     15|
|  1295568000|      1|
|  1208995200|      1|
|  1211760000|      2|
+------------+-------+



                                                                                

In [None]:
#checking the text at rating 19
df_clean.filter(col("review_score") == 19).show()


                                                                                

+-----------+------------+------------+
|review/text|review/score|review_score|
+-----------+------------+------------+
|        8/8|       19.95|          19|
|        4/4|       19.95|          19|
|        4/4|       19.95|          19|
|        4/4|       19.95|          19|
|        3/3|       19.95|          19|
|        2/2|       19.95|          19|
|        1/1|       19.95|          19|
|        0/0|       19.95|          19|
|        0/0|       19.95|          19|
|        0/0|       19.95|          19|
|        0/0|       19.95|          19|
|        1/2|       19.95|          19|
|        2/4|       19.95|          19|
|        1/3|       19.95|          19|
|        4/8|       19.95|          19|
+-----------+------------+------------+



In [17]:
# converting/cleaning the review_score, as we can see there are columns with review_text which is not a text 
df_text = df_clean.filter(col("review/text").isNotNull()) \
    .withColumn("clean_text", lower(trim(col("review/text")))) \
    .withColumn("clean_text", regexp_replace("clean_text", "[^a-zA-Z0-9\\s]", "")) \
    .filter(col("clean_text") != "")

In [18]:
df_text.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [24]:
#java is giving OutOfMemory erroe for heavy processing further on, so we are manually dividing the batches for efficient handling
batches = df_text.randomSplit([0.1]*10, seed=42)  # 10 batches


ConnectionRefusedError: [Errno 111] Connection refused

In [19]:
#dropping the original review/text column
df_text = df_text.drop("review/text")

In [20]:
df_text.show()

+------------+------------+--------------------+
|review/score|review_score|          clean_text|
+------------+------------+--------------------+
|         4.0|           4|this is only for ...|
|         5.0|           5|i dont care much ...|
|         5.0|           5|if people become ...|
|         4.0|           4|theodore seuss ge...|
|         4.0|           4|philip nel  dr se...|
|         4.0|           4|dr seuss american...|
|         5.0|           5|theodor seuss gie...|
|         5.0|           5|when i recieved t...|
|         5.0|           5|trams or any publ...|
|         4.0|           4|as far as i am aw...|
|         5.0|           5|i just finished t...|
|         5.0|           5|many small church...|
|         5.0|           5|i just finished r...|
|         5.0|           5|i hadnt been a sm...|
|         1.0|           1|i bought this boo...|
|         4.0|           4|i have to admit i...|
|         1.0|           1|this is a selfpub...|
|         5.0|      

In [21]:
df_text.cache()

DataFrame[review/score: string, review_score: int, clean_text: string]

In [22]:
#dropping original reivew/score as its schema was infered as string.
df_text=df_text.drop('review/score')

In [23]:
df_text.groupBy("review_score").count().show()

25/06/26 08:10:44 WARN BlockManager: Block rdd_49_8 could not be removed as it was not found on disk or in memory
25/06/26 08:10:44 ERROR Executor: Exception in task 8.0 in stage 17.0 (TID 82)
java.lang.OutOfMemoryError: Java heap space
	at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57)
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:335)
	at org.apache.spark.sql.execution.columnar.ColumnBuilder$.ensureFreeSpace(ColumnBuilder.scala:167)
	at org.apache.spark.sql.execution.columnar.BasicColumnBuilder.appendFrom(ColumnBuilder.scala:73)
	at org.apache.spark.sql.execution.columnar.NativeColumnBuilder.org$apache$spark$sql$execution$columnar$NullableColumnBuilder$$super$appendFrom(ColumnBuilder.scala:99)
	at org.apache.spark.sql.execution.columnar.NullableColumnBuilder.appendFrom(NullableColumnBuilder.scala:61)
	at org.apache.spark.sql.execution.columnar.NullableColumnBuilder.appendFrom$(NullableColumnBuilder.scala:54)
	at org.apache.spark.sql.execution.columnar.NativeColumnBuilder.o

Py4JError: py4j.reflection does not exist in the JVM

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/vaibhavi/ml-project-env/lib/python3.10/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/vaibhavi/ml-project-env/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/vaibhavi/ml-project-env/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


In [38]:
# removing the columns which have unusual review_score
df_final = df_text.filter(
    (col("review_score").isNotNull()) &
    (col("review_score").between(1, 5))
)


In [39]:
df_final.show()

+------------+--------------------+
|review_score|          clean_text|
+------------+--------------------+
|           4|this is only for ...|
|           5|i dont care much ...|
|           5|if people become ...|
|           4|theodore seuss ge...|
|           4|philip nel  dr se...|
|           4|dr seuss american...|
|           5|theodor seuss gie...|
|           5|when i recieved t...|
|           5|trams or any publ...|
|           4|as far as i am aw...|
|           5|i just finished t...|
|           5|many small church...|
|           5|i just finished r...|
|           5|i hadnt been a sm...|
|           1|i bought this boo...|
|           4|i have to admit i...|
|           1|this is a selfpub...|
|           5|when i first read...|
|           5|i read the review...|
|           5|i really enjoyed ...|
+------------+--------------------+
only showing top 20 rows



In [40]:
df_final.groupBy("review_score").count().show()



+------------+-------+
|review_score|  count|
+------------+-------+
|           1| 200992|
|           3| 252940|
|           5|1795762|
|           4| 581722|
|           2| 150449|
+------------+-------+



                                                                                

- Defining a function for calculating the sentiment of a review.

>classfying the review into 3 types:    
>good : review_score > 4      
>average : 3 < review_score < 4
>bad : review_score < 3

In [51]:
def sentiment_score(review_score):
    if (review_score < 3):
        sentiment = "bad"
    elif (review_score > 4):
        sentiment = "good"
    else:
        sentiment = "average"
    return sentiment

In [41]:
from pyspark.sql.functions import udf

In [52]:
#now applying the written function to every line of the sentiment,using lambda functions


# Register UDF
sentiment_udf = udf(sentiment_score)


In [53]:
## use the udf 

df_final=df_final.withColumn("sentiment",sentiment_udf(col("review_score")))

In [54]:
df_final.show()

+------------+--------------------+---------+
|review_score|          clean_text|sentiment|
+------------+--------------------+---------+
|           4|this is only for ...|  average|
|           5|i dont care much ...|     good|
|           5|if people become ...|     good|
|           4|theodore seuss ge...|  average|
|           4|philip nel  dr se...|  average|
|           4|dr seuss american...|  average|
|           5|theodor seuss gie...|     good|
|           5|when i recieved t...|     good|
|           5|trams or any publ...|     good|
|           4|as far as i am aw...|  average|
|           5|i just finished t...|     good|
|           5|many small church...|     good|
|           5|i just finished r...|     good|
|           5|i hadnt been a sm...|     good|
|           1|i bought this boo...|      bad|
|           4|i have to admit i...|  average|
|           1|this is a selfpub...|      bad|
|           5|when i first read...|     good|
|           5|i read the review...

In [55]:
#checking the distribution of the sentiments
df_final.groupBy("sentiment").count().show()



+---------+-------+
|sentiment|  count|
+---------+-------+
|  average| 834662|
|      bad| 351441|
|     good|1795762|
+---------+-------+



                                                                                

- imbalanced data is seen for all the categories.To have better features, we will add additional column - text polarity for better understanding of the sentiment.


In [57]:
from textblob import TextBlob
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

def get_polarity(text):
    if text:
        return TextBlob(text).sentiment.polarity
    return 0.0



In [58]:
polarity_udf = udf(get_polarity)

In [59]:
df_final = df_final.withColumn("polarity",polarity_udf("clean_text"))

In [78]:
df_final.cache()

DataFrame[review_score: int, clean_text: string, sentiment: string, polarity: string, inference: string]

In [83]:
from pyspark.sql.functions import when

df_final = df_final.withColumn("sentiment",
    when(col("polarity") >= 0.3, "positive")
    .when(col("polarity") <= -0.3, "negative")
    .otherwise("neutral"))


In [84]:
# Sample to balance (adjust ratio if needed)
pos = df_final.filter(col("sentiment") == "positive").sample(False, 0.2, seed=42)
neg = df_final.filter(col("sentiment") == "negative")
neu = df_final.filter(col("sentiment") == "neutral")

df_balanced = pos.union(neg).union(neu)


In [85]:
df_balanced.show()

25/06/22 17:40:41 WARN BlockManager: Block rdd_193_0 could not be removed as it was not found on disk or in memory
25/06/22 17:40:41 ERROR Executor: Exception in task 0.0 in stage 49.0 (TID 265)
java.lang.OutOfMemoryError: Java heap space
	at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57)
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:335)
	at org.apache.spark.sql.execution.columnar.ColumnBuilder$.ensureFreeSpace(ColumnBuilder.scala:167)
	at org.apache.spark.sql.execution.columnar.BasicColumnBuilder.appendFrom(ColumnBuilder.scala:73)
	at org.apache.spark.sql.execution.columnar.NativeColumnBuilder.org$apache$spark$sql$execution$columnar$NullableColumnBuilder$$super$appendFrom(ColumnBuilder.scala:99)
	at org.apache.spark.sql.execution.columnar.NullableColumnBuilder.appendFrom(NullableColumnBuilder.scala:61)
	at org.apache.spark.sql.execution.columnar.NullableColumnBuilder.appendFrom$(NullableColumnBuilder.scala:54)
	at org.apache.spark.sql.execution.columnar.NativeColumnBuilder

Py4JError: py4j.reflection does not exist in the JVM

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/vaibhavi/ml-project-env/lib/python3.10/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/vaibhavi/ml-project-env/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/vaibhavi/ml-project-env/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


In [None]:
df_balanced.coalesce(1).write \
    .mode("overwrite") \
    .option("header", True) \
    .csv("output/balanced_sentiment")


In [25]:
spark.stop()

ConnectionRefusedError: [Errno 111] Connection refused