In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
#loading the required sql functions for the further processing
from pyspark.sql import functions
from pyspark.sql.functions import col, trim, lower, regexp_replace

In [4]:
from pyspark.sql.functions import when

In [5]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("BookReviewEDABatches")
    .getOrCreate()
)

25/06/26 09:52:17 WARN Utils: Your hostname, vaibhavi-HP-Laptop-15-fd0xxx resolves to a loopback address: 127.0.1.1; using 192.168.1.9 instead (on interface wlo1)
25/06/26 09:52:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/26 09:52:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/26 09:52:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
# Load the raw book-review CSV.
# An earlier attempt used a plain spark.read.csv(..., header=True, inferSchema=True);
# this version sets the CSV options explicitly instead:
#   - quote/escape are both the double-quote character,
#   - multiLine lets a quoted field span several lines,
#   - DROPMALFORMED discards rows that cannot be parsed.
# No schema inference is requested, so every column is read as a string.
df = (
    spark.read
    .option("header", True)
    .option("quote", '"')
    .option("escape", '"')
    .option("multiLine", True)
    .option("mode", "DROPMALFORMED")
    .csv("/home/vaibhavi/spark-ml-venv/ml_project/data/Books_rating.csv")
)
df.printSchema()
df.show()

root
 |-- Id: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- User_id: string (nullable = true)
 |-- profileName: string (nullable = true)
 |-- review/helpfulness: string (nullable = true)
 |-- review/score: string (nullable = true)
 |-- review/time: string (nullable = true)
 |-- review/summary: string (nullable = true)
 |-- review/text: string (nullable = true)

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|        Id|               Title|Price|       User_id|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|1882931173|Its Only Art If I...| NULL| AVCGYZL8FQQTD|Jim of Oz "jim-of...|               7/

In [7]:
# Keep only the required columns: the rating and the review text.
df = df.select(["review/score", "review/text"])

In [8]:
# Preview the first 20 rows of the reduced DataFrame, with long values truncated.
df.show(n=20, truncate=True)

+------------+--------------------+
|review/score|         review/text|
+------------+--------------------+
|         4.0|This is only for ...|
|         5.0|I don't care much...|
|         5.0|If people become ...|
|         4.0|Theodore Seuss Ge...|
|         4.0|Philip Nel - Dr. ...|
|         4.0|"Dr. Seuss: Ameri...|
|         5.0|Theodor Seuss Gie...|
|         5.0|When I recieved t...|
|         5.0|Trams (or any pub...|
|         4.0|As far as I am aw...|
|         5.0|I just finished t...|
|         5.0|Many small church...|
|         5.0|I just finished r...|
|         5.0|I hadn't been a s...|
|         1.0|I bought this boo...|
|         4.0|I have to admit, ...|
|         1.0|This is a self-pu...|
|         5.0|When I first read...|
|         5.0|I read the review...|
|         5.0|I really enjoyed ...|
+------------+--------------------+
only showing top 20 rows



In [9]:
# Split the data into ten equally weighted random batches so each one can be
# processed and written separately; the fixed seed keeps the split repeatable.
batches = df.randomSplit(weights=[0.1] * 10, seed=42)

In [10]:
# Rating-based sentiment label (registered later as a UDF).
# A missing score used to raise a NoneType exception during execution,
# so it is mapped to the label "unknown" instead.
def sentiment_score(review_score):
    """Map a numeric review score to a coarse sentiment label.

    Returns:
        "unknown" when the score is None, "bad" for scores below 3,
        "good" for scores above 4, and "average" for everything else.
    """
    if review_score is None:
        return "unknown"
    if review_score < 3:
        return "bad"
    if review_score > 4:
        return "good"
    return "average"

In [11]:
from textblob import TextBlob
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

def get_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Missing or empty text yields a polarity of 0.0 without invoking TextBlob.
    """
    if not text:
        return 0.0
    return TextBlob(text).sentiment.polarity

In [12]:
# Register both functions as Spark UDFs (user-defined functions).
# udf() produces a string column when no return type is given. That is correct for
# the sentiment label, but it would store the numeric polarity as text, so the
# polarity UDF declares a double return type explicitly.
sentiment_udf = udf(sentiment_score)
polarity_udf = udf(get_polarity, "double")

In [13]:
# function to generate the final dataframe 

Define a function that cleans and processes a batch DataFrame.

Steps in the function:

1. Convert the review score to an integer.

2. Clean the review text by trimming, lowercasing, and removing non-alphanumeric characters.

3. Remove unusual data (e.g. reviews with a rating of 19 or a NULL rating).

4. Generate the rating sentiment and the polarity of the review.

5. Return the cleaned DataFrame.

In [14]:
# Quick check of the text-cleaning expression on a small sample.

# Step 1: take just 5 rows.
df_sample = df.limit(5)

# Step 2: display the raw values without truncation.
df_sample.select("review/score", "review/text").show(truncate=False)

# Step 3: clean ONLY the review/text column: trim, lowercase, then remove every
# character that is not a letter, a digit, or whitespace.
from pyspark.sql.functions import lower, trim, regexp_replace

df_cleaned = df_sample.withColumn(
    "clean_text",
    regexp_replace(
        lower(trim(col("review/text"))),
        "[^a-zA-Z0-9\\s]",
        "",
    ),
)

# Step 4: display the cleaned text without truncation.
df_cleaned.select("clean_text").show(truncate=False)


+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
def processing_df(batch, min_score=1, max_score=5):
    """Clean one batch of reviews and attach rating- and text-based sentiment.

    Steps:
      1. Cast "review/score" to an integer column "review_score".
      2. Keep only rows whose integer score lies in [min_score, max_score]. This
         removes NULL scores, values that did not cast to an integer, and
         out-of-range ratings (e.g. 19).
      3. Build "clean_text" from "review/text" by trimming, lowercasing and
         removing non-alphanumeric characters; drop rows whose text is missing
         or empty after cleaning.
      4. Add "rating_sentiment" (from the score) and "polarity" (from the cleaned
         text), then bucket polarity into "polarity_sentiment":
         >= 0.3 is "positive", <= -0.3 is "negative", otherwise "neutral".

    Args:
        batch: DataFrame containing the "review/score" and "review/text" columns.
        min_score: Smallest rating considered valid (inclusive).
        max_score: Largest rating considered valid (inclusive).

    Returns:
        DataFrame with the columns review_score, clean_text, rating_sentiment,
        polarity and polarity_sentiment.
    """
    df_clean = (
        batch
        .withColumn("review_score", col("review/score").cast("int"))
        # Filter on the cast score rather than the raw column: between() evaluates
        # to NULL for a NULL score, so those rows are dropped along with the
        # out-of-range ratings.
        .filter(col("review_score").between(min_score, max_score))
        .filter(col("review/text").isNotNull())
        .withColumn("clean_text", regexp_replace(lower(trim(col("review/text"))), "[^a-zA-Z0-9\\s]", ""))
        .filter(col("clean_text") != "")
        .withColumn("rating_sentiment", sentiment_udf(col("review_score")))
        # Cast to double so the threshold comparisons below are numeric even if
        # the UDF was registered without an explicit return type.
        .withColumn("polarity", polarity_udf(col("clean_text")).cast("double"))
        .withColumn(
            "polarity_sentiment",
            when(col("polarity") >= 0.3, "positive")
            .when(col("polarity") <= -0.3, "negative")
            .otherwise("neutral"),
        )
        .select("review_score", "clean_text", "rating_sentiment", "polarity", "polarity_sentiment")
    )

    return df_clean

In [16]:
# Run the processing logic on every batch and store each result as CSV in its
# own folder under output/. All fields are quoted, and an existing folder for
# the same batch is overwritten.
for batch_no, batch in enumerate(batches, start=1):
    print(f"Processing batch {batch_no}...")
    processed = processing_df(batch)
    writer = (
        processed.write
        .mode("overwrite")
        .option("header", True)
        .option("quote", '"')
        .option("escape", '"')
        .option("quoteAll", True)
    )
    writer.csv(f"output/eda_batch_{batch_no}")


Processing batch 1...


                                                                                

Processing batch 2...


                                                                                

Processing batch 3...


                                                                                

Processing batch 4...


                                                                                

Processing batch 5...


                                                                                

Processing batch 6...


                                                                                

Processing batch 7...


                                                                                

Processing batch 8...


                                                                                

Processing batch 9...


                                                                                

Processing batch 10...


                                                                                

In [27]:
# Done! The EDA batches are prepared and ready to be pulled into further processing.

In [17]:
# Read the first written batch back to check the CSV that was actually produced.
df_final = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/home/vaibhavi/spark-ml-venv/ml_project/preprocessing/output/eda_batch_1")
)
df_final.printSchema()
df_final.show(5)

root
 |-- review_score: integer (nullable = true)
 |-- clean_text: string (nullable = true)
 |-- rating_sentiment: string (nullable = true)
 |-- polarity: double (nullable = true)
 |-- polarity_sentiment: string (nullable = true)

+------------+--------------------+----------------+--------------------+------------------+
|review_score|          clean_text|rating_sentiment|            polarity|polarity_sentiment|
+------------+--------------------+----------------+--------------------+------------------+
|           1|i now cant abide ...|             bad|  0.1111111111111111|           neutral|
|           1|thats actually on...|             bad| 0.28932367149758453|           neutral|
|           1|occasional imperf...|             bad|-0.03500000000000002|           neutral|
|           1|airframe is meant...|             bad|0.004458971088435386|           neutral|
|           1|amanda miranda is...|             bad| 0.12636363636363637|           neutral|
+------------+-----------