In [0]:
# Load cleaned CSV directly from DBFS
df = spark.read.csv("dbfs:/FileStore/cleaned_zomato1.csv", header=True, inferSchema=True)
df.show()


+----------+----------+--------------------+--------------------+--------------------+
|rate_clean|cost_clean|        online_order|          book_table|     listed_in(city)|
+----------+----------+--------------------+--------------------+--------------------+
|       4.1|      null|                 Yes|                 Yes|                null|
|     800.0|      null|Pasta, Lunch Buff...|North Indian, Mug...|'RATED\n  The pla...|
|       4.1|     800.0|                 Yes|                  No|'RATED\n  This pl...|
|       3.8|     800.0|                 Yes|                  No|pasta churros and...|
|       3.7|     300.0|                  No|                  No|        ('Rated 4.0'|
|       3.8|      null|                  No|                  No|                null|
|     600.0|      null| Panipuri, Gol Gappe|North Indian, Raj...|                null|
|       3.8|      null|                 Yes|                  No|                null|
|     600.0|      null|Onion Rings, Past...

In [0]:
df=df.na.drop()

In [0]:
df.show()

+----------+----------+------------+----------+--------------------+
|rate_clean|cost_clean|online_order|book_table|     listed_in(city)|
+----------+----------+------------+----------+--------------------+
|       4.1|     800.0|         Yes|        No|'RATED\n  This pl...|
|       3.8|     800.0|         Yes|        No|pasta churros and...|
|       3.7|     300.0|          No|        No|        ('Rated 4.0'|
|       4.2|     600.0|         Yes|       Yes|""RATED\n  While ...|
|       4.2|     500.0|         Yes|       Yes|""RATED\n  The Co...|
|       4.0|     450.0|          No|        No|""RATED\n  Loved ...|
|       3.8|     800.0|         Yes|        No|pasta churros and...|
|       3.9|     300.0|          No|        No|                  []|
|       3.1|     400.0|         Yes|        No|        Banashankari|
|       3.7|     500.0|         Yes|        No|               Cafes|
|       3.6|     900.0|          No|        No|        ('Rated 3.0'|
|       3.7|     300.0|          N

In [0]:
df.printSchema()


root
 |-- rate_clean: double (nullable = true)
 |-- cost_clean: double (nullable = true)
 |-- online_order: string (nullable = true)
 |-- book_table: string (nullable = true)
 |-- listed_in(city): string (nullable = true)



In [0]:
from pyspark.sql.functions import when,col
#Turn rate_clean into binary class: good (1) or not (0)
df = df.withColumn("target", when(col("rate_clean") >= 4.0, 1).otherwise(0))

In [0]:
# Original DataFrame before StringIndexer
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)


In [0]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Example for online_order
indexer = StringIndexer(inputCol="online_order", outputCol="online_order_index", handleInvalid="keep")
indexer_model = indexer.fit(train_df)
train_df = indexer_model.transform(train_df)
test_df = indexer_model.transform(test_df)

# Index 'book_table'
indexer_book = StringIndexer(inputCol="book_table", outputCol="book_table_index", handleInvalid="keep")
indexer_book_model = indexer_book.fit(train_df)
train_df = indexer_book_model.transform(train_df)
test_df = indexer_book_model.transform(test_df)

# Index 'listed_in(city)'
indexer_city = StringIndexer(inputCol="listed_in(city)", outputCol="city_index", handleInvalid="keep")
indexer_city_model = indexer_city.fit(train_df)
train_df = indexer_city_model.transform(train_df)
test_df = indexer_city_model.transform(test_df)

# Assemble features
assembler = VectorAssembler(
    inputCols=["cost_clean", "online_order_index", "book_table_index", "city_index"],
    outputCol="features"
)
train_df = assembler.transform(train_df)
test_df = assembler.transform(test_df)


In [0]:
df.printSchema()

root
 |-- rate_clean: double (nullable = true)
 |-- cost_clean: double (nullable = true)
 |-- online_order: string (nullable = true)
 |-- book_table: string (nullable = true)
 |-- listed_in(city): string (nullable = true)
 |-- target: integer (nullable = false)



In [0]:
train_df = train_df.na.drop()
test_df = test_df.na.drop()


In [0]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="target")
model = lr.fit(train_df)
predictions = model.transform(test_df)


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluate Model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="target")
accuracy = evaluator.evaluate(predictions)

print("Model Accuracy:", accuracy)


Model Accuracy: 0.775281393947345


In [0]:
df.write.csv("dbfs:/FileStore/cleaned_zomato2.csv", header=True)

In [0]:
display(dbutils.fs.ls("dbfs:/FileStore/cleaned_zomato2.csv"))


path,name,size,modificationTime
dbfs:/FileStore/cleaned_zomato2.csv/_SUCCESS,_SUCCESS,0,1749024140000
dbfs:/FileStore/cleaned_zomato2.csv/_committed_7292268192929872355,_committed_7292268192929872355,736,1749024140000
dbfs:/FileStore/cleaned_zomato2.csv/_started_7292268192929872355,_started_7292268192929872355,0,1749024138000
dbfs:/FileStore/cleaned_zomato2.csv/part-00000-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-556-1-c000.csv,part-00000-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-556-1-c000.csv,412189,1749024140000
dbfs:/FileStore/cleaned_zomato2.csv/part-00001-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-557-1-c000.csv,part-00001-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-557-1-c000.csv,341993,1749024140000
dbfs:/FileStore/cleaned_zomato2.csv/part-00002-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-558-1-c000.csv,part-00002-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-558-1-c000.csv,337465,1749024139000
dbfs:/FileStore/cleaned_zomato2.csv/part-00003-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-559-1-c000.csv,part-00003-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-559-1-c000.csv,298642,1749024140000
dbfs:/FileStore/cleaned_zomato2.csv/part-00004-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-560-1-c000.csv,part-00004-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-560-1-c000.csv,328064,1749024140000
dbfs:/FileStore/cleaned_zomato2.csv/part-00005-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-561-1-c000.csv,part-00005-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-561-1-c000.csv,305897,1749024140000
dbfs:/FileStore/cleaned_zomato2.csv/part-00006-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-562-1-c000.csv,part-00006-tid-7292268192929872355-6d536bc0-60ae-4980-96be-71031b4e4991-562-1-c000.csv,253347,1749024140000
