In [None]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.ml.regression import *

In [None]:
# Create (or reuse) the Spark session that backs this notebook.
spark = (
    SparkSession.builder
    .appName('FlipkartToyPriceModel')
    .getOrCreate()
)

# The path ends in .xls, but the file is read with the CSV reader: the first
# row supplies the column names and column types are inferred from the data.
df = spark.read.csv(
    '/content/flipkart_toys.xls',
    header=True,
    inferSchema=True,
)

In [None]:
# Preview the first five raw rows before any cleaning is applied.
df.show(n=5)

+---+--------------------+------+------------+---------+---------+-------+
|_c0|                Name|Rating|Rating_count|New_Price|Old_Price|  Offer|
+---+--------------------+------+------------+---------+---------+-------+
|  0|ARIZON DIY Plasti...|   3.8|    (53,122)|     ₹144|     ₹649|77% off|
|  1|Miss & Chief Kick...|   4.2|     (8,487)|     ₹799|   ₹2,499|68% off|
|  2|Learn With Fun Lo...|   4.1|     (1,521)|     ₹279|   ₹1,799|84% off|
|  3|xelix 2 in 1 Baby...|   4.4|     (7,068)|     ₹282|     ₹999|71% off|
|  4|GoodsNet Flash Dr...|   3.9|    (18,313)|     ₹247|     ₹899|72% off|
+---+--------------------+------+------------+---------+---------+-------+
only showing top 5 rows



In [None]:
# Clean ₹ and commas, and convert to numbers.
# Raw string formats, as seen in the preview of `df`:
#   New_Price / Old_Price -> "₹2,499"
#   Rating_count          -> "(53,122)"
#   Offer                 -> "77% off"
df_cleaned = (
    df
    .withColumn("New_Price", regexp_replace("New_Price", "₹|,", "").cast(DoubleType()))
    .withColumn("Old_Price", regexp_replace("Old_Price", "₹|,", "").cast(DoubleType()))
    .withColumn("Rating", col("Rating").cast(DoubleType()))
    .withColumn("Rating_count", regexp_replace("Rating_count", r"[(),]", "").cast(IntegerType()))
    .withColumn("Offer", regexp_replace("Offer", "% off", "").cast(DoubleType()))
    # Missing values (and, with ANSI mode off, values that fail to parse) end up
    # NULL after the casts. VectorAssembler's default handleInvalid="error"
    # rejects NULL inputs, so drop rows that are NULL in any column the model
    # uses as a feature or as the label. Old_Price is not a model input and is
    # deliberately left out of the subset.
    .dropna(subset=["Rating", "Rating_count", "New_Price", "Offer"])
)


In [None]:
# Predictor columns that are packed into the single vector column Spark ML expects.
feature_cols = [
    'Rating',
    'Rating_count',
    'New_Price',
]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)

# Keep only the assembled feature vector and the target.
# NOTE(review): the target is selected as lower-case 'offer' (the cleaned column
# is 'Offer'); the regression cell refers to it by the same lower-case name, so
# keep the two spellings in sync.
assembled_df = (
    assembler
    .transform(df_cleaned)
    .select('features', 'offer')
)

In [None]:
# Seeded 80/20 split into training and held-out test rows.
train_df, test_df = assembled_df.randomSplit(weights=[0.8, 0.2], seed=42)

# Ordinary linear regression of the discount label on the assembled features.
lr = LinearRegression(
    featuresCol='features',
    labelCol='offer',
    predictionCol='prediction',
)

# Fit on the training portion only; test_df is reserved for evaluation.
lr_model = lr.fit(train_df)

In [None]:
# Fitted parameters: one coefficient per entry of feature_cols, in the same order.
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

# Score the model on the held-out rows and report the summary metrics.
test_results = lr_model.evaluate(test_df)

print(f"RMSE: {test_results.rootMeanSquaredError}")
print(f"R2: {test_results.r2}")

Coefficients: [0.9469809378433298,9.175181156223161e-05,-0.012304448542574254]
Intercept: 74.27555812641698
RMSE: 5.91925986859266
R2: -0.03099254854419442


 Interpretation of Results
Coefficients:

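Rating: +0.947 → Small positive impact on discount: each additional rating point is associated with a discount about 0.95 percentage points higher, holding the other features fixed.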

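Rating_count: +0.0000917 → Negligible per additional rating, but the feature is unscaled, so the raw coefficient size is not a measure of importance: for a product with 53,122 ratings (first row of the preview) this term contributes about +4.9 percentage points.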

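New_Price: -0.0123 → Higher price leads to a lower discount: each additional ₹1 of selling price is associated with a discount about 0.0123 percentage points lower (roughly 1.2 points per ₹100).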

Intercept: ~74.28 → Baseline discount when all other features are 0 (not practically meaningful here).

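RMSE (Root Mean Squared Error): 5.92 → Predictions on the test set are typically off by about 6 percentage points of discount (RMSE weights large errors more heavily than a plain average error), which is relatively high if discounts vary in a tight range.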

R² (R-squared): -0.03 → 🚨 A negative R² on the test set means the model predicts worse than simply using the mean discount for every product.