In [171]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer, OneHotEncoder, StringIndexer, IndexToString
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, MinMaxScaler, RobustScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import DataFrame
from pyspark.sql.functions import col,when
from pyspark.sql.functions import col, lit, coalesce
import pandas as pd



In [172]:
# Start (or reuse) the Spark session named 'Dataframe'; the CSV loads below go through it.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [173]:
# Bare expression: the notebook shows the SparkSession object as this cell's output.
spark

In [174]:
# Load train2.csv, taking column names from the header row and letting Spark infer the types.
# NOTE(review): df_pyspark is reassigned from train1.csv a few cells below; confirm which
# file is the intended input.
df_pyspark = spark.read.csv('train2.csv', header=True, inferSchema=True)

In [175]:
# Print the inferred schema (column name, type, nullability) of the frame loaded above.
df_pyspark.printSchema()

root
 |-- id: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- city_code: string (nullable = true)
 |-- region_code: string (nullable = true)
 |-- center_type: string (nullable = true)
 |-- op_area: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- cuisine: string (nullable = true)
 |-- checkout_price: double (nullable = true)
 |-- base_price: double (nullable = true)
 |-- emailer_for_promotion: integer (nullable = true)
 |-- homepage_featured: integer (nullable = true)
 |-- num_orders: integer (nullable = true)



In [176]:
# Load train1.csv (header row -> column names, column types inferred) and preview it.
# NOTE(review): this replaces the frame that was read from train2.csv earlier; confirm
# which file is the intended input.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('train1.csv')
)
df_pyspark.show()

+-------+----+---------+-----------+-----------+-------+---------+-----------+--------------+----------+---------------------+-----------------+----------+
|     id|week|city_code|region_code|center_type|op_area| category|    cuisine|checkout_price|base_price|emailer_for_promotion|homepage_featured|num_orders|
+-------+----+---------+-----------+-----------+-------+---------+-----------+--------------+----------+---------------------+-----------------+----------+
|1379560|   1|      647|         56|     TYPE_C|    2.0|Beverages|       Thai|        136.83|    152.29|                    0|                0|       177|
|1466964|   1|      647|         56|     TYPE_C|    2.0|Beverages|       Thai|        136.83|    135.83|                    0|                0|       270|
|1346989|   1|      647|         56|     TYPE_C|    2.0|Beverages|       Thai|        134.86|    135.86|                    0|                0|       189|
|1338232|   1|      647|         56|     TYPE_C|    2.0|Beverage

In [177]:
# Drop the 'id' column; every other column is kept unchanged.
# NOTE(review): presumably 'id' is only a row identifier with no predictive value — confirm.
df_pyspark = df_pyspark.drop('id')

In [178]:
# Fetch the first 5 rows as a list of Row objects to spot-check the frame after dropping 'id'.
df_pyspark.head(5)

[Row(week=1, city_code=647, region_code=56, center_type='TYPE_C', op_area=2.0, category='Beverages', cuisine='Thai', checkout_price=136.83, base_price=152.29, emailer_for_promotion=0, homepage_featured=0, num_orders=177),
 Row(week=1, city_code=647, region_code=56, center_type='TYPE_C', op_area=2.0, category='Beverages', cuisine='Thai', checkout_price=136.83, base_price=135.83, emailer_for_promotion=0, homepage_featured=0, num_orders=270),
 Row(week=1, city_code=647, region_code=56, center_type='TYPE_C', op_area=2.0, category='Beverages', cuisine='Thai', checkout_price=134.86, base_price=135.86, emailer_for_promotion=0, homepage_featured=0, num_orders=189),
 Row(week=1, city_code=647, region_code=56, center_type='TYPE_C', op_area=2.0, category='Beverages', cuisine='Indian', checkout_price=339.5, base_price=437.53, emailer_for_promotion=0, homepage_featured=0, num_orders=54),
 Row(week=1, city_code=647, region_code=56, center_type='TYPE_C', op_area=2.0, category='Beverages', cuisine='In

In [179]:
# Find the columns that contain missing values: nulls in any column, plus empty ("") or
# single-space (" ") strings in string-typed columns.
#
# Two deliberate differences from a per-column filter().count() loop:
#   * every column is checked in ONE aggregation job instead of one Spark job per column;
#   * the blank-string comparison is applied to string columns only. Comparing a numeric
#     column with "" makes Spark cast "" to a number, which yields null (never a match)
#     by default and raises an error when ANSI mode is enabled.
missing_flags = df_pyspark.select([
    when(
        (col(name).isNull() | (col(name) == "") | (col(name) == " "))
        if dtype == 'string'
        else col(name).isNull(),
        1,
    ).otherwise(0).alias(name)
    for name, dtype in df_pyspark.dtypes
])

# Aggregating without grouping keys returns exactly one row: one total per column.
missing_totals = missing_flags.groupBy().sum(*missing_flags.columns).first()

# A total is None when the frame has no rows; that is treated as "nothing missing".
missing_columns = [
    name for name, total in zip(missing_flags.columns, missing_totals) if total
]
missing_columns

[]

In [180]:
# Names of all string-typed columns; these are treated as the categorical columns.
# (df.dtypes yields (name, type) pairs, unpacked here so the loop variable does not
# reuse the name of the imported `col` function.)
categorical_columns = [name for name, dtype in df_pyspark.dtypes if dtype == 'string']

categorical_columns

['center_type', 'category', 'cuisine']

In [181]:

# One StringIndexer per categorical column, each writing to "<column>_index".
indexer_stages = [
    StringIndexer(inputCol=column_name, outputCol=column_name + "_index")
    for column_name in categorical_columns
]

# Run all indexers as a single pipeline and fit it on the current frame.
pipeline = Pipeline(stages=indexer_stages)
indexer_model = pipeline.fit(df_pyspark)

# Add the index columns, then drop the original categorical columns they replace.
df_pyspark = indexer_model.transform(df_pyspark).drop(*categorical_columns)

df_pyspark.show()

+----+---------+-----------+-------+--------------+----------+---------------------+-----------------+----------+-----------------+--------------+-------------+
|week|city_code|region_code|op_area|checkout_price|base_price|emailer_for_promotion|homepage_featured|num_orders|center_type_index|category_index|cuisine_index|
+----+---------+-----------+-------+--------------+----------+---------------------+-----------------+----------+-----------------+--------------+-------------+
|   1|      647|         56|    2.0|        136.83|    152.29|                    0|                0|       177|              1.0|           0.0|          1.0|
|   1|      647|         56|    2.0|        136.83|    135.83|                    0|                0|       270|              1.0|           0.0|          1.0|
|   1|      647|         56|    2.0|        134.86|    135.86|                    0|                0|       189|              1.0|           0.0|          1.0|
|   1|      647|         56|    2.

In [182]:
# Reorder the columns so that 'num_orders' is the last one; all other columns keep
# their relative order.
df_pyspark = df_pyspark.select([
    "week", "city_code", "region_code", "op_area",
    "checkout_price", "base_price",
    "emailer_for_promotion", "homepage_featured",
    "center_type_index", "category_index", "cuisine_index",
    "num_orders",
])

# Preview the reordered frame.
df_pyspark.show()

+----+---------+-----------+-------+--------------+----------+---------------------+-----------------+-----------------+--------------+-------------+----------+
|week|city_code|region_code|op_area|checkout_price|base_price|emailer_for_promotion|homepage_featured|center_type_index|category_index|cuisine_index|num_orders|
+----+---------+-----------+-------+--------------+----------+---------------------+-----------------+-----------------+--------------+-------------+----------+
|   1|      647|         56|    2.0|        136.83|    152.29|                    0|                0|              1.0|           0.0|          1.0|       177|
|   1|      647|         56|    2.0|        136.83|    135.83|                    0|                0|              1.0|           0.0|          1.0|       270|
|   1|      647|         56|    2.0|        134.86|    135.86|                    0|                0|              1.0|           0.0|          1.0|       189|
|   1|      647|         56|    2.

In [184]:
# Standardize every column of the frame (zero mean, unit standard deviation).
# Note that feature_columns is the full column list, so 'num_orders' is scaled as well.
feature_columns = df_pyspark.columns

# Stage definitions: pack all columns into one vector, then scale that vector.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)

# The scaler is fitted on, and applied to, the same assembled frame.
assembled = assembler.transform(df_pyspark)
scaler_model = scaler.fit(assembled)

# Keep only the scaled vector column from here on.
df_pyspark = scaler_model.transform(assembled).select("scaled_features")

df_pyspark.show(truncate=False)


+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|scaled_features                                                                                                                                                                                                                                   |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[-1.7764924456907198,0.686546923842498,-0.03483677923496167,-1.908597432470319,-1.2776859296489929,-1.2560462896342774,-0.2971862303521698,-0.35012291530432504,0.4603362691941659,-1.0847567187456257,-0.38510495194384187,-0.2143669443913471]  |
|[-1.776492445690719

In [185]:
# Convert the scaled feature vectors into a pandas DataFrame with one named column per
# vector component. toPandas() collects every row to the driver.
#
# Deliberate choices:
#   * column labels come from `feature_columns` (the list given to the VectorAssembler as
#     inputCols), so they cannot drift out of sync with the vector order the way a
#     hand-copied list of names can;
#   * the frame is built directly from the vectors, with no placeholder "feature_<i>"
#     columns that are created only to be renamed;
#   * an empty result produces an empty, correctly labelled frame instead of failing on a
#     first-row lookup.
scaled_vectors = df_pyspark.select("scaled_features").toPandas()["scaled_features"]

# Vector.toArray() returns a NumPy array, so each vector becomes one row of floats.
pandas_df = pd.DataFrame(
    [vector.toArray() for vector in scaled_vectors],
    index=scaled_vectors.index,
    columns=feature_columns,
)

pandas_df

Unnamed: 0,week,city_code,region_code,op_area,checkout_price,base_price,emailer_for_promotion,homepage_featured,center_type_index,category_index,cuisine_index,num_orders
0,-1.776492,0.686547,-0.034837,-1.908597,-1.277686,-1.256046,-0.297186,-0.350123,0.460336,-1.084757,-0.385105,-0.214367
1,-1.776492,0.686547,-0.034837,-1.908597,-1.277686,-1.358463,-0.297186,-0.350123,0.460336,-1.084757,-0.385105,0.020527
2,-1.776492,0.686547,-0.034837,-1.908597,-1.290567,-1.358276,-0.297186,-0.350123,0.460336,-1.084757,-0.385105,-0.184058
3,-1.776492,0.686547,-0.034837,-1.908597,0.047477,0.518762,-0.297186,-0.350123,0.460336,-1.084757,0.515251,-0.525034
4,-1.776492,0.686547,-0.034837,-1.908597,-0.580222,-0.694745,-0.297186,-0.350123,0.460336,-1.084757,0.515251,-0.560394
...,...,...,...,...,...,...,...,...,...,...,...,...
456543,1.691302,-1.942014,1.155551,0.381437,0.992882,0.808466,-0.297186,-0.350123,-0.784669,0.442681,0.515251,-0.489673
456544,1.691302,-1.942014,1.155551,0.381437,0.979805,0.796022,-0.297186,-0.350123,-0.784669,0.442681,0.515251,-0.555343
456545,1.691302,-1.942014,1.155551,0.381437,-0.618276,-0.205870,-0.297186,-0.350123,-0.784669,0.697254,-1.285461,0.603974
456546,1.691302,-1.942014,1.155551,0.381437,-0.580222,-0.253968,-0.297186,-0.350123,-0.784669,0.697254,-1.285461,1.179844
