In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, VectorAssembler
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from delta import *


In [None]:
# Build a Delta-enabled Spark session.
#
# "spark.jars.packages" is deliberately not configured here:
# configure_spark_with_delta_pip() sets that option itself, pointing it at the
# io.delta:delta-spark artifact that matches the pip-installed delta-spark
# version, so any value put on the builder beforehand is overwritten. The
# coordinate that used to be hard-coded ("io.delta:delta-core_2.12:3.1.0") was
# also stale -- from Delta 3.0 onwards the artifact is named "delta-spark".
builder = (
    SparkSession.builder
    .appName("Gold Layer Processing")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [None]:
# Read the four silver-layer Delta tables this notebook builds on.
# The generator is consumed by the tuple unpacking, so the tables are opened
# in the listed order: business, reviews, users, check-in metrics.
businesses_silver, reviews_silver, users_silver, checkins_silver = (
    spark.read.format("delta").load(silver_path)
    for silver_path in (
        "D:/Project/delta_lake/silver/business",
        "D:/Project/delta_lake/silver/reviews",
        "D:/Project/delta_lake/silver/users",
        "D:/Project/delta_lake/silver/checkins_metrics",
    )
)


In [None]:
def create_business_features(df):
    """Project a business DataFrame down to the columns used as features.

    Args:
        df: DataFrame exposing the business columns listed below.

    Returns:
        The result of ``df.select`` on those columns, in the listed order.
    """
    business_columns = (
        "business_id",
        "stars",
        "review_count",
        "categories_array",
        "price_range",
        "business_status",
        "rating_category",
    )
    return df.select(*business_columns)

def create_user_features(df):
    """Project a user DataFrame down to the columns used as features.

    Args:
        df: DataFrame exposing the user columns listed below.

    Returns:
        The result of ``df.select`` on those columns, in the listed order.
    """
    user_columns = (
        "user_id",
        "review_count",
        "average_stars",
        "fans",
        "engagement_score",
        "user_status",
        "rating_behavior",
    )
    return df.select(*user_columns)

def create_review_features(df):
    """Project a review DataFrame down to the columns used as features.

    Args:
        df: DataFrame exposing the review columns listed below.

    Returns:
        The result of ``df.select`` on those columns, in the listed order.
    """
    review_columns = (
        "review_id",
        "user_id",
        "business_id",
        "stars",
        "text_length",
        "total_votes",
        "rating_category",
    )
    return df.select(*review_columns)

# Narrow each silver table to its feature columns.
business_features = create_business_features(df=businesses_silver)
user_features = create_user_features(df=users_silver)
review_features = create_review_features(df=reviews_silver)


In [None]:
def process_tfidf_features(business_df, num_features=100):
    """Add TF-IDF vectors computed from the ``categories_array`` column.

    The categories are hashed into a term-frequency vector (``tf_features``)
    with ``HashingTF``; an ``IDF`` model is then fitted on those vectors and
    used to produce ``tfidf_features``.

    Args:
        business_df: DataFrame with a ``categories_array`` column
            (presumably an array of category strings -- confirm against the
            silver business table).
        num_features: Size of the hashed term-frequency vector. Defaults to
            100, the value that was previously hard-coded.

    Returns:
        ``business_df`` with ``tf_features`` and ``tfidf_features`` added.
    """
    # The notebook rebinds `business_features` to this function's output, so
    # re-running that cell passes in a frame that already carries both output
    # columns, and HashingTF refuses to overwrite an existing output column.
    # DataFrame.drop ignores names that are not present, so the first run is
    # unaffected.
    base_df = business_df.drop("tf_features", "tfidf_features")

    hashing_tf = HashingTF(
        inputCol="categories_array",
        outputCol="tf_features",
        numFeatures=num_features,
    )
    tf_df = hashing_tf.transform(base_df)

    # IDF is an estimator: fitting it scans tf_df to collect document frequencies.
    idf_model = IDF(inputCol="tf_features", outputCol="tfidf_features").fit(tf_df)
    return idf_model.transform(tf_df)

# Extend the business feature table with the TF-IDF category vectors.
business_features = process_tfidf_features(business_df=business_features)


In [None]:
def create_training_dataset(reviews_df, business_features_df, user_features_df):
    """Join review, business and user features and assemble a feature vector.

    Spark joins do not add pandas-style ``_x`` / ``_y`` suffixes, so columns
    that share a name across the inputs are renamed up front:

    * business ``stars``           -> ``business_stars``
    * business ``review_count``    -> ``business_review_count``
    * business ``rating_category`` -> ``business_rating_category``
    * user ``review_count``        -> ``user_review_count``

    Columns coming from ``reviews_df`` keep their original names.

    Args:
        reviews_df: Review features; must contain ``business_id`` and ``user_id``.
        business_features_df: Business features keyed by ``business_id``,
            including ``tfidf_features``.
        user_features_df: User features keyed by ``user_id``.

    Returns:
        The inner join of the three inputs with an added ``features`` vector
        column built from business stars, business review count, user fans,
        user engagement score and the TF-IDF category vector.
    """
    # withColumnRenamed is a no-op when the column is absent, so inputs that
    # lack one of these columns pass through unchanged.
    business_renamed = (
        business_features_df
        .withColumnRenamed("stars", "business_stars")
        .withColumnRenamed("review_count", "business_review_count")
        .withColumnRenamed("rating_category", "business_rating_category")
    )
    user_renamed = user_features_df.withColumnRenamed("review_count", "user_review_count")

    # Joining on a column name keeps a single copy of the key column.
    training_df = (
        reviews_df
        .join(business_renamed, "business_id")
        .join(user_renamed, "user_id")
    )

    # NOTE(review): the earlier version referenced "stars_x" and
    # "review_count_x", which never exist after a Spark join. They are mapped
    # to the business-level columns here; the review's own `stars` is left out
    # of the vector on the assumption that it is the prediction target --
    # confirm this matches the intended feature set.
    assembler = VectorAssembler(
        inputCols=[
            "business_stars",
            "business_review_count",
            "fans",
            "engagement_score",
            "tfidf_features",
        ],
        outputCol="features",
    )

    return assembler.transform(training_df)

# Combine the three feature tables into one frame with a `features` vector.
training_data = create_training_dataset(
    reviews_df=review_features,
    business_features_df=business_features,
    user_features_df=user_features,
)


In [None]:
def index_categorical_columns(df, categorical_cols=None):
    """Append a ``<column>_indexed`` column for each categorical column.

    All columns are handled by one multi-column ``StringIndexer``, so the data
    is scanned once when fitting instead of once per column.

    Args:
        df: DataFrame containing the categorical columns.
        categorical_cols: Names of the columns to index. Defaults to
            ``price_range``, ``business_status``, ``user_status`` and
            ``rating_behavior``.

    Returns:
        ``df`` with one ``<column>_indexed`` column added per input column;
        ``df`` itself when there are no columns to index.
    """
    if categorical_cols is None:
        categorical_cols = ["price_range", "business_status", "user_status", "rating_behavior"]

    input_cols = list(categorical_cols)
    if not input_cols:
        # Nothing to index; skip building an indexer with no input columns.
        return df

    indexer = StringIndexer(
        inputCols=input_cols,
        outputCols=[f"{name}_indexed" for name in input_cols],
    )
    return indexer.fit(df).transform(df)

# Add the "<column>_indexed" columns for the categorical features.
training_data = index_categorical_columns(df=training_data)


In [None]:
def prepare_train_test_sets(df):
    """Randomly split ``df`` into train and test portions with weights 0.8 / 0.2.

    Args:
        df: DataFrame to split.

    Returns:
        A ``(train, test)`` tuple from ``df.randomSplit`` called with seed 42.
    """
    split_weights = [0.8, 0.2]
    partitions = df.randomSplit(split_weights, seed=42)
    train_part, test_part = partitions
    return train_part, test_part

# Split the assembled dataset into train and test portions.
train_data, test_data = prepare_train_test_sets(df=training_data)


In [None]:
# Persist the gold-layer outputs as Delta tables, overwriting existing data:
# the train/test splits first, then the business and user feature tables.
gold_outputs = (
    (train_data, "D:/Project/delta_lake/gold/train_data"),
    (test_data, "D:/Project/delta_lake/gold/test_data"),
    (business_features, "D:/Project/delta_lake/gold/business_features"),
    (user_features, "D:/Project/delta_lake/gold/user_features"),
)

for gold_df, gold_path in gold_outputs:
    gold_df.write.format("delta").mode("overwrite").save(gold_path)
