In [2]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
import pandas as pd
import xgboost as xgb
import pickle

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("WineQualityPrediction") \
    .getOrCreate()

# Load the dataset
df = spark.read.csv("/content/winequality.csv", header=True, inferSchema=True)

# Data Cleaning
df = df.fillna(0)  # Replace missing values with 0 (or use mean if needed)
df = df.withColumn("best_quality", when(col("quality") > 5, 1).otherwise(0))
df = df.drop("total sulfur dioxide")  # Drop unnecessary column

# Feature Assembly
feature_columns = [col for col in df.columns if col not in ("quality", "best_quality")]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)

# Scaling Features
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(df)
scaled_data = scaler_model.transform(df)

# Select relevant columns
scaled_data = scaled_data.select(col("scaled_features").alias("features"), col("best_quality"))

# Train-Test Split
train_df, test_df = scaled_data.randomSplit([0.8, 0.2], seed=40)

# Convert to Pandas for XGBoost compatibility
train_pd = train_df.select("features", "best_quality").toPandas()
test_pd = test_df.select("features", "best_quality").toPandas()

# Prepare data for XGBoost
X_train = pd.DataFrame(train_pd["features"].tolist(), columns=feature_columns)
y_train = train_pd["best_quality"]

X_test = pd.DataFrame(test_pd["features"].tolist(), columns=feature_columns)
y_test = test_pd["best_quality"]

# Train XGBoost Classifier
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

# Evaluate the Model
from sklearn.metrics import classification_report, confusion_matrix

train_pred = xgb_model.predict(X_train)
test_pred = xgb_model.predict(X_test)

print("Classification Report (Test Data):\n", classification_report(y_test, test_pred))

# Save the model as a pickle file
with open("xgb_classifier.pkl", "wb") as file:
    pickle.dump(xgb_model, file)

print("Model saved as 'xgb_classifier.pkl'")


Classification Report (Test Data):
               precision    recall  f1-score   support

           0       0.76      0.77      0.77       104
           1       0.81      0.81      0.81       129

    accuracy                           0.79       233
   macro avg       0.79      0.79      0.79       233
weighted avg       0.79      0.79      0.79       233

Model saved as 'xgb_classifier.pkl'
