# 와인 품질 분류 모델 만들기
데이터 출처 : https://www.kaggle.com/datasets/taweilo/wine-quality-dataset-balanced-classification

In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("wine_quality_model")
    .getOrCreate()
)

In [2]:
# Local-filesystem URI of the wine-quality CSV used throughout the notebook.
file_path = 'file:////home/jovyan/work/start_spark/learning_spark_data/wine_data.csv'
# Bare expression so the notebook echoes the path as the cell output.
file_path

'file:////home/jovyan/work/start_spark/learning_spark_data/wine_data.csv'

In [16]:
# Read the CSV, treating the first line as column names and letting Spark
# infer each column's type, then print the schema to check the result.
df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)
df.printSchema()

root
 |-- fixed_acidity: double (nullable = true)
 |-- volatile_acidity: double (nullable = true)
 |-- citric_acid: double (nullable = true)
 |-- residual_sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free_sulfur_dioxide: double (nullable = true)
 |-- total_sulfur_dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [4]:
# Preview the first five rows of the raw data.
df.show(5)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed_acidity|volatile_acidity|citric_acid|residual_sugar|chlorides|free_sulfur_dioxide|total_sulfur_dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|         11.6|            0.58|       0.66|           2.2|    0.074|               10.0|                47.0| 1.0008|3.25|     0.57|    9.0|      3|
|         10.4|            0.61|       0.49|           2.1|      0.2|                5.0|                16.0| 0.9994|3.16|     0.63|    8.4|      3|
|          7.4|           1.185|        0.0|          4.25|    0.097|                5.0|                14.0| 0.9966|3.63|     0.54|   10.7|      3|
|         10.4|            0.44|       0.42|           1.5|    0.145|               34.0|           

In [5]:
# Count the missing values (NULL or NaN) in every column to see whether
# any cleaning is needed before modelling.
#
# Spark's `sum` is imported under an alias: importing it as plain `sum`
# would replace the Python builtin `sum` for the rest of the notebook.
from pyspark.sql.functions import col, when, isnan
from pyspark.sql.functions import sum as spark_sum

null_counts = df.select(
    [
        # 1 for each missing value, 0 otherwise; the sum is the missing count.
        spark_sum(when(col(c).isNull() | isnan(c), 1).otherwise(0)).alias(c)
        for c in df.columns
    ]
)
null_counts.show()

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+---+---------+-------+-------+
|fixed_acidity|volatile_acidity|citric_acid|residual_sugar|chlorides|free_sulfur_dioxide|total_sulfur_dioxide|density| pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+---+---------+-------+-------+
|            0|               0|          0|             0|        0|                  0|                   0|      0|  0|        0|      0|      0|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+---+---------+-------+-------+



In [8]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler

In [9]:
# 80/20 train/test split; the fixed seed keeps the split reproducible.
train_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=64)

In [7]:
# Pipeline stages are accumulated in this list by the following cells.
stages = []

In [20]:
# Feature columns are every column except the label `quality`.
# The label is excluded by name rather than by popping the last position,
# so a different column order in the CSV cannot leak the label into the
# feature list.
df_c = [c for c in df.columns if c != 'quality']
# Bare expression so the notebook echoes the feature list.
df_c

['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

In [21]:
# Every feature column is numeric, so each one gets the same treatment:
# wrap the column in a one-element vector, then scale that vector with a
# StandardScaler left at its default settings.
# NOTE: `stages` is extended in place, so re-running this cell without
# resetting `stages` first appends a second copy of every stage.
num_features = df_c
for num in num_features:
    vector_col = num + '_vector'
    num_assembler = VectorAssembler(inputCols=[num], outputCol=vector_col)
    num_scaler = StandardScaler(inputCol=vector_col, outputCol=num + '_scaled')
    stages.extend([num_assembler, num_scaler])
stages

[VectorAssembler_439d209c77a8,
 StandardScaler_fc973bb0a6cd,
 VectorAssembler_e4ce8b2bff3e,
 StandardScaler_29dd79f175fe,
 VectorAssembler_a95d92cc1a48,
 StandardScaler_5d9fc20e8cd7,
 VectorAssembler_e1b99ef2e119,
 StandardScaler_18c6276eb92e,
 VectorAssembler_6fcd1afef2ff,
 StandardScaler_14a3e9947f01,
 VectorAssembler_64d7d954a3ab,
 StandardScaler_e4322e8fa258,
 VectorAssembler_001ca225b02d,
 StandardScaler_c50e8e1d20b7,
 VectorAssembler_ac032890db49,
 StandardScaler_1d6f501a3558,
 VectorAssembler_64ba0204c4a4,
 StandardScaler_5bc4c4a002c2,
 VectorAssembler_2527aa90ac16,
 StandardScaler_ea8e815b964f,
 VectorAssembler_b0e43bc41d35,
 StandardScaler_18bd1af699a5]

In [22]:
# Names of the scaled columns produced by the per-feature scalers; these
# are the inputs of the final feature-vector assembler.
assembler_input = [f'{num}_scaled' for num in num_features]
assembler_input

['fixed_acidity_scaled',
 'volatile_acidity_scaled',
 'citric_acid_scaled',
 'residual_sugar_scaled',
 'chlorides_scaled',
 'free_sulfur_dioxide_scaled',
 'total_sulfur_dioxide_scaled',
 'density_scaled',
 'pH_scaled',
 'sulphates_scaled',
 'alcohol_scaled']

In [23]:
# Final stage: concatenate all scaled feature columns into the single
# `feature_vector` column that the classifier reads.
assembler = VectorAssembler(inputCols=assembler_input, outputCol='feature_vector')
stages.append(assembler)
stages

[VectorAssembler_439d209c77a8,
 StandardScaler_fc973bb0a6cd,
 VectorAssembler_e4ce8b2bff3e,
 StandardScaler_29dd79f175fe,
 VectorAssembler_a95d92cc1a48,
 StandardScaler_5d9fc20e8cd7,
 VectorAssembler_e1b99ef2e119,
 StandardScaler_18c6276eb92e,
 VectorAssembler_6fcd1afef2ff,
 StandardScaler_14a3e9947f01,
 VectorAssembler_64d7d954a3ab,
 StandardScaler_e4322e8fa258,
 VectorAssembler_001ca225b02d,
 StandardScaler_c50e8e1d20b7,
 VectorAssembler_ac032890db49,
 StandardScaler_1d6f501a3558,
 VectorAssembler_64ba0204c4a4,
 StandardScaler_5bc4c4a002c2,
 VectorAssembler_2527aa90ac16,
 StandardScaler_ea8e815b964f,
 VectorAssembler_b0e43bc41d35,
 StandardScaler_18bd1af699a5,
 VectorAssembler_700d359f0ceb]

In [24]:
from pyspark.ml import Pipeline
# Chain all preprocessing stages into one pipeline.
pipeline = Pipeline(stages=stages)
# Fit on the training split only; the fitted pipeline is reused later to
# transform the test split.
fitted_transform = pipeline.fit(train_data)
# Apply the fitted preprocessing to the training split.
vtrain_data = fitted_transform.transform(train_data)
# Print the schema to see the columns added by the pipeline stages.
vtrain_data.printSchema()

root
 |-- fixed_acidity: double (nullable = true)
 |-- volatile_acidity: double (nullable = true)
 |-- citric_acid: double (nullable = true)
 |-- residual_sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free_sulfur_dioxide: double (nullable = true)
 |-- total_sulfur_dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)
 |-- fixed_acidity_vector: vector (nullable = true)
 |-- fixed_acidity_scaled: vector (nullable = true)
 |-- volatile_acidity_vector: vector (nullable = true)
 |-- volatile_acidity_scaled: vector (nullable = true)
 |-- citric_acid_vector: vector (nullable = true)
 |-- citric_acid_scaled: vector (nullable = true)
 |-- residual_sugar_vector: vector (nullable = true)
 |-- residual_sugar_scaled: vector (nullable = true)
 |-- chlorides_vector: vector (nullable = true)
 |-- chl

In [25]:
# Preview the assembled feature vector next to the label for three rows.
vtrain_data.select('feature_vector', 'quality').show(3)

+--------------------+-------+
|      feature_vector|quality|
+--------------------+-------+
|[1.57187400796699...|      6|
|[1.61323911343980...|      8|
|[1.73733442985825...|      7|
+--------------------+-------+
only showing top 3 rows



In [32]:
# Logistic regression on the assembled feature vector, using the integer
# `quality` column as the label.
lr = LogisticRegression(
    labelCol='quality',
    featuresCol='feature_vector',
)
lr_model = lr.fit(vtrain_data)

In [33]:
# Transform the test data with the pipeline that was fitted on the training data
vtest_data = fitted_transform.transform(test_data)
# Predict on the transformed test data
pred = lr_model.transform(vtest_data)

In [34]:
# Compare the true label with the predicted label for the first five test rows.
pred.select('quality', 'prediction').show(5)

+-------+----------+
|quality|prediction|
+-------+----------+
|      5|       5.0|
|      6|       6.0|
|      7|       7.0|
|      6|       7.0|
|      4|       6.0|
+-------+----------+
only showing top 5 rows



In [36]:
from pyspark.sql.functions import expr

# Flag each test row: 1 when the prediction equals the true label, else 0.
comp = pred.withColumn(
    'correct',
    expr('case when quality = prediction then 1 else 0 end'),
)
# Number of misclassified test rows (echoed as the cell output).
comp.filter('correct=0').count()

2948

In [37]:
# Accuracy is the mean of the 0/1 `correct` flag over the test rows.
accuracy_row = comp.selectExpr('avg(correct) as accuracy').collect()[0]
accuracy_row['accuracy']

0.301090564248459

In [40]:
# `expr` is imported here as well so this cell does not depend on an
# earlier cell having been run.
from pyspark.sql.functions import col, avg, stddev, expr

# Z-score outlier experiment.
# For each feature in turn: keep only the rows whose value lies within
# `threshold` standard deviations of that feature's mean, re-split the data,
# re-fit the preprocessing pipeline and the logistic regression, and print
# the test accuracy. Every iteration starts again from the full `df`, so
# only one feature is filtered at a time.
# NOTE(review): the mean/stddev are computed on the full dataset and the
# filter is applied before the train/test split, so test rows are filtered
# too - confirm this is the intended evaluation protocol.

# 2. Z-score threshold (identical for every feature, so it is set once)
threshold = 3.0

# fit() returns a new model on every call, so a single Pipeline and a single
# LogisticRegression estimator are reused across iterations.
pipeline = Pipeline(stages=stages)
lr = LogisticRegression(featuresCol='feature_vector', labelCol='quality')

for num in num_features:
    # 1. Mean and standard deviation of this feature, fetched in one Spark job
    mean_val, stddev_val = df.select(avg(num), stddev(num)).collect()[0]

    # 3. Remove outliers (both bounds inclusive)
    df_no_outliers_zscore = df.filter(
        (col(num) >= (mean_val - threshold * stddev_val)) &
        (col(num) <= (mean_val + threshold * stddev_val))
    )
    train_data, test_data = df_no_outliers_zscore.randomSplit([0.8,0.2], seed=64)

    fitted_transform = pipeline.fit(train_data)
    vtrain_data = fitted_transform.transform(train_data)
    lr_model = lr.fit(vtrain_data)
    # Transform the test data with the pipeline fitted on the training data
    vtest_data = fitted_transform.transform(test_data)
    # Predict on the test data
    pred = lr_model.transform(vtest_data)
    # 1 when the prediction equals the true label, else 0; the mean of this
    # flag is the accuracy. (The discarded misclassification count that used
    # to run here as an extra Spark job has been removed.)
    comp = pred.withColumn('correct', expr('case when quality = prediction then 1 else 0 end'))
    acc = comp.selectExpr('avg(correct) as accuracy').collect()[0]['accuracy']
    print(f'{num} : {acc}')

fixed_acidity : 0.301090564248459
volatile_acidity : 0.301090564248459
citric_acid : 0.301090564248459
residual_sugar : 0.301090564248459
chlorides : 0.30369843527738266
free_sulfur_dioxide : 0.301090564248459
total_sulfur_dioxide : 0.301090564248459
density : 0.301090564248459
pH : 0.2917063870352717
sulphates : 0.301090564248459
alcohol : 0.3013276434329066


In [41]:
# IQR outlier experiment.
# For each feature in turn: keep only the rows whose value lies within
# 1.5 * IQR of that feature's quartiles, re-split the data, re-fit the
# preprocessing pipeline and the logistic regression, and print the test
# accuracy. Every iteration starts again from the full `df`, so only one
# feature is filtered at a time.
# NOTE(review): the quartiles are computed on the full dataset and the
# filter is applied before the train/test split, so test rows are filtered
# too - confirm this is the intended evaluation protocol.

# fit() returns a new model on every call, so a single Pipeline and a single
# LogisticRegression estimator are reused across iterations.
pipeline = Pipeline(stages=stages)
lr = LogisticRegression(featuresCol='feature_vector', labelCol='quality')

for num in num_features:
    # 1. Approximate first and third quartiles (relative error 0.01)
    quantiles = df.approxQuantile(num, [0.25, 0.75], 0.01)
    Q1, Q3 = quantiles

    # 2. Compute the IQR
    IQR = Q3 - Q1

    # 3. Set the bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_no_outliers = df.filter((col(num) >= lower_bound) & (col(num) <= upper_bound))

    train_data, test_data = df_no_outliers.randomSplit([0.8,0.2], seed=64)

    fitted_transform = pipeline.fit(train_data)
    vtrain_data = fitted_transform.transform(train_data)
    lr_model = lr.fit(vtrain_data)
    # Transform the test data with the pipeline fitted on the training data
    vtest_data = fitted_transform.transform(test_data)
    # Predict on the test data
    pred = lr_model.transform(vtest_data)
    # 1 when the prediction equals the true label, else 0; the mean of this
    # flag is the accuracy. (The discarded misclassification count that used
    # to run here as an extra Spark job has been removed.)
    comp = pred.withColumn('correct', expr('case when quality = prediction then 1 else 0 end'))
    acc = comp.selectExpr('avg(correct) as accuracy').collect()[0]['accuracy']
    print(f'{num} : {acc}')

fixed_acidity : 0.301090564248459
volatile_acidity : 0.301090564248459
citric_acid : 0.301090564248459
residual_sugar : 0.301090564248459
chlorides : 0.301090564248459
free_sulfur_dioxide : 0.301090564248459
total_sulfur_dioxide : 0.301090564248459
density : 0.301090564248459
pH : 0.3104106972301815
sulphates : 0.301090564248459
alcohol : 0.3013276434329066
