In [1]:
import numpy as np
import pandas as pd
from scipy import stats

# Read the red-wine quality CSV from the local data directory.
wine_data = pd.read_csv("../data/winequality-red.csv")

# Every column is kept. To restrict the analysis, subset here instead, e.g.:
#   wine_data = wine_data[['fixed acidity', 'volatile acidity', 'quality']]
# Sanity-check the load by printing, in order: the first 5 rows, the
# per-column dtypes (mostly float/int), and the shape (1599 rows, 12 columns).
for preview in (wine_data.head(5), wine_data.dtypes, wine_data.shape):
    print(preview)

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [2]:
# Count missing values per column; any column containing NaN shows a non-zero count.
print(wine_data.isnull().sum())

# Impute missing 'quality' scores with the column median instead of a literal 0.
# 0 is not a value this column takes (the recorded statistics show a median of 6
# and a range of 5, so the minimum is at least 1); filling with it would pull the
# mean down and inflate the variance and range computed later in the notebook.
# The median is computed with NaNs skipped. When nothing is missing this is a
# no-op; if the whole column were NaN the median is NaN and nothing is filled.
quality_median = wine_data['quality'].median()
wine_data['quality'] = wine_data['quality'].fillna(quality_median)

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [3]:
# Descriptive statistics for one column; change `column` to analyse another one.
column = 'quality'
column_values = wine_data[column]

# Central tendency: mean, median, and mode (most frequent value).
data_mean = np.mean(column_values)
data_median = np.median(column_values)
data_mode = stats.mode(column_values)  # result object; the modal value is its .mode field

# Spread. ddof=0 is NumPy's default, spelled out here: the divisor is N, i.e.
# these are the population variance and population standard deviation.
data_variance = np.var(column_values, ddof=0)
data_sd = np.std(column_values, ddof=0)
data_max = np.max(column_values)
data_min = np.min(column_values)
data_range = data_max - data_min  # range = max - min

# Position: 60th percentile, third quartile (75%), and inter-quartile range.
data_percentile = np.percentile(column_values, 60)
data_quartile = np.quantile(column_values, 0.75)
data_IQR = stats.iqr(column_values)

# Report every statistic in a fixed order, one "label value" pair per line.
for stat_label, stat_value in (
    ("Mean:", data_mean),
    ("Median:", data_median),
    ("Mode:", data_mode.mode),
    ("Variance:", data_variance),
    ("Standard Deviation:", data_sd),
    ("Range:", data_range),
    ("60th Percentile:", data_percentile),
    ("75th Quartile:", data_quartile),
    ("IQR:", data_IQR),
):
    print(stat_label, stat_value)

Mean: 5.6360225140712945
Median: 6.0
Mode: 5
Variance: 0.6517605398308234
Standard Deviation: 0.8073168769639486
Range: 5
60th Percentile: 6.0
75th Quartile: 6.0
IQR: 1.0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
summary_table = wine_data.describe()
print(summary_table)

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         