In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score



In [2]:
# fetch data for red wines
file = "winequality-red.csv"
df = pd.read_csv(file, delimiter=';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


### Preprocessing the data

In [5]:
#converting 'quality' column from 'int' to 'float' datatype
df['quality'] = df['quality'].astype(dtype = 'float64')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   float64
dtypes: float64(12)
memory usage: 150.0 KB


In [7]:
# Separate features and target
X = df.drop(columns=['quality']) # Features
y = df['quality'] # Target variable

### Building a model

In [8]:
# split the data into training and testing sets at 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training linear regression model

In [9]:
# Initialize the linear regression model and train it
model = LinearRegression()
model.fit(X_train, y_train)

### Regression equation

The coefficients indicate the impact of each variable on wine quality. Positive coefficients suggest a positive correlation, while negative coefficients suggest a negative correlation, i.e. whether quality goes up or down when a given variable increases.

In [10]:
# Regression equation coefficients
coefficients = model.coef_
intercept = model.intercept_

# Display the regression equation
print("Regression equation for estimating 'quality': ")
regression_equation = f"quality = {intercept:.2f}"
for i in range(len(coefficients)):
    regression_equation += f" + {coefficients[i]:.2f} * {X.columns[i]}"
print(regression_equation)

Regression equation for estimating 'quality': 
quality = 14.36 + 0.02 * fixed acidity + -1.00 * volatile acidity + -0.14 * citric acid + 0.01 * residual sugar + -1.81 * chlorides + 0.01 * free sulfur dioxide + -0.00 * total sulfur dioxide + -10.35 * density + -0.39 * pH + 0.84 * sulphates + 0.28 * alcohol


#### Validating the equation with the first 10 wines

In [11]:
# Selecting features for the first 10 wines
first_10 = X.head(10)

# Calculating predicted quality for each of the first 10 wines
predicted_quality = []
for i in range(len(first_10)):
    prediction = intercept
    for j in range(len(coefficients)):
        prediction += coefficients[j] * first_10.iloc[i, j]
    predicted_quality.append(prediction)

# Printing predicted quality and real quality for the first 10 wines
for i in range(len(predicted_quality)):
    print("Wine", i+1, ": Predicted vs Real Quality: {:.2f}/{:.2f}\n".format(predicted_quality[i], y.head(i+1).values[-1]))

Wine 1 : Predicted vs Real Quality: 5.05/5.00

Wine 2 : Predicted vs Real Quality: 5.15/5.00

Wine 3 : Predicted vs Real Quality: 5.21/5.00

Wine 4 : Predicted vs Real Quality: 5.68/6.00

Wine 5 : Predicted vs Real Quality: 5.05/5.00

Wine 6 : Predicted vs Real Quality: 5.08/5.00

Wine 7 : Predicted vs Real Quality: 5.11/5.00

Wine 8 : Predicted vs Real Quality: 5.36/7.00

Wine 9 : Predicted vs Real Quality: 5.33/7.00

Wine 10 : Predicted vs Real Quality: 5.60/5.00



### Top 5 Variables

In [12]:
# Get absolute coefficients and corresponding variable names and combine them
coefficients_abs = abs(coefficients)
variable_names = X.columns
coefficients_df = pd.DataFrame({'Variable': variable_names, 'Coefficient': coefficients_abs})

# Sort by coefficient magnitude in descending order
coefficients_df = coefficients_df.sort_values(by='Coefficient', ascending=False)

# Get the top 5 most useful variables
top_5_variables = coefficients_df.head(5)
print("\nTop 5 most useful variables for estimating wine quality:")
print(top_5_variables)


Top 5 most useful variables for estimating wine quality:
           Variable  Coefficient
7           density    10.351594
4         chlorides     1.806503
1  volatile acidity     1.001304
9         sulphates     0.841172
8                pH     0.393688


### Validation-based error estimate

In [13]:
# Make predictions on the testing set
y_pred = model.predict(X_test)
# Calculate the mean squared error and r2s
mse = mean_squared_error(y_test, y_pred)
print(f"mse: {mse:.2f}")
r2s = r2_score(y_test, y_pred)
print(f"r2s: {r2s:.2f}")

mse: 0.39
r2s: 0.40


We observe a high r2 score. The model seems to fit the observed data well.