# In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load the wine quality dataset.
# The path is relative, so 'WineQT.csv' is resolved against the current
# working directory.
wine_data = pd.read_csv('WineQT.csv')

# Step 2: Separate the predictors from the target ('quality').
# NOTE(review): the publicly distributed WineQT.csv appears to include an 'Id'
# row-identifier column -- confirm against the actual file. A row identifier
# is not a property of the wine, so it is removed from the predictors when
# present; errors='ignore' makes that second drop a no-op for files without
# the column. The first drop stays strict so a missing 'quality' column still
# raises KeyError.
X = wine_data.drop(columns='quality').drop(columns='Id', errors='ignore')
y = wine_data['quality']

# Step 3: Hold out 20% of the rows for testing; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Fit an ordinary least-squares linear regression on the training rows
model = LinearRegression()
model.fit(X_train, y_train)

# Step 5: Predict quality for the held-out rows
predictions = model.predict(X_test)

# Step 6: Mean 'quality' for every distinct value of the 'alcohol' column,
# returned as a two-column frame ('alcohol', 'quality')
average_quality = wine_data.groupby('alcohol', as_index=False)['quality'].mean()

# Step 7: Draw the bar chart -- one bar per distinct alcohol value
plt.figure(figsize=(12, 8))
ax = sns.barplot(data=average_quality, x='alcohol', y='quality')
ax.set_title('Average Wine Quality by Alcohol Level', fontsize=16)
ax.set_xlabel('Alcohol Level', fontsize=14)
ax.set_ylabel('Average Quality', fontsize=14)

# Tilt the x tick labels by 45 degrees for readability and use a 12 pt font
# for the tick labels on both axes
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)

# Step 8: Report the fitted parameters and the test-set error metrics.
# Each (label, value) pair is printed on its own line, in this order.
summary = [
    ("Intercept:", model.intercept_),
    ("Coefficients:", model.coef_),
    ("Mean Squared Error (MSE):", mean_squared_error(y_test, predictions)),
    ("R-squared:", r2_score(y_test, predictions)),
]

print("Linear Regression Results:")
for label, value in summary:
    print(label, value)

# Render the figure prepared above
plt.show()
