In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/seaborn.
warnings.simplefilter("ignore")

In [None]:
# Load the red-wine quality CSV into the notebook-wide frame `df`.
# The path is an absolute Kaggle input location, so this cell only runs as-is
# inside a Kaggle kernel with that dataset attached.
df=pd.read_csv("/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")
# Preview the first five rows.
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Per-column dtype and non-null count, plus memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# How many rows carry each distinct quality score, most frequent first.
df["quality"].value_counts()

In [None]:
import seaborn as sns

# Bar chart of the number of rows per quality score; the labels are set on
# the Axes object that seaborn returns rather than through pyplot state.
quality_ax = sns.countplot(data=df, x='quality', palette="Set3")
quality_ax.set_title('Distribution of Wine Quality')
quality_ax.set_xlabel('Quality')
quality_ax.set_ylabel('Count')

plt.show()

In [None]:
# One 20-bin histogram per column of the frame, laid out on a 10x10-inch figure.
df.hist(bins=20, figsize=(10, 10))
plt.show()

In [None]:
# Pairwise correlation between all columns (pandas' default method).
correlation_matrix = df.corr()

# Open a 10x8-inch figure; the heatmap below draws onto it.
plt.figure(figsize=(10, 8))

# Annotated heatmap with a diverging palette centred on zero so that
# positive and negative correlations get opposite colours.
heatmap_ax = sns.heatmap(
    data=correlation_matrix,
    cmap='coolwarm',
    center=0,
    annot=True,
)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:

# Grid of scatter plots for every pair of columns, with a kernel-density
# estimate of each column on the diagonal.
sns.pairplot(df, diag_kind="kde")  # Create the pair plot

plt.show() 

In [None]:
# Remove the 'total sulfur dioxide' column from the working frame.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
# NOTE(review): the reason for dropping this feature is not stated in the
# notebook — presumably redundancy with 'free sulfur dioxide'; confirm.
df = df.drop(columns=['total sulfur dioxide'], errors='ignore')

In [None]:
# Display the current state of the working frame.
df

In [None]:
# Violin plot: distribution of alcohol content for each wine-quality score.
plt.figure(figsize=(10, 6))
violin_ax = sns.violinplot(data=df, x="quality", y="alcohol")
violin_ax.set_title("Violin Plot of Alcohol Content by Wine Quality")
violin_ax.set_xlabel("Quality")
violin_ax.set_ylabel("Alcohol Content")
plt.show()

In [None]:
# Binary target: 1 when the quality score is strictly above 5, otherwise 0.
# The comparison is vectorised; the cast keeps the column as int64.
df['best_quality'] = (df['quality'] > 5).astype('int64')
df.head()

In [None]:
plt.figure(figsize=(15, 8))
sns.set(style="whitegrid")  # Optional: Set the style of the plots

# Column used to split each density into two curves. It must match the name
# created earlier in the notebook ('best_quality', with an underscore); the
# previous spelling 'best quality' does not exist in the frame and made
# seaborn raise.
hue_column = "best_quality"

# Plot every column except the hue column itself: a density of the target
# split by the target is degenerate (each group has zero variance).
plot_columns = [column for column in df.columns if column != hue_column]

# Size the grid from the number of panels instead of hard-coding 4x3.
n_cols = 3
n_rows = -(-len(plot_columns) // n_cols)  # ceiling division

# One panel per column; common_norm=False normalises each class separately so
# the two curves are comparable despite different class sizes.
for position, column in enumerate(plot_columns, start=1):
    ax = plt.subplot(n_rows, n_cols, position)
    sns.kdeplot(data=df, x=column, hue=hue_column, fill=True, common_norm=False, ax=ax)
plt.tight_layout()
plt.show()

In [None]:
# Data type of each column in the working frame.
df.dtypes

In [None]:
# Preview the first five rows of the working frame.
df.head()

In [None]:
# Pairwise correlation matrix of the working frame's columns, shown as a table.
corr=df.corr()
corr

In [None]:
# Correlation of every column with the binary target, kept as a one-column
# DataFrame (double brackets select a frame rather than a Series).
quality_corr = df.corr()[['best_quality']]
quality_corr

In [None]:
# The chart is about predictors only, so drop the target's self-correlation
# (always 1.0) and the raw 'quality' score the target was derived from; both
# are also excluded from the model's feature matrix later in the notebook.
predictor_corr = quality_corr.drop(index=['quality', 'best_quality'], errors='ignore')

plt.figure(figsize=(15,5))
plt.title('correlation of target feature with predictor features')
# Horizontal bars: one per predictor, length = correlation with best_quality.
sns.barplot(data=predictor_corr, y=predictor_corr.index, x='best_quality')
# Render explicitly, like the other plotting cells, so the Axes repr is not
# echoed as cell output.
plt.show()

In [None]:
# Preview the first five rows of the working frame.
df.head()

In [None]:
# Feature matrix: every column except the raw score and the label derived from it.
x = df.drop(columns=['quality', 'best_quality'])
# Target vector: the binary best_quality label.
y = df['best_quality']
 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(x, y,test_size=0.2, random_state=10)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply the
# same scaling to both splits so no test-set statistics leak into training.
mm = MinMaxScaler().fit(X_train)
X_train = mm.transform(X_train)
X_test = mm.transform(X_test)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score



# Baseline classifiers to compare, each with library-default hyperparameters.
classifiers = [
    ("Logistic Regression", LogisticRegression()),
    ("Support Vector Machine", SVC()),
    ("Naive Bayes",GaussianNB())
]

# Collect one record per model and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
results = []

# Fit each classifier on the training split and score it on the held-out split.
for algorithm, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({'Algorithm': algorithm, 'Accuracy': accuracy})

# Explicit columns keep the expected layout even if `classifiers` is empty.
results_df = pd.DataFrame(results, columns=['Algorithm', 'Accuracy'])

# Print the results dataframe
print(results_df)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build a 100-tree forest with a fixed seed and train it in one expression
# (fit() returns the fitted estimator itself).
random_forest = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predicted best_quality labels for the held-out rows.
y_pred = random_forest.predict(X_test)

# Overall share of correct predictions on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 table, printed under its heading.
class_report = classification_report(y_test, y_pred)
print("Classification Report:", class_report, sep="\n")

In [None]:
# Counts of actual (rows) versus predicted (columns) labels on the test split.
conf_matrix = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(10, 8))
# fmt="d" prints the cell annotations as plain integers.
matrix_ax = sns.heatmap(data=conf_matrix, cmap="coolwarm", fmt="d", annot=True)
matrix_ax.set_title("Confusion Matrix")
matrix_ax.set_xlabel("Predicted")
matrix_ax.set_ylabel("Actual")
plt.show()