In [None]:
import numpy as np 
import pandas as pd
import math

import plotly.graph_objs as go
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="darkgrid")
sns.set_palette(sns.color_palette("viridis"))

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, KFold

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [None]:


df = pd.read_csv('WineQT.csv')

# Normalise the column labels: lowercase, with spaces replaced by underscores.
df = df.rename(columns=lambda label: label.lower().replace(' ', '_'))
# Disabled step kept for reference (would shift the quality labels down by 3):
# df['quality'] = df['quality'].apply(lambda x: x-3)
df.head()

### EDA 

Missing Values


In [None]:
# Count the missing values in each column.
df.isna().sum()

1. No missing values

**Duplicated Values**

In [None]:
# Count rows that are duplicates of an earlier row once the 'id' column is excluded.
print("\nNumber of duplicated rows : ", df.drop(columns=['id']).duplicated().sum(),"\n")

In [None]:
# Columns compared when looking for duplicates ('id' is deliberately not part of the key).
dedup_key = [
    'fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
    'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
    'ph', 'sulphates', 'alcohol', 'quality',
]

# Keep the first occurrence of every duplicated measurement.
df = df.drop_duplicates(subset=dedup_key, keep='first')

2. About 10% of the rows were duplicates and have been removed

In [None]:
# Display the column labels of df.
df.columns

In [None]:
# Display the (rows, columns) shape of df.
df.shape

In [None]:
# Display the column labels of df.
df.columns

In [None]:
# Expected data types
expected_dtypes = {
    'fixed_acidity': 'float64',
    'volatile_acidity': 'float64',
    'citric_acid': 'float64',
    'residual_sugar': 'float64',
    'chlorides': 'float64',
    'free_sulfur_dioxide': 'float64',
    'total_sulfur_dioxide': 'float64',
    'density': 'float64',
    'ph': 'float64',
    'sulphates': 'float64',
    'alcohol': 'float64',
    'quality': 'int64',
    "id" : "int64"
}

# Check for incorrect data types.
# Columns without an entry in expected_dtypes are skipped rather than raising KeyError.
incorrect_dtypes = {
    col: dtype
    for col, dtype in df.dtypes.items()
    if col in expected_dtypes and dtype != expected_dtypes[col]
}

# Report what was actually found; prints None when every dtype already matches.
print("Incorrect df types:")
print(incorrect_dtypes or None)

# Convert to the correct df types if needed
print("Data is correct with following:")
for col in incorrect_dtypes:
    df[col] = df[col].astype(expected_dtypes[col])

# Verify the conversion
print(df.dtypes)

**Outliers**

In [None]:
features = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
           'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
           'ph', 'sulphates', 'alcohol', 'quality']

# One boxplot per column, laid out on a 3 x 4 grid (12 panels for 12 columns).
fig, axes = plt.subplots(3, 4, figsize=(12, 8))

for ax, feature in zip(axes.flat, features):
    sns.boxplot(data=df, y=feature, color='skyblue', width=0.5, ax=ax)
    ax.set_title(f'Boxplot of {feature}')
    ax.set_ylabel('')

fig.tight_layout()
plt.show()

In [None]:
# Rows that stay if residual_sugar is capped at 7 and chlorides at 0.4.
within_limits = (df['residual_sugar'] <= 7) & (df['chlorides'] <= 0.4)
kept_rows = len(df[within_limits])

# Share of the data (in percent) that falls outside those limits.
percent_to_drop = 100 - 100*kept_rows/len(df)

print(f"\nDropping selected outliers will result in loss of {percent_to_drop:.2f} % of data")

3. Outliers are present but seem to contain valuable information
Most outliers are in<br>
    1. residual_sugar<br>
    2. chlorides<br>
    3. sulphates<br>

**Target Distribution**

In [None]:
# Bar chart of how many wines fall into each quality score.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='quality', palette='Set2', ax=ax)
ax.set_title('Distribution of Wine Quality')
ax.set_xlabel('Wine Quality')
ax.set_ylabel('Count')
plt.show()

**Correlations**


In [None]:
features = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
            'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
            'ph', 'sulphates', 'alcohol', 'quality']

# Pairwise correlations between the listed columns (pandas default method).
correlation_data = df[features]
correlation_matrix = correlation_data.corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

1. No missing values

2. About 10% of the rows were duplicates and have been removed

3. Outliers are present but seem to contain valuable information
    Most outliers are in
       1. residual_sugar
       2. chlorides
       3. sulphates
We have removed a little bit of them that don't affect the quality with small number of data points:
* removed data points where df['residual_sugar'] > 7
* removed data points where df['chlorides'] > 0.4

4. Best Quality Wine (4 and 5, we've shifted quality to be from 0 to 5)
    Has highest
       1. alcohol
       2. citric_acid
       3. sulphates
    Has lowest
       1. volatile_acidity
       2. density
       3. pH
5. Dataset is unbalanced

Feature Engineering

In [None]:


# Collapse the sulfur-dioxide and acidity measurements into two summed columns,
# then drop the five source columns. The new columns end up last, in this order.
# NOTE(review): 'total_sulfur_dioxide' may already include the free fraction — confirm
# that adding 'free_sulfur_dioxide' on top of it is intended.
df = (
    df
    .assign(
        Total_sulphur_Dioxide=lambda d: d['free_sulfur_dioxide'] + d['total_sulfur_dioxide'],
        Acidity=lambda d: d['fixed_acidity'] + d['volatile_acidity'] + d['citric_acid'],
    )
    .drop(columns=['free_sulfur_dioxide', 'total_sulfur_dioxide',
                   'fixed_acidity', 'volatile_acidity', 'citric_acid'])
)

def categorize_sugar(sugar):
    """Bucket a residual-sugar value into "low", "medium" or "high".

    Args:
        sugar: numeric residual-sugar value.

    Returns:
        "low" when sugar < 1.5, "medium" when 1.5 <= sugar < 7, otherwise "high".
        A value that compares false to both bounds (e.g. NaN) is labelled "high".
    """
    if sugar < 1.5:
        return "low"
    # The lower bound is implied by the branch above, so a value of exactly 1.5
    # lands here instead of falling through to "high".
    elif sugar < 7:
        return "medium"
    else:
        return "high"
  
# Replace the numeric residual_sugar values with the labels returned by categorize_sugar.
df['residual_sugar'] = df['residual_sugar'].apply(categorize_sugar)

def categorize_pH(pH):
    """Map a pH value onto one of three labels.

    Returns "acidic" below 3, "neutral" from 3 to 4 inclusive, and "basic"
    for anything else (including values that fail both comparisons, e.g. NaN).
    """
    if pH < 3:
        return "acidic"
    if 3 <= pH <= 4:
        return "neutral"
    return "basic"

# Replace the numeric pH values with the labels returned by categorize_pH.
df['ph'] = df['ph'].apply(categorize_pH)

cate_cols = ['residual_sugar', 'ph']

# One-hot encode the two categorical columns.
df = pd.get_dummies(df, columns=cate_cols)

# Every indicator column the later cells expect. get_dummies only creates a column
# for labels that actually occur in the data, so a missing label (for example no
# sample labelled "basic") is added as an all-zero column instead of raising KeyError.
expected_dummy_cols = [
    "residual_sugar_high", "residual_sugar_low", "residual_sugar_medium",
    "ph_acidic", "ph_basic", "ph_neutral",
]
for dummy_col in expected_dummy_cols:
    if dummy_col not in df.columns:
        df[dummy_col] = 0
    # Normalise the indicator dtype to plain integers (0/1).
    df[dummy_col] = df[dummy_col].astype(int)


# Train test Split
# Features are every column except the target and the row identifier: 'id' carries
# no information about the wine and must not be offered to the models as a predictor.
# errors="ignore" keeps the cell working if 'id' has already been removed.
X = df.drop(columns=["quality", "id"], errors="ignore")
y = df['quality']
# 75 % / 25 % split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Preview the first three rows of df.
df.head(3)

In [None]:
# Features are every column except the target and the row identifier: 'id' carries
# no information about the wine and must not be offered to the models as a predictor.
# errors="ignore" keeps the cell working if 'id' has already been removed.
X = df.drop(columns=["quality", "id"], errors="ignore")
y = df['quality']
# 75 % / 25 % split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

Decision Tree

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [None]:
import warnings

# Suppress every warning from here on (this also hides scikit-learn
# convergence and deprecation warnings raised by later cells).
warnings.filterwarnings("ignore")

# Decision tree with default settings, used as the base estimator for the grid search.
treeclassifier = DecisionTreeClassifier()

Before Hyperparameter Tuning: 

In [None]:
# Baseline decision tree with default hyperparameters.
# A fixed seed makes the tree (tie-breaking between equally good splits) and
# therefore the reported accuracies reproducible across runs.
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_train, y_train)

# Make predictions on the test set
y_pred2 = clf2.predict(X_test)

# Calculate the test accuracy (scikit-learn convention: y_true first, y_pred second)
accuracy2 = accuracy_score(y_test, y_pred2)

# Calculate the training accuracy
train_accuracy2 = clf2.score(X_train, y_train)

print("Training Accuracy for DecisionTreeClassifier: ", train_accuracy2)
print("Test Accuracy for DecisionTreeClassifier: ", accuracy2)


After Hyperparameter Tuning + Best Parameters: 

In [None]:
# Hyperparameter grid for the decision tree.
parameters2 = {
    'criterion' : ['gini','entropy'],
    'splitter' : ['best','random'],
    'max_depth' : [1,2,3,4,5],
    # 'auto' was deprecated and later removed from scikit-learn (fits using it fail
    # on current versions); None means "consider all features" and is valid everywhere.
    'max_features' : [None,'sqrt','log2']
}

# 5-fold cross-validated grid search. The base estimator is seeded so that the
# randomised options (splitter='random', feature sub-sampling) score reproducibly.
search2 = GridSearchCV(treeclassifier.set_params(random_state=42),
                       param_grid=parameters2, cv=5, scoring='accuracy')
search2.fit(X_train, y_train)

# Extract the best parameters from the grid search
best_params2 = search2.best_params_

# GridSearchCV (refit=True by default) has already refitted the winning configuration
# on the full training set, so reuse that model rather than fitting an unseeded copy
# that could differ from the one that was selected.
clf2 = search2.best_estimator_

# Make predictions on the test set
y_pred2 = clf2.predict(X_test)

# Calculate the test accuracy (scikit-learn convention: y_true first, y_pred second)
accuracy2 = accuracy_score(y_test, y_pred2)

# Calculate the training accuracy
train_accuracy2 = clf2.score(X_train, y_train)

# Print the best parameters, training accuracy, and test accuracy
print("Best parameters for DecisionTreeClassifier: ", best_params2)
print("Training Accuracy for DecisionTreeClassifier: ", train_accuracy2)
print("Test Accuracy for DecisionTreeClassifier: ", accuracy2)


In [None]:
# Generate random data with the same shape and features as X_train
# Uniform random matrix in [0, 1) with the same (rows, columns) shape as X_train.
n_rows, n_cols = X_train.shape
X_train_random = np.random.rand(n_rows, n_cols)

# Only the first random row is used, wrapped in a list to form a one-sample batch.
random_sample = [X_train_random[0]]
print(f"Random Data: {random_sample}")

# Predict the class of that single random row with the decision tree.
y_pred_random = clf2.predict(random_sample)

print("Predictions for random data: ", y_pred_random)

**With the best parameters**

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

**Without Hyperparameter Tuning**

In [None]:
# Baseline random forest with default hyperparameters.
# The forest is built from random bootstrap samples and feature subsets, so a fixed
# seed is required for the reported accuracies to be reproducible.
clf3__ = RandomForestClassifier(random_state=42)
clf3__.fit(X_train,y_train)

# Accuracy on the data the model was trained on.
train_accuracy3 = clf3__.score(X_train, y_train)

# Accuracy on the held-out test set.
y_pred3 = clf3__.predict(X_test)
accuracy3 = accuracy_score(y_test,y_pred3)

print("Training Accuracy for RandomForestClassifier: ", train_accuracy3)

print("Testing Accuracy for RandomForestClassifier:  : " , accuracy3)


**With best parameters**

In [None]:
# Candidate hyperparameters for the random forest.
parameters3 = {
    'criterion' : ['gini','entropy'],
    'max_depth' : [1,2,3,4,5,6,7,8,9],
    'n_estimators' : [1,10,100,200,300,500,1000]
}

# Randomised search with 5-fold cross-validation. Both the forest and the search are
# seeded: the first fixes the bootstrap/feature sampling, the second fixes which
# parameter combinations are drawn, so the whole cell is reproducible.
clf3 = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                          param_distributions=parameters3,
                          scoring='accuracy', cv=5, verbose=3, random_state=42)

clf3.fit(X_train,y_train)

best_params3 = clf3.best_params_

# The search (refit=True by default) has already refitted the best configuration on
# the full training set; reuse it instead of training an unseeded copy.
clf3__ = clf3.best_estimator_

# Accuracy on the data the model was trained on.
train_accuracy3 = clf3__.score(X_train, y_train)

# Accuracy on the held-out test set.
y_pred3 = clf3__.predict(X_test)
accuracy3 = accuracy_score(y_test,y_pred3)

print("Best parameters for Random_Forest : ",best_params3)
print("Training Accuracy for RandomForestClassifier: ", train_accuracy3)

print("Testing Accuracy for RandomForestClassifier:  : " , accuracy3)


In [None]:
# Generate random data with the same shape and features as X_train
# Uniform random matrix in [0, 1) shaped like X_train.
X_train_random = np.random.rand(*X_train.shape)

# Take the first random row as a one-sample batch.
first_random_row = [X_train_random[0]]
print(f"Random Data: {first_random_row}")

# Predict the class of that row with the random forest.
y_pred_random = clf3__.predict(first_random_row)

print("Predictions for random data: ", y_pred_random)

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score
import warnings

Before Hyperparameter

In [None]:
# Baseline logistic regression with default settings (fit returns the estimator itself).
clf1 = LogisticRegression().fit(X_train, y_train)

# Accuracy on the training data.
train_accuracy2 = clf1.score(X_train, y_train)

# Predictions and accuracy on the held-out test set.
y_pred2 = clf1.predict(X_test)
accuracy2 = accuracy_score(y_pred2, y_test)

print("Training Accuracy for LogisticRegression: ", train_accuracy2)
print("Test Accuracy for LogisticRegression: ", accuracy2)

After Hyperparameter

In [None]:
# Regularisation strengths to try (C is the inverse of the regularisation strength).
c_values = [1, 5, 10, 20, 50, 75, 100]

# The grid is split per penalty so every combination is actually valid:
#   * 'l1' and 'elasticnet' are not supported by the default 'lbfgs' solver, so they
#     are paired with 'saga' (and elasticnet additionally needs an l1_ratio);
#   * "no penalty" is the Python value None, not the string 'None', and C is
#     irrelevant for it.
# In the previous single-dict grid those combinations failed to fit and were
# silently scored as NaN, so only 'l2' was ever really evaluated.
parameters1 = [
    {'penalty': ['l2'], 'solver': ['lbfgs'], 'C': c_values},
    {'penalty': ['l1'], 'solver': ['saga'], 'C': c_values},
    {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5], 'C': c_values},
    {'penalty': [None], 'solver': ['lbfgs']},
]

# 5-fold cross-validated grid search (default scoring for a classifier is accuracy).
# The estimator is seeded because the 'saga' solver shuffles the data.
search1 = GridSearchCV(LogisticRegression(random_state=42), param_grid=parameters1, cv=5)
search1.fit(X_train, y_train)

best_params = search1.best_params_

# Reuse the model the search refitted on the full training set; it carries *all*
# tuned parameters (solver, l1_ratio, ...), not just C and penalty.
clf1 = search1.best_estimator_

# Training and test accuracy of the final model.
train_accuracy1 = clf1.score(X_train, y_train)
y_pred1 = clf1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred1)
print("Training Accuracy for LogisticRegression: ", train_accuracy1)
print("Test Accuracy for LogisticRegression: ", accuracy)

In [None]:
# Generate random data with the same shape and features as X_train
# Uniform random matrix in [0, 1) with X_train's row and column counts.
row_count = X_train.shape[0]
feature_count = X_train.shape[1]
X_train_random = np.random.rand(row_count, feature_count)

# Use only the first random row, as a one-sample batch.
single_row_batch = [X_train_random[0]]
print(f"Random Data: {single_row_batch}")

# Predict the class of that row with the logistic regression model.
y_pred_random = clf1.predict(single_row_batch)

print("Predictions for random data: ", y_pred_random)