In [2]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


**Importing Libraries**

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


# UCI Wine Dataset

Data_url': 'https://archive.ics.uci.edu/static/public/109/data.csv

ðŸ§ª Objective

- Classify wines into 3 classes based on their chemical properties.

- Each class represents a different cultivar (grape variety).

**Dataset Summary**

| Property           | Value                           |
| ------------------ | ------------------------------- |
| **Samples**        | 178 wine samples                |
| **Features**       | 13 numeric attributes           |
| **Target Classes** | 3 (Cultivars: class 1, 2, or 3) |
| **Missing Values** | None                            |
| **Data Type**      | All continuous numeric (float)  |


In [4]:
wine=fetch_ucirepo(id=109)

featuresData=wine.data.features #all features
print(featuresData)

labelData=wine.data.targets #basically class which is 1,2,3
print(labelData)

print(wine.variables)
print(wine.metadata)

     Alcohol  Malicacid   Ash  Alcalinity_of_ash  Magnesium  Total_phenols  \
0      14.23       1.71  2.43               15.6        127           2.80   
1      13.20       1.78  2.14               11.2        100           2.65   
2      13.16       2.36  2.67               18.6        101           2.80   
3      14.37       1.95  2.50               16.8        113           3.85   
4      13.24       2.59  2.87               21.0        118           2.80   
..       ...        ...   ...                ...        ...            ...   
173    13.71       5.65  2.45               20.5         95           1.68   
174    13.40       3.91  2.48               23.0        102           1.80   
175    13.27       4.28  2.26               20.0        120           1.59   
176    13.17       2.59  2.37               20.0        120           1.65   
177    14.13       4.10  2.74               24.5         96           2.05   

     Flavanoids  Nonflavanoid_phenols  Proanthocyanins  Color_i

In [5]:
print(featuresData.shape)
print(labelData.shape)

(178, 13)
(178, 1)


In [6]:
wineDF=pd.concat([featuresData,labelData],axis=1)

In [7]:
wineDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Alcohol                       178 non-null    float64
 1   Malicacid                     178 non-null    float64
 2   Ash                           178 non-null    float64
 3   Alcalinity_of_ash             178 non-null    float64
 4   Magnesium                     178 non-null    int64  
 5   Total_phenols                 178 non-null    float64
 6   Flavanoids                    178 non-null    float64
 7   Nonflavanoid_phenols          178 non-null    float64
 8   Proanthocyanins               178 non-null    float64
 9   Color_intensity               178 non-null    float64
 10  Hue                           178 non-null    float64
 11  0D280_0D315_of_diluted_wines  178 non-null    float64
 12  Proline                       178 non-null    int64  
 13  class

In [8]:
wineDF.describe()

Unnamed: 0,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline,class
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,1.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,1.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,1.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,2.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,3.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,3.0


In [9]:
wineDF=shuffle(wineDF,random_state=5)

In [10]:
wineDF.head()

Unnamed: 0,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline,class
28,13.87,1.9,2.8,19.4,107,2.95,2.97,0.37,1.76,4.5,1.25,3.4,915,1
66,13.11,1.01,1.7,15.0,78,2.98,3.18,0.26,2.28,5.3,1.12,3.18,502,2
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
17,13.83,1.57,2.62,20.0,115,2.95,3.4,0.4,1.72,6.6,1.13,2.57,1130,1
138,13.49,3.59,2.19,19.5,88,1.62,0.48,0.58,0.88,5.7,0.81,1.82,580,3


In [11]:
wineDF['class']=wineDF['class'].astype(str)

In [12]:
import altair as alt
alt.Chart(wineDF).mark_circle(size=50).encode(
    x="Alcohol:Q",
    y='Color_intensity:Q',
    color='class:N',
    tooltip=['Alcohol','Color_intensity','class']
).interactive().properties(
    title="Alcohol Vs Color_intensity by class"
)

In [13]:
alt.Chart(wineDF).mark_bar(opacity=0.7).encode(
    alt.X('Flavanoids:Q', bin=alt.Bin(maxbins=30)),
    alt.Y('count()'),
    alt.Color('class:N')
).properties(
    title='Flavanoids Distribution by Wine Class'
)


In [14]:
alt.Chart(wineDF).mark_boxplot().encode(
    x='class:N',
    y='Malicacid:Q',
    color='class:N'
).properties(
    title='Malic Acid Distribution by Wine Class'
)


In [15]:
from sklearn.decomposition import PCA

# PCA for dimensionality reduction
pca = PCA(n_components=2)
X_pca = pca.fit_transform(featuresData)
pcaDF = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
pcaDF['class'] = labelData.reset_index(drop=True)

# PCA Scatter Plot
pca_scatter = alt.Chart(pcaDF).mark_circle().encode(
    x='PC1',
    y='PC2',
    color='class',
    tooltip=['PC1', 'PC2', 'class']
).interactive()
pca_scatter.show()

**Preprocessing and Feature-Target Split**

In [16]:
X_train, X_test, y_train, y_test = train_test_split(featuresData, labelData, test_size=0.2, random_state=42)

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Split and scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [18]:
# Individual classifiers
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
lr = LogisticRegression(max_iter=2000, random_state=42)
svm = SVC(probability=True, kernel='rbf', random_state=42)  # Enable probability for soft voting
nb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)


In [19]:
# Voting Classifier - use 'soft' voting since all support predict_proba
voting_clf = VotingClassifier(
    estimators=[
        ('dt', dt),
        ('rf', rf),
        ('lr', lr),
        ('svm', svm),
        ('nb', nb),
        ('knn', knn),
        ('gb', gb)
    ],
    voting='soft'  # switch to 'hard' if needed
)


In [20]:
# Fit
voting_clf.fit(X_train_scaled, y_train)

# Predict
y_pred = voting_clf.predict(X_test_scaled)

# Evaluation
print("Voting Classifier Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Voting Classifier Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# List of models
models = [
    ('Decision Tree', dt),
    ('Random Forest', rf),
    ('Logistic Regression', lr),
    ('SVM', svm),
    ('Naive Bayes', nb),
    ('KNN', knn),
    ('Gradient Boosting', gb)
]

# Results storage
metrics = []

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    metrics.append((name, acc, prec, rec, f1))

# Display results
import pandas as pd

metrics_df = pd.DataFrame(metrics, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])
print(metrics_df.sort_values(by='Accuracy', ascending=False).to_string(index=False))

  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)


              Model  Accuracy  Precision   Recall  F1-Score
      Random Forest  1.000000   1.000000 1.000000  1.000000
Logistic Regression  1.000000   1.000000 1.000000  1.000000
        Naive Bayes  1.000000   1.000000 1.000000  1.000000
      Decision Tree  0.944444   0.946296 0.944444  0.943997
  Gradient Boosting  0.944444   0.946296 0.944444  0.943997
                SVM  0.805556   0.801058 0.805556  0.802427
                KNN  0.722222   0.722222 0.722222  0.722222
