<a href="https://colab.research.google.com/github/ShashwathShinde6/Machine-Learning/blob/master/bml_lca_k_means_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **BML LCA-3: K-Means Clustering and KNN Classification on the Wine Dataset**

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from scipy.stats import mode
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Cluster the Wine dataset with KMeans and score the clusters against the
# known class labels.

# Number of clusters to fit; used both for KMeans and for the label-mapping
# loop below so the two can never drift apart.
N_CLUSTERS = 3

# Load the Wine dataset
data = load_wine()
X = data.data
y = data.target

# Standardize the features (zero mean, unit variance per column)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 2-component PCA projection of the standardized features.
# NOTE: this projection is NOT what KMeans is fitted on below; it is only
# computed and stored in X_pca.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Apply KMeans clustering on the standardized (not PCA-reduced) features.
# n_init is passed explicitly because its default differs between
# scikit-learn releases; pinning it keeps the result reproducible across
# versions and avoids the FutureWarning older releases emit.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Map predicted clusters to true labels using mode:
# every sample in a cluster receives that cluster's most common true label.
labels = np.zeros_like(clusters)
for cluster_id in range(N_CLUSTERS):
    mask = (clusters == cluster_id)
    # Skip a cluster with no members: the mode of an empty selection is
    # undefined and there is nothing to assign anyway.
    if mask.any():
        labels[mask] = mode(y[mask], keepdims=False).mode

# Calculate accuracy of the majority-label mapping against the true labels
accuracy = accuracy_score(y, labels)
print(f"Clustering Accuracy: {accuracy:.2f}")


Clustering Accuracy: 0.97


# Data Loading

In [None]:
# Import pandas
import pandas as pd

# Column names for the raw `wine.data` file. The file has no header row, so
# the names must be supplied explicitly. The first column holds the class
# label (1-3 in the raw file); the other 13 are the measured features, named
# here to match scikit-learn's `load_wine().feature_names`.
WINE_COLUMNS = [
    'class',
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
    'proanthocyanins',
    'color_intensity',
    'hue',
    'od280/od315_of_diluted_wines',
    'proline',
]

# Load the Wine dataset from a CSV file (replace with your file path).
# header=None stops pandas from consuming the first sample as column names,
# which would silently drop one row and leave the columns labelled with
# data values.
wine_data = pd.read_csv('/content/wine.data', header=None, names=WINE_COLUMNS)  # Replace with your file path

# Display the first few rows to check if data is loaded correctly
print("Wine Dataset (First 5 rows):")
print(wine_data.head())


Wine Dataset (First 5 rows):
   1  14.23  1.71  2.43  15.6  127   2.8  3.06   .28  2.29  5.64  1.04  3.92  \
0  1  13.20  1.78  2.14  11.2  100  2.65  2.76  0.26  1.28  4.38  1.05  3.40   
1  1  13.16  2.36  2.67  18.6  101  2.80  3.24  0.30  2.81  5.68  1.03  3.17   
2  1  14.37  1.95  2.50  16.8  113  3.85  3.49  0.24  2.18  7.80  0.86  3.45   
3  1  13.24  2.59  2.87  21.0  118  2.80  2.69  0.39  1.82  4.32  1.04  2.93   
4  1  14.20  1.76  2.45  15.2  112  3.27  3.39  0.34  1.97  6.75  1.05  2.85   

   1065  
0  1050  
1  1185  
2  1480  
3   735  
4  1450  


# Data Preprocessing

In [None]:
# Import the necessary libraries for exploration
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Explore the Wine dataset as a DataFrame and standardize its features.

# Load the dataset in this cell: `wine` is not assigned by any of the cells
# shown before this one (the first cell stores the dataset as `data`), so
# without this line the cell fails with NameError on a fresh kernel.
wine = load_wine()

# Convert the Wine dataset into a DataFrame for easier exploration
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names)

# Add the target to the DataFrame (class labels)
wine_df['target'] = wine.target

# Show basic statistics and data overview
print("Dataset Description (Summary statistics):")
print(wine_df.describe())  # Get the summary statistics of the data

# Check the shape of the data (rows and columns)
print("\nDataset Shape (rows, columns):")
print(wine_df.shape)

# Check for any missing values
print("\nCheck for Missing Values:")
print(wine_df.isnull().sum())  # This will show the count of missing values per column

# Check for any duplicate rows in the dataset
print("\nCheck for Duplicates:")
print(wine_df.duplicated().sum())  # This will show the number of duplicate rows

# Check basic info about the dataset
print("\nDataset Info:")
# info() writes its report directly and returns None, so it is called on its
# own; wrapping it in print() would add a stray "None" line to the output.
wine_df.info()  # This will give you the types of each column and memory usage

# Now, Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(wine_df.drop(columns='target'))  # Don't scale the target

# Show the first few scaled features
print("\nScaled Features (First 5 rows):")
print(X_scaled[:5])


Dataset Description (Summary statistics):
          alcohol  malic_acid         ash  alcalinity_of_ash   magnesium  \
count  178.000000  178.000000  178.000000         178.000000  178.000000   
mean    13.000618    2.336348    2.366517          19.494944   99.741573   
std      0.811827    1.117146    0.274344           3.339564   14.282484   
min     11.030000    0.740000    1.360000          10.600000   70.000000   
25%     12.362500    1.602500    2.210000          17.200000   88.000000   
50%     13.050000    1.865000    2.360000          19.500000   98.000000   
75%     13.677500    3.082500    2.557500          21.500000  107.000000   
max     14.830000    5.800000    3.230000          30.000000  162.000000   

       total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
count     178.000000  178.000000            178.000000       178.000000   
mean        2.295112    2.029270              0.361854         1.590899   
std         0.625851    0.998859              0.

# Train-Test Split

In [None]:
# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test).
# The raw (unscaled) splits are kept under *_raw names for reference.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the splits. The features are on very different scales, and the
# KNN model trained on X_train is distance-based, so unscaled columns with
# large magnitudes would dominate the neighbour search. The scaler is fitted
# on the training split only, so no statistics from the test split leak into
# training; a dedicated name is used so the notebook-level `scaler` is left
# untouched.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Print shapes to verify the split (scaling does not change the shapes)
print("\nTraining Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)



Training Data Shape: (142, 13)
Testing Data Shape: (36, 13)


# Model Training

In [None]:
# Import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Build a 5-nearest-neighbours classifier and train it on the training split
# in a single expression; fit() hands back the estimator itself, so `knn`
# is bound to the trained model.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Let the reader know that training has completed
print("KNN model has been trained.")


KNN model has been trained.
