In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the Zoo CSV into a DataFrame.
# NOTE(review): this location ("DATA-SCIENCE", no "KNN" sub-folder, lower-case
# "zoo.csv") is spelled differently from the path used by the later loading
# cells in this notebook — confirm which one is the real file.
file_path = r"D:\DATA-SCIENCE\ASSIGNMENTS\16 KNN\zoo.csv"
zoo_df = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
import pandas as pd

# Load the dataset from the working directory and take a first look at it.
df = pd.read_csv("Zoo.csv")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()


  animal name  hair  feathers  eggs  milk  airborne  aquatic  predator  \
0    aardvark     1         0     0     1         0        0         1   
1    antelope     1         0     0     1         0        0         0   
2        bass     0         0     1     0         0        1         1   
3        bear     1         0     0     1         0        0         1   
4        boar     1         0     0     1         0        0         1   

   toothed  backbone  breathes  venomous  fins  legs  tail  domestic  catsize  \
0        1         1         1         0     0     4     0         0        1   
1        1         1         1         0     0     4     1         0        1   
2        1         1         0         0     1     0     1         0        0   
3        1         1         1         0     0     4     0         0        1   
4        1         1         1         0     0     4     1         0        1   

   type  
0     1  
1     1  
2     4  
3     1  
4     1  
<class '

In [None]:
# Drop the non-numeric column.
# errors="ignore" keeps this cell re-runnable: zoo_df is reassigned here, so a
# second execution would otherwise raise KeyError because the column is gone.
zoo_df = zoo_df.drop(columns=["animal name"], errors="ignore")

In [None]:
# 1. Heatmap of the pairwise correlations between the numeric columns
corr_matrix = zoo_df.corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# 2. Pairplot limited to a handful of columns so the grid stays readable;
#    points are coloured by the target column 'type'.
selected_features = ["hair", "feathers", "eggs", "milk", "aquatic", "legs", "type"]
sns.pairplot(
    zoo_df[selected_features],
    hue="type",
    palette="viridis",
    diag_kind="hist",
)
plt.suptitle("Pairwise Relationships Between Key Features", y=1.02)
plt.show()

In [None]:
# 3. How many animals have each number of legs
plt.figure(figsize=(8, 5))
legs_ax = sns.countplot(data=zoo_df, x="legs", palette="magma")
legs_ax.set_title("Distribution of Number of Legs")
legs_ax.set_xlabel("Legs")
legs_ax.set_ylabel("Count")
plt.show()

In [None]:
# 4. Count of animals per type, split by the 'hair' flag
plt.figure(figsize=(8, 5))
hair_ax = sns.countplot(data=zoo_df, x="type", hue="hair", palette="Set2")
hair_ax.set_title("Presence of Hair Across Animal Types")
hair_ax.set_xlabel("Animal Type")
hair_ax.set_ylabel("Count")
hair_ax.legend(title="Hair")
plt.show()

Step 2: Data Preprocessing

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the Zoo CSV from its local Windows location into a DataFrame
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\16 KNN\KNN\Zoo.csv"
zoo_df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Drop non-numeric / irrelevant column.
# errors="ignore" makes the cell idempotent: re-running it after the column has
# already been removed no longer raises KeyError.
zoo_df = zoo_df.drop(columns=["animal name"], errors="ignore")

In [None]:
# Boxplot of the 'legs' column as a visual outlier check
legs_values = zoo_df["legs"]
plt.figure(figsize=(6, 4))
outlier_ax = sns.boxplot(x=legs_values)
outlier_ax.set_title("Outlier Check — Legs Feature")
plt.show()

In [None]:
# Target is the 'type' column; every remaining column is a feature
y = zoo_df["type"]
X = zoo_df.drop("type", axis=1)

In [None]:
# Split into training and test sets (80-20, stratified) BEFORE scaling.
# Fitting StandardScaler on the full dataset would let the test rows influence
# the mean/std used to scale the training rows (data leakage), so the scaler is
# fitted on the training rows only and then applied to the held-out rows.
# random_state and stratify=y are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features (important for KNN)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics
X_scaled = scaler.transform(X)               # full matrix, kept for cells that read X_scaled

In [None]:
# Report the shape of each split
for split_label, split_features in (
    ("Training set size:", X_train),
    ("Testing set size:", X_test),
):
    print(split_label, split_features.shape)

Step 3: Splitting the dataset into training and testing sets (80–20)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the Zoo CSV from its local Windows location into a DataFrame
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\16 KNN\KNN\Zoo.csv"
zoo_df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Drop irrelevant column.
# errors="ignore" makes the cell idempotent: re-running it after the column has
# already been removed no longer raises KeyError.
zoo_df = zoo_df.drop(columns=["animal name"], errors="ignore")

In [None]:
# Target is the 'type' column; every remaining column is a feature
y = zoo_df["type"]
X = zoo_df.drop("type", axis=1)

In [None]:
# Split dataset — 80% train, 20% test with stratified sampling.
# The split is done BEFORE standardizing: fitting the scaler on all rows would
# leak test-set statistics (mean/std) into the training features.
# random_state and stratify=y are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features (important for KNN), using training-set statistics only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on training rows
X_test = scaler.transform(X_test_raw)        # apply the same transform to the test rows
X_scaled = scaler.transform(X)               # full matrix, kept for cells that read X_scaled

In [None]:
# Per-class share of each split (fractions rounded to two decimals)
train_class_share = y_train.value_counts(normalize=True).round(2)
test_class_share = y_test.value_counts(normalize=True).round(2)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)
print("\nClass distribution in Training set:\n", train_class_share)
print("\nClass distribution in Testing set:\n", test_class_share)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the Zoo CSV from its local Windows location into a DataFrame
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\16 KNN\KNN\Zoo.csv"
zoo_df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Collect the first rows, the structural summary and the basic statistics.
import io

zoo_head = zoo_df.head()

# DataFrame.info() prints its report and returns None, so assigning its return
# value would leave zoo_info empty. Capture the report through `buf` instead,
# then echo it so the cell still shows the same text.
_info_buffer = io.StringIO()
zoo_df.info(buf=_info_buffer)
zoo_info = _info_buffer.getvalue()
print(zoo_info, end="")

zoo_description = zoo_df.describe()

In [None]:
# Bar chart of how many animals fall into each target class
plt.figure(figsize=(8, 5))
type_ax = sns.countplot(data=zoo_df, x="type", palette="viridis")
type_ax.set_title("Distribution of Animal Types")
type_ax.set_xlabel("Animal Type")
type_ax.set_ylabel("Count")
plt.show()

In [None]:
# Bare tuple expression: the notebook shows the first rows and the summary statistics as this cell's output.
zoo_head, zoo_description

Step 2: Data Preprocessing for Zoo dataset
- Checks for missing values
- Visualizes possible outliers (legs)
- Drops irrelevant columns
- Scales features for KNN
- Splits data (80% train / 20% test) with stratification

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib   # optional: to save the scaler

In [None]:
# -------------------------
# 1. Candidate dataset locations: the local Windows path is preferred,
#    /mnt/data is the fallback
# -------------------------
user_path, fallback_path = (
    r"D:\DATA SCIENCE\ASSIGNMENTS\16 KNN\KNN\Zoo.csv",
    "/mnt/data/Zoo.csv",
)

In [None]:
# Use the first candidate path that exists on disk; fail loudly if neither does.
for candidate_path in (user_path, fallback_path):
    if os.path.exists(candidate_path):
        file_path = candidate_path
        break
else:
    # Loop finished without `break`: no candidate exists.
    raise FileNotFoundError(
        f"Zoo.csv not found at either '{user_path}' or '{fallback_path}'. "
        "Put the file in one of these paths or update file_path variable."
    )

In [None]:
# Read the CSV and print a short overview: source path, shape, column names, first rows
df = pd.read_csv(filepath_or_buffer=file_path)
column_names = df.columns.tolist()
first_rows = df.head(5)

print(f"Loaded file: {file_path}")
print("Shape:", df.shape)
print("\nColumns:", column_names)
print("\nFirst 5 rows:\n", first_rows)

In [None]:
# -------------------------
# 2. Count the missing entries in every column
# -------------------------
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

In [None]:
# -------------------------
# 3. Remove the 'animal name' column when it is present
# -------------------------
has_name_column = "animal name" in df.columns
if has_name_column:
    df = df.drop("animal name", axis=1)
    print("\nDropped column: 'animal name'")

In [None]:
# -------------------------
# 4. Descriptive statistics (the 'legs' outlier plots follow in the next cells)
# -------------------------
summary_stats = df.describe()
print("\nSummary statistics for numeric features:\n", summary_stats)

In [None]:
# Boxplot of the 'legs' column
legs_column = df["legs"]
plt.figure(figsize=(6, 3))
box_ax = sns.boxplot(x=legs_column)
box_ax.set_title("Boxplot — legs")
box_ax.set_xlabel("Number of legs")
plt.tight_layout()
plt.show()

In [None]:
# Frequency of each 'legs' value
plt.figure(figsize=(8, 4))
count_ax = sns.countplot(data=df, x="legs")
count_ax.set_title("Countplot — legs distribution")
count_ax.set_xlabel("Legs")
count_ax.set_ylabel("Count")
plt.tight_layout()
plt.show()

NOTE: legs values like 0 and 8 are biologically valid (snakes, arachnids), so we keep them.

In [None]:
# -------------------------
# 5. Split features and target — first make sure the target column exists
# -------------------------
target_present = "type" in df.columns
if not target_present:
    raise KeyError("'type' column (target) not found in dataset. Make sure file contains target column named 'type'.")

In [None]:
# Target is the 'type' column; every remaining column is a feature
y = df["type"]
X = df.drop("type", axis=1)

In [None]:
# Shapes of X and y, plus the number of rows per class ordered by class label
class_counts = y.value_counts().sort_index()

print("\nFeature matrix shape:", X.shape)
print("Target vector shape:", y.shape)
print("\nTarget class distribution:\n", class_counts)

In [None]:
# -------------------------
# 6-7. Train-test split (80% train, 20% test, stratified), then feature scaling
# -------------------------
# The split is done BEFORE scaling: fitting StandardScaler on every row would
# let the test rows influence the mean/std applied to the training rows (data
# leakage). random_state and stratify=y are unchanged, so the same rows land in
# each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling (StandardScaler), fitted on the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics
X_scaled = scaler.transform(X)               # full matrix, kept for cells that read X_scaled

# Optional: save scaler for later (useful when deploying).
# The persisted scaler is the training-fitted one, i.e. the one the model data was built with.
scaler_outpath = os.path.join(os.path.dirname(file_path), "zoo_scaler.joblib")
joblib.dump(scaler, scaler_outpath)
print(f"\nStandardScaler saved to: {scaler_outpath}")

In [None]:
# Split shapes followed by the per-class row counts of each target split
print("\nAfter splitting:")
print("X_train:", X_train.shape, "X_test:", X_test.shape)
for heading, target_split in (
    ("y_train distribution:\n", y_train),
    ("y_test distribution:\n", y_test),
):
    print(heading, pd.Series(target_split).value_counts().sort_index())

In [None]:
# -------------------------
# 8. (Optional) Persist the train/test arrays as one compressed .npz next to the CSV
# -------------------------
import numpy as np

out_dir = os.path.dirname(file_path)
npz_path = os.path.join(out_dir, "zoo_knn_data.npz")
np.savez_compressed(
    npz_path,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train.to_numpy(),
    y_test=y_test.to_numpy(),
)
print(f"\nSaved processed arrays to: {npz_path}")

End of Step 2 preprocessing script

Step 5: Choosing an appropriate distance metric and value for K

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Read the CSV, discard the name column, then separate target and features
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\16 KNN\KNN\Zoo.csv"
df = pd.read_csv(file_path).drop(columns=["animal name"])
y = df["type"]
X = df.drop("type", axis=1)

In [None]:
# Stratified 80/20 split of the unscaled features (scaling happens inside the pipeline)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Two-step pipeline: standardize the features, then classify with KNN
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier()),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Hyper-parameter grid for the "knn" pipeline step:
#   n_neighbors -> K from 1 to 20
#   p           -> 1 (Manhattan) or 2 (Euclidean)
#   weights     -> both weighting schemes
param_grid = dict(
    knn__n_neighbors=[k for k in range(1, 21)],
    knn__p=[1, 2],
    knn__weights=["uniform", "distance"],
)

In [None]:
# 5-fold cross-validated grid search over the pipeline, scored by accuracy,
# using all available cores
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its mean cross-validation accuracy
best_params, best_cv_score = grid.best_params_, grid.best_score_
print("Best parameters:", best_params)
print("Best cross-validation accuracy: {:.4f}".format(best_cv_score))

In [None]:
# Score the refitted best pipeline on the held-out test rows
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
test_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test set accuracy: {:.4f}".format(test_acc))

In [None]:
# Visualize K vs accuracy for each distance metric and weighting scheme.
# The grid search also varies `knn__weights`, so cv_results_ holds more than one
# row per (p, K) pair. Filtering on `p` alone would put several points on every K
# and the line would zig-zag between weighting schemes, so each
# (metric, weights) combination is drawn as its own curve.
results = pd.DataFrame(grid.cv_results_)
plt.figure(figsize=(8, 5))
for p_val, label in zip([1, 2], ['Manhattan (p=1)', 'Euclidean (p=2)']):
    for weight_scheme in results['param_knn__weights'].unique():
        is_selected = (
            (results['param_knn__p'] == p_val)
            & (results['param_knn__weights'] == weight_scheme)
        )
        # Cast K to int and sort so the curve is drawn left-to-right.
        subset = (
            results[is_selected]
            .assign(k=lambda frame: frame['param_knn__n_neighbors'].astype(int))
            .sort_values('k')
        )
        plt.plot(
            subset['k'],
            subset['mean_test_score'],
            marker='o',
            label=f"{label}, weights={weight_scheme}",
        )
plt.xlabel("Number of Neighbours (K)")
plt.ylabel("Mean CV Accuracy")
plt.title("KNN Performance across K values and Distance Metrics")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

Step 6: Evaluate the KNN Classifier’s Performance

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import joblib

In [None]:
# Re-read the CSV for this step and discard the name column
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\16 KNN\KNN\Zoo.csv"
df = pd.read_csv(file_path).drop(columns=["animal name"])

In [None]:
# Target is the 'type' column; every remaining column is a feature
y = df["type"]
X = df.drop("type", axis=1)

In [None]:
# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Fit the scaler on the training rows only, apply it to both splits, then train
# KNN with hard-coded settings (K=5, p=2, distance weighting).
# NOTE(review): these values are described as the best parameters from the
# previous step — confirm they match grid.best_params_.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
knn = KNeighborsClassifier(weights='distance', p=2, n_neighbors=5)
knn.fit(X_train_scaled, y_train)

In [None]:
# Class predictions for the scaled held-out rows
y_pred = knn.predict(X=X_test_scaled)

In [None]:
# Accuracy plus support-weighted precision / recall / F1.
# zero_division=0 scores a class with no predicted (or true) samples as 0.
weighted_avg = {"average": "weighted", "zero_division": 0}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)

In [None]:
# Show the four headline metrics, one per line, to four decimals
print(
    f"Accuracy : {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall   : {recall:.4f}",
    f"F1-Score : {f1:.4f}\n",
    sep="\n",
)

In [None]:
# Per-class precision / recall / F1 table
report_text = classification_report(y_test, y_pred, zero_division=0)
print("Detailed Classification Report:", report_text, sep="\n")

In [None]:
# Confusion matrix
# One explicit label list drives both the matrix and the tick labels. Without
# `labels=`, confusion_matrix() only includes classes that occur in y_test or
# y_pred, so a class absent from the test split would make the matrix smaller
# than (and misaligned with) tick labels taken from the full target `y`.
class_labels = np.unique(y)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', xticklabels=class_labels, yticklabels=class_labels)
plt.title("Confusion Matrix — KNN Classifier on Zoo Dataset")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

Step 7: Visualize decision boundaries using PCA (2D projection)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Read the CSV and discard the name column
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\16 KNN\KNN\Zoo.csv"
df = pd.read_csv(file_path).drop(columns=["animal name"])

In [None]:
# Target is the 'type' column; every remaining column is a feature
y = df["type"]
X = df.drop("type", axis=1)

In [None]:
# Stratified 80/20 split with a fixed seed.
# NOTE(review): the remaining cells of this step fit on the full X / y and do
# not reference these four variables — confirm whether the split is needed here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Standardize every feature column using statistics of the full feature matrix
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Project the standardized features onto two principal components for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X=X_scaled)

In [None]:
# KNN (K=5, p=2, distance weighting) trained on the 2-D PCA coordinates of all rows
knn = KNeighborsClassifier(weights='distance', p=2, n_neighbors=5)
knn.fit(X_pca, y)

In [None]:
# 400 x 400 grid covering the PCA plane, padded by 1 unit on every side
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
x_min = pc1.min() - 1
x_max = pc1.max() + 1
y_min = pc2.min() - 1
y_max = pc2.max() + 1
grid_x = np.linspace(x_min, x_max, 400)
grid_y = np.linspace(y_min, y_max, 400)
xx, yy = np.meshgrid(grid_x, grid_y)

In [None]:
# Classify every grid point, then fold the flat predictions back into the grid's shape
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = knn.predict(grid_points).reshape(xx.shape)

In [None]:
# Plot decision boundaries
# Regions and points must share one colour per class. contourf(cmap='tab10')
# maps the numeric class values continuously over the colormap, while the
# scatterplot assigns the first N tab10 colours categorically, so the two would
# disagree. Instead, predictions are converted to class indices 0..N-1 and each
# index band is filled with the same colour list the scatterplot uses.
class_labels = np.unique(y)
class_colors = sns.color_palette('tab10', n_colors=len(class_labels))
Z_index = np.searchsorted(class_labels, Z)               # class label -> position in class_labels
region_levels = np.arange(len(class_labels) + 1) - 0.5   # one band per class index

plt.figure(figsize=(10, 7))
plt.contourf(xx, yy, Z_index, levels=region_levels, colors=class_colors, alpha=0.3)
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=y,
    hue_order=list(class_labels),
    palette=class_colors,
    edgecolor='k',
    s=60,
)
plt.title("Decision Boundaries of KNN Classifier (PCA 2D Projection)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
# Anchor the legend's upper-left corner just outside the axes; loc='best' has
# no meaningful search area when bbox_to_anchor is a single point.
plt.legend(title="Animal Type", loc='upper left', bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()