In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the Zoo CSV from its local path into a DataFrame
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\16 KNN\KNN\Zoo.csv"
zoo_df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Remove the "animal name" column; the rest of the frame is carried forward
zoo_df = zoo_df.drop("animal name", axis=1)

In [None]:
# 1. Correlation heatmap
# Compute the pairwise correlation matrix once, then draw it on an explicit axes.
corr_matrix = zoo_df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# 2. Pairplot restricted to a handful of columns to keep the grid readable
selected_features = ['hair', 'feathers', 'eggs', 'milk', 'aquatic', 'legs', 'type']
pairplot_data = zoo_df[selected_features]
sns.pairplot(data=pairplot_data, hue='type', palette='viridis', diag_kind="hist")
# y > 1 places the figure-level title just above the grid of subplots
plt.suptitle("Pairwise Relationships Between Key Features", y=1.02)
plt.show()

In [None]:
# 3. Distribution of 'legs'
# Bar chart of how many rows fall into each distinct value of the 'legs' column.
fig, ax = plt.subplots(figsize=(8, 5))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also assigned to `hue`. dodge=False keeps one full-width bar per
# category on seaborn versions that would otherwise dodge by hue level.
sns.countplot(data=zoo_df, x='legs', hue='legs', palette='magma', dodge=False, ax=ax)
# The hue legend only repeats the x tick labels, so drop it if one was created.
legs_legend = ax.get_legend()
if legs_legend is not None:
    legs_legend.remove()
ax.set_title('Distribution of Number of Legs')
ax.set_xlabel('Legs')
ax.set_ylabel('Count')
plt.show()

In [None]:
# 4. Presence of hair across animal types
# One group of bars per 'type' value, split by the 'hair' column.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=zoo_df, x='type', hue='hair', palette='Set2', ax=ax)
ax.set_title('Presence of Hair Across Animal Types')
ax.set_xlabel('Animal Type')
ax.set_ylabel('Count')
ax.legend(title='Hair')
plt.show()

Step 2: Data Preprocessing

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Re-read the Zoo CSV so this section starts from the unmodified file contents
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\16 KNN\KNN\Zoo.csv"
zoo_df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Remove the "animal name" column before any further processing
zoo_df = zoo_df.drop("animal name", axis=1)

In [None]:
# Box plot of the 'legs' column as a visual outlier check
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=zoo_df, x="legs", ax=ax)
ax.set_title("Outlier Check — Legs Feature")
plt.show()

In [None]:
# Target is the 'type' column; every other column is a feature
y = zoo_df["type"]
X = zoo_df.drop("type", axis=1)

In [None]:
# Scale features (important for KNN)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Split into training and test sets (80-20)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Report the (rows, columns) shape of each split
for split_label, split_data in (("Training set size:", X_train), ("Testing set size:", X_test)):
    print(split_label, split_data.shape)

Step 3: Splitting the dataset into training and testing sets (80–20)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Re-read the Zoo CSV so this section starts from the unmodified file contents
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\16 KNN\KNN\Zoo.csv"
zoo_df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Remove the "animal name" column before building the feature matrix
zoo_df = zoo_df.drop("animal name", axis=1)

In [None]:
# Target is the 'type' column; every other column is a feature
y = zoo_df["type"]
X = zoo_df.drop("type", axis=1)

In [None]:
# Standardize features (important for KNN)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Split dataset — 80% train, 20% test with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Report the shape of each split
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

# Per-class proportions (rounded to 2 decimals) to confirm the stratified split
train_class_share = y_train.value_counts(normalize=True).round(2)
test_class_share = y_test.value_counts(normalize=True).round(2)
print("\nClass distribution in Training set:\n", train_class_share)
print("\nClass distribution in Testing set:\n", test_class_share)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Re-read the Zoo CSV so the overview below sees every original column
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\16 KNN\KNN\Zoo.csv"
zoo_df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Collect the first few rows, the structural summary, and basic statistics
import io

zoo_head = zoo_df.head()

# DataFrame.info() writes its report to a stream and returns None, so the
# report is captured in a buffer to give zoo_info a usable value (the report
# text), then echoed so the cell still prints it.
_info_buffer = io.StringIO()
zoo_df.info(buf=_info_buffer)
zoo_info = _info_buffer.getvalue()
print(zoo_info, end="")

zoo_description = zoo_df.describe()

In [None]:
# Class distribution plot
# Bar chart of how many rows fall into each distinct value of the 'type' column.
fig, ax = plt.subplots(figsize=(8, 5))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also assigned to `hue`. dodge=False keeps one full-width bar per
# category on seaborn versions that would otherwise dodge by hue level.
sns.countplot(data=zoo_df, x='type', hue='type', palette='viridis', dodge=False, ax=ax)
# The hue legend only repeats the x tick labels, so drop it if one was created.
type_legend = ax.get_legend()
if type_legend is not None:
    type_legend.remove()
ax.set_title('Distribution of Animal Types')
ax.set_xlabel('Animal Type')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Show both tables with the notebook's rich display; a bare tuple of
# DataFrames would be rendered as one plain-text repr instead.
# `display` is the built-in provided by the IPython/Jupyter kernel.
display(zoo_head, zoo_description)