In [1]:
# 1. IMPORT NECESSARY LIBRARIES
# -----------------------------------------------------------------------------
# Reason: We need standard data science and ML libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib  # for saving the model efficiently
import warnings
# Silence every warning category so the cell output stays readable.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.simplefilter('ignore')

# Apply the seaborn grid theme, then set the default figure size.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# 2. LOAD THE DATA
# -----------------------------------------------------------------------------
# Reason: Load the training dataset. Last column is the target (class label)
data_path = 'fish_disease_train.csv'

# Stop early with a readable message when the CSV is not where we expect it.
if os.path.exists(data_path):
    df = pd.read_csv(data_path)
else:
    raise FileNotFoundError(f"Dataset not found at {data_path}. Please check the file path.")

# Quick sanity check: dimensions plus the first five rows.
print("Dataset loaded successfully!")
print(f"Shape of dataset: {df.shape}")
display(df.head(5))

In [None]:
# =============================================================================
# PART (a): Train a Random Forest Classifier on Fish Disease Dataset
# Output: Save the trained model as 'model_1.pkl'
# =============================================================================

# 1. IMPORT NECESSARY LIBRARIES
# -----------------------------------------------------------------------------
# Reason: We need standard data science and ML libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib  # for saving the model efficiently
import warnings
# Silence every warning category so the cell output stays readable.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.simplefilter('ignore')

# Apply the seaborn grid theme, then set the default figure size.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

# 2. LOAD THE DATA
# -----------------------------------------------------------------------------
# Reason: Load the training dataset. Last column is the target (class label)
data_path = 'fish_disease_train.csv'

# Stop early with a readable message when the CSV is not where we expect it.
if os.path.exists(data_path):
    df = pd.read_csv(data_path)
else:
    raise FileNotFoundError(f"Dataset not found at {data_path}. Please check the file path.")

# Quick sanity check: dimensions plus the first five rows.
print("Dataset loaded successfully!")
print(f"Shape of dataset: {df.shape}")
display(df.head(5))

# 3. EXPLORATORY DATA ANALYSIS (EDA)
# -----------------------------------------------------------------------------

# 3.1 Structure: column names, dtypes and non-null counts
banner = "=" * 50
print("\n" + banner)
print("DATASET INFORMATION")
print(banner)
df.info()

# 3.2 Missing values, counted per column
print("\nMissing values per column:")
print(df.isna().sum())

# 3.3 Descriptive statistics (describe() defaults)
print("\nStatistical Summary (Numerical Features):")
display(df.describe())

# 3.4 Split the column index into features and target
# Assumption carried by this notebook: the last column is the class label
# (expected to be 0 to 9 — TODO confirm against the dataset description).
target_column = df.columns[-1]
feature_columns = df.columns[:-1]

print(f"\nNumber of features: {len(feature_columns)}")
print(f"Target column: {target_column}")
unique_classes = sorted(df[target_column].unique())
print(f"Unique classes: {unique_classes}")

# 3.5 Class balance (important for classification: it drives the stratified
# split and the class weighting used later)
print("\nClass Distribution:")
class_counts = df[target_column].value_counts().sort_index()
print(class_counts)

# Bar chart of the same counts, with the classes in sorted label order
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x=target_column, order=class_counts.index, palette='viridis', ax=ax)
ax.set_title('Distribution of Fish Disease Classes')
ax.set_xlabel('Disease Class')
ax.set_ylabel('Count')
plt.show()

# 3.6 Outlier check via boxplots
# Reason: texture/color/statistical features can have extreme values.
# Only the first 10 feature columns are drawn to keep the figure legible.
sample_features = feature_columns[:10]
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=df[sample_features], ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title("Boxplot of Selected Features (Checking Outliers)")
plt.show()

# 3.7 Correlation matrix (to see multicollinearity among features)
plt.figure(figsize=(16, 12))
corr = df[feature_columns].corr()
sns.heatmap(corr, cmap='coolwarm', center=0, square=True, cbar_kws={"shrink": .8})
plt.title("Feature Correlation Heatmap")
plt.tight_layout()
plt.show()

# Optional: list strongly correlated feature pairs (|r| > 0.9).
# The correlation matrix is symmetric, so only the strict upper triangle (k=1)
# is kept. Each pair then appears exactly once, and the self-correlations on
# the diagonal are excluded by position rather than by comparing floats
# against 1.0 (which could leak a 0.9999... diagonal entry and would hide
# distinct features that are perfectly correlated).
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr_pairs = (
    corr.abs()
    .where(upper_triangle)   # everything outside the upper triangle becomes NaN
    .unstack()
    .dropna()                # also drops undefined correlations (e.g. constant columns)
    .sort_values(ascending=False)
)
high_corr = corr_pairs[corr_pairs > 0.9]
print("\nHighly correlated feature pairs (|r| > 0.9):")
print(high_corr)

# 4. WORKING COPY FOR MODELING
# -----------------------------------------------------------------------------
# Reason: the loaded frame stays untouched; any preprocessing happens on the copy.
df_model = df.copy()
print(f"\nModeling dataset created with shape: {df_model.shape}")

# 5. SEPARATE FEATURES AND TARGET
# -----------------------------------------------------------------------------
X = df_model[feature_columns]   # feature matrix
y = df_model[target_column]     # class labels

for label, part in (("Features", X), ("Target", y)):
    print(f"{label} shape: {part.shape}")

# 6. TRAIN-TEST SPLIT (Good practice even if not asked — keeps a held-out set out of training for an honest evaluation)
# -----------------------------------------------------------------------------
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}")

# 7. TRAIN RANDOM FOREST CLASSIFIER
# -----------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier

# Explicitly chosen hyperparameters. Everything not listed stays at the
# scikit-learn default (max_depth=None so trees grow fully,
# min_samples_split=2, min_samples_leaf=1).
rf_params = {
    "n_estimators": 300,          # balance between performance and training time
    "random_state": 42,           # reproducibility
    "n_jobs": -1,                 # use all CPU cores
    "class_weight": 'balanced',   # helps if the classes are imbalanced
    "max_features": 'sqrt',       # standard choice for classification
    "bootstrap": True,
}
rf_model = RandomForestClassifier(**rf_params)

print("\nTraining Random Forest Classifier...")
rf_model.fit(X_train, y_train)

# 8. QUICK EVALUATION ON VALIDATION SET (Optional but recommended)
# -----------------------------------------------------------------------------
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = rf_model.predict(X_val)

print("\n" + "="*60)
print("VALIDATION PERFORMANCE (Random Forest - model_1)")
print("="*60)
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# Optional: Plot confusion matrix
# The axes are labeled with the classes the model was actually fitted on
# instead of a hard-coded range(10). Passing the same labels to
# confusion_matrix pins the row/column order and matrix size, so the tick
# labels always line up even if the label set is not exactly 0..9 or a class
# is absent from the validation predictions.
class_labels = rf_model.classes_
cm = confusion_matrix(y_val, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix - Validation Set')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# 9. SAVE THE TRAINED MODEL AS model_1.pkl
# -----------------------------------------------------------------------------
# NOTE(review): rf_model was fitted on X_train only (the 80% split). If the
# deliverable should use all labeled data, refit on X, y before saving.
model_filename = 'model_1.pkl'
joblib.dump(rf_model, model_filename)

print(f"\nRandom Forest model trained and saved as '{model_filename}'")
print(f"Model uses {rf_model.n_features_in_} features and {rf_model.n_classes_} classes.")

# Verify the round trip: reload the file and confirm that the reloaded model
# reproduces the in-memory model's predictions on the validation features.
# joblib.load unpickles arbitrary objects, so only load files you trust.
loaded_model = joblib.load(model_filename)
if not np.array_equal(loaded_model.predict(X_val), rf_model.predict(X_val)):
    raise RuntimeError(
        f"Reloaded model from '{model_filename}' does not reproduce the trained model's predictions."
    )
print("Model saved and loading verified successfully!")