# Decision Tree Model
Aim is to build a model for each specific station

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

## Individually Training Each Station

In [43]:
# Directory containing the cleaned station datasets
dataset_dir = "cleaned_dataset"

In [44]:
# Function to extract map features
def extract_map_features(map_string):
    try:
        # Replace problematic formatting
        cleaned_string = map_string.replace("...", "").replace("\n", " ").replace("[", "").replace("]", "")
        map_array = np.fromstring(cleaned_string, sep=" ")

        # Validate map size
        if map_array.size != 36:  # Adjust grid size if necessary
            raise ValueError(f"Unexpected map size: {map_array.size}")

        # Reshape into a 6 × 6 grid
        map_array = map_array.reshape(6, 6)

        # Calculate features
        return {
            "mean_sla": np.mean(map_array),
            "max_sla": np.max(map_array),
            "min_sla": np.min(map_array),
            "std_sla": np.std(map_array)
        }

    except Exception as e:
        print(f"Error processing map: {e}")
        return {"mean_sla": 0, "max_sla": 0, "min_sla": 0, "std_sla": 0}

In [69]:
# List to store results for all stations
results = []

# Iterate over all station files in the directory
for file_name in os.listdir(dataset_dir):
    if file_name.endswith(".csv"):
        station_name = file_name.replace(".csv", "").replace(" ", "_")
        print(f"Processing station: {station_name}")

        # Load the dataset
        file_path = os.path.join(dataset_dir, file_name)
        df = pd.read_csv(file_path)

        # Normalize column names
        df.columns = df.columns.str.strip().str.replace(" ", "_")

        # Extract map features
        map_features = df["Map"].apply(extract_map_features)
        map_features_df = pd.DataFrame(map_features.tolist())

        # Add extracted features to the main DataFrame
        df_features = pd.concat([df, map_features_df], axis=1)

        # Drop the original 'Map' column
        df_features = df_features.drop(columns=["Map"])

        # Check if station_name exists
        if station_name not in df_features.columns:
            print(f"Error: {station_name} not found in DataFrame columns.")
            print("Available columns:", df_features.columns)
            continue

        # Define features and target
        X = df_features[["mean_sla", "max_sla", "min_sla", "std_sla"]]
        y = df_features[station_name]

        # Split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Initialize and train the decision tree
        decision_tree = DecisionTreeClassifier(max_depth=5, random_state=42)
        decision_tree.fit(X_train, y_train)

        # Predict on test set
        y_pred = decision_tree.predict(X_test)

        # Evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        print(f"Stats for {station_name}:")
        print(f"Accuracy: {accuracy:.4f}")
        print("-" * 50)

        # Append the results
        results.append({
            "Station": station_name,
            "Accuracy": accuracy,
        })

# Create a DataFrame with the results
results_df = pd.DataFrame(results)

# Print the final results DataFrame
print("\nSummary of Results:")
results_df.head(12)

Processing station: Atlantic_City
Stats for Atlantic_City:
Accuracy: 0.9597
--------------------------------------------------
Processing station: The_Battery


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Stats for The_Battery:
Accuracy: 0.9935
--------------------------------------------------
Processing station: New_London
Stats for New_London:
Accuracy: 0.9924
--------------------------------------------------
Processing station: Sewells_Point
Stats for Sewells_Point:
Accuracy: 0.9787
--------------------------------------------------
Processing station: Newport
Stats for Newport:
Accuracy: 0.9993
--------------------------------------------------
Processing station: Fort_Pulaski


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Stats for Fort_Pulaski:
Accuracy: 0.9876
--------------------------------------------------
Processing station: Lewes
Stats for Lewes:
Accuracy: 0.9546
--------------------------------------------------
Processing station: Baltimore
Stats for Baltimore:
Accuracy: 0.9774
--------------------------------------------------
Processing station: Sandy_Hook
Stats for Sandy_Hook:
Accuracy: 0.9549
--------------------------------------------------
Processing station: Portland
Stats for Portland:
Accuracy: 0.9848
--------------------------------------------------
Processing station: Washington
Stats for Washington:
Accuracy: 0.9335
--------------------------------------------------
Processing station: Eastport
Stats for Eastport:
Accuracy: 0.9299
--------------------------------------------------

Summary of Results:


Unnamed: 0,Station,Accuracy
0,Atlantic_City,0.95966
1,The_Battery,0.993474
2,New_London,0.992403
3,Sewells_Point,0.978738
4,Newport,0.999311
5,Fort_Pulaski,0.987612
6,Lewes,0.954639
7,Baltimore,0.977366
8,Sandy_Hook,0.954892
9,Portland,0.984838


## Submission Format (model.py)

In [70]:
class Model:
    def __init__(self):
        print("Decision Tree Model Initialized")
        # Load the trained model
        self.model = DecisionTreeClassifier(max_depth=5, random_state=42)
        self.model_weights_path = "model_weights.pkl"
        self._load_model()

    def _load_model(self):
        """Load the saved model weights."""
        if os.path.exists(self.model_weights_path):
            self.model = pd.read_pickle(self.model_weights_path)
            print("Model weights loaded successfully.")
        else:
            print("No pre-trained weights found. Ensure the weights file exists.")

    def predict(self, X):
        """
        Predict anomalies for given input.
        X: Input DataFrame with features.
        Returns: Binary predictions as a DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")
        predictions = self.model.predict(X)
        return pd.DataFrame(predictions, columns=["Prediction"])