In [7]:
# Install required libraries if needed
# !pip install pandas scikit-learn numpy matplotlib

import json, re, os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score

import pickle

# Filesystem layout for the notebook: the dataset is read relative to DATA_DIR,
# and every saved output (label list, pickled models) goes under ARTIFACTS_DIR.
DATA_DIR = Path('.')
DETAILED_DATA = DATA_DIR / 'synthetic_fire_dataset_detailed.jsonl'

ARTIFACTS_DIR = Path('artifacts')
# Create the output folder up front; exist_ok makes re-running this cell harmless.
ARTIFACTS_DIR.mkdir(exist_ok=True)
print('Artifacts will be saved to:', ARTIFACTS_DIR.resolve())


Artifacts will be saved to: /content/artifacts


In [5]:
from pathlib import Path
import json, re, os
import pandas as pd

# Same dataset path as the one assigned in the first cell; presumably repeated
# here so this cell can be run without it -- TODO confirm and keep a single definition.
DETAILED_DATA = Path('./synthetic_fire_dataset_detailed.jsonl')

def load_jsonl(path: Path):
    """Read a JSON Lines file into a DataFrame, one row per record.

    Args:
        path: Location of the ``.jsonl`` file. Every non-blank line must hold
            exactly one JSON document.

    Returns:
        pandas.DataFrame built from the parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    rows = []
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines (typically trailing newlines) carry no record.
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, but say where the bad record is.
                raise json.JSONDecodeError(
                    f'{exc.msg} (file {path}, line {lineno})', exc.doc, exc.pos
                ) from exc
    return pd.DataFrame(rows)

# Load the raw records and list the top-level fields they carry.
df = load_jsonl(DETAILED_DATA)
print(list(df.columns))

['instruction', 'output']


In [7]:
import numpy as np # Import numpy here

def parse_elevation_range(s: str):
    """Parse an elevation range string such as ``"1200-3400"`` into numbers.

    The two bounds are read from the start of the string; anything after the
    second number (for example a unit suffix) is ignored. Besides plain
    integers, the parser accepts thousands separators (``"1,200"``), a leading
    minus sign on either bound, and an en or em dash as the separator.

    Args:
        s: Text describing the range. Non-string input counts as missing.

    Returns:
        Tuple ``(low, high, high - low)`` of ints, or ``(nan, nan, nan)`` when
        the input is not a string or does not start with a recognisable range.
    """
    if not isinstance(s, str):
        return np.nan, np.nan, np.nan
    # A bound is an optionally signed integer, either plain digits or
    # comma-grouped thousands; the separator is '-', en dash or em dash.
    bound = r'(-?(?:\d{1,3}(?:,\d{3})+|\d+))'
    m = re.match(r'\s*' + bound + r'\s*[-\u2013\u2014]\s*' + bound, s)
    if not m:
        return np.nan, np.nan, np.nan
    low = int(m.group(1).replace(',', ''))
    high = int(m.group(2).replace(',', ''))
    return low, high, high - low

def parse_density(s: str):
    """Extract the numeric value from a string such as ``"350 kg/acre"``.

    The number is read from the start of the string and must be followed by
    the literal unit ``kg/acre`` (optional whitespace in between). Integers,
    decimals (``"12.5"``) and comma-grouped thousands (``"1,250"``) are accepted.

    Args:
        s: Text holding the density. Non-string input counts as missing.

    Returns:
        The value as a float, or ``nan`` when the input is not a string or
        does not match the expected ``<number> kg/acre`` form.
    """
    if not isinstance(s, str):
        return np.nan
    m = re.match(r'\s*((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?)\s*kg/acre', s)
    if not m:
        return np.nan
    # Strip thousands separators before converting.
    return float(m.group(1).replace(',', ''))

def most_frequent_slope(slopes):
    """Return the most common slope label in ``slopes`` after normalisation.

    Labels containing ``'Flat'``, ``'Moderate'`` or ``'Steep'`` (checked in
    that order) collapse to that keyword; any other string is kept verbatim.
    Falsy entries and non-string entries are ignored. Ties go to the label
    that sorts first, because ``np.unique`` returns labels in sorted order and
    ``argmax`` picks the first maximum.

    Args:
        slopes: List of slope label strings.

    Returns:
        The winning label as a plain ``str``, or ``None`` when ``slopes`` is
        not a list, is empty, or holds no usable string label.
    """
    if not isinstance(slopes, list) or len(slopes) == 0:
        return None

    def norm(lbl):
        if not isinstance(lbl, str):
            return None
        for bucket in ('Flat', 'Moderate', 'Steep'):
            if bucket in lbl:
                return bucket
        return lbl

    # Filter AFTER normalising: a truthy non-string (e.g. a number) normalises
    # to None, and np.unique cannot sort a mix of None and str.
    normed = [lbl for lbl in (norm(s) for s in slopes if s) if lbl]
    if not normed:
        return None
    vals, counts = np.unique(normed, return_counts=True)
    # Convert numpy's str scalar to a built-in str for downstream consumers.
    return str(vals[counts.argmax()])

# Flatten each raw record into one row: the analysis text, the terrain label,
# and the numeric / categorical features parsed out of the topography report.
record_columns = [
    'text',
    'terrain_type',
    'average_slope_degree',
    'elev_low_ft',
    'elev_high_ft',
    'elev_diff_ft',
    'fuel_density_kg_per_acre',
    'slope_mode',
]

records = []
# to_dict('records') yields plain dicts, so .get() still tolerates a missing
# column, and it avoids building a Series per row as iterrows() does.
for row in df.to_dict('records'):
    instr = row.get('instruction', {})
    outp = row.get('output', {})
    if not isinstance(instr, dict) or not isinstance(outp, dict):
        continue
    topo_report = instr.get('topography_report', {})
    if not isinstance(topo_report, dict):
        # e.g. a JSON null: there is no label to read, so skip instead of
        # failing on .get() below.
        continue
    label = topo_report.get('terrain_type', None)
    text = outp.get('disaster_analysis', None)
    if not (label and text):
        # Both the target and the text input are required.
        continue
    elev_low, elev_high, elev_diff = parse_elevation_range(topo_report.get('elevation_range_ft', None))
    records.append({
        'text': text,
        'terrain_type': label,
        'average_slope_degree': topo_report.get('average_slope_degree', np.nan),
        'elev_low_ft': elev_low,
        'elev_high_ft': elev_high,
        'elev_diff_ft': elev_diff,
        'fuel_density_kg_per_acre': parse_density(topo_report.get('avg_fuel_density', None)),
        'slope_mode': most_frequent_slope(topo_report.get('mpr_slopes', [])),
    })

# Passing the column list keeps the schema stable even if no record qualified.
data = pd.DataFrame(records, columns=record_columns)

In [11]:
# Write the distinct terrain labels, sorted, one per line, so the class list
# is available next to the saved models.
label_file = ARTIFACTS_DIR / 'topography_labels_detailed.txt'
labels = sorted(set(data['terrain_type'].dropna()))
with open(label_file, 'w') as f:
    f.writelines(lab + '\n' for lab in labels)
print('Saved labels to:', label_file.resolve())


Saved labels to: /content/artifacts/topography_labels_detailed.txt


In [12]:
# Baseline: predict terrain_type from the free-text analysis alone
# (TF-IDF uni/bi-grams fed into a class-balanced logistic regression).
X_text = data[['text']].copy()
y = data['terrain_type']

# Stratify so every terrain type appears in both the train and the test part.
X_train_txt, X_test_txt, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

baseline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=2)),
    ('clf', LogisticRegression(max_iter=1500, class_weight='balanced'))
])

baseline.fit(X_train_txt['text'], y_train)
y_pred_base = baseline.predict(X_test_txt['text'])

print('Baseline Accuracy:', accuracy_score(y_test, y_pred_base))
print('\nClassification Report (Baseline):\n')
# zero_division=0 reports precision as 0 for classes the model never predicts,
# without emitting an undefined-metric warning for each of them.
print(classification_report(y_test, y_pred_base, digits=3, zero_division=0))

# Save baseline model
baseline_model_path = ARTIFACTS_DIR / 'baseline_text_model.pkl'
with open(baseline_model_path, 'wb') as f:
    pickle.dump(baseline, f)
print('Saved baseline model to:', baseline_model_path.resolve())


Baseline Accuracy: 0.25

Classification Report (Baseline):

              precision    recall  f1-score   support

      Canyon      0.250     1.000     0.400         5
    Flatland      0.000     0.000     0.000         6
       Hills      0.000     0.000     0.000         5
       Ridge      0.000     0.000     0.000         4

    accuracy                          0.250        20
   macro avg      0.062     0.250     0.100        20
weighted avg      0.062     0.250     0.100        20

Saved baseline model to: /content/artifacts/baseline_text_model.pkl


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score

import pickle
from pathlib import Path

# Imported here, next to this cell's other imports, because only this cell needs it.
from sklearn.impute import SimpleImputer

ARTIFACTS_DIR = Path('artifacts')
ARTIFACTS_DIR.mkdir(exist_ok=True)

# Hybrid: the analysis text plus the structured topography features.
text_col = 'text'
num_cols = ['average_slope_degree', 'elev_low_ft', 'elev_high_ft', 'elev_diff_ft', 'fuel_density_kg_per_acre']
cat_cols = ['slope_mode']

X = data[[text_col] + num_cols + cat_cols].copy()
y = data['terrain_type']

# The feature parsers yield NaN for values they cannot read. StandardScaler
# passes NaN through and LogisticRegression rejects it, so fill gaps with the
# training median first. With complete data the imputer changes nothing.
numeric_prep = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler(with_mean=False)),  # with_mean=False for sparse safety
])

preproc = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(max_features=7000, ngram_range=(1,2), min_df=2), text_col),
    ('num', numeric_prep, num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

hybrid = Pipeline(steps=[
    ('prep', preproc),
    ('clf', LogisticRegression(max_iter=2000, class_weight='balanced'))
])

# NOTE(review): this split uses random_state=7 while the baseline cell uses 42,
# so the two accuracy figures are measured on different test rows. It also
# rebinds y_train / y_test. Confirm whether a shared split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7, stratify=y
)

hybrid.fit(X_train, y_train)
y_pred_h = hybrid.predict(X_test)

print('Hybrid Accuracy:', accuracy_score(y_test, y_pred_h))
print('\nClassification Report (Hybrid):\n')
# zero_division=0 reports 0 for a class that is never predicted, without a warning.
print(classification_report(y_test, y_pred_h, digits=3, zero_division=0))

# Save hybrid model
hybrid_model_path = ARTIFACTS_DIR / 'hybrid_model.pkl'
with open(hybrid_model_path, 'wb') as f:
    pickle.dump(hybrid, f)
print('Saved hybrid model to:', hybrid_model_path.resolve())

Hybrid Accuracy: 0.4

Classification Report (Hybrid):

              precision    recall  f1-score   support

      Canyon      0.429     0.600     0.500         5
    Flatland      0.400     0.400     0.400         5
       Hills      0.333     0.333     0.333         6
       Ridge      0.500     0.250     0.333         4

    accuracy                          0.400        20
   macro avg      0.415     0.396     0.392        20
weighted avg      0.407     0.400     0.392        20

Saved hybrid model to: /content/artifacts/hybrid_model.pkl
