In [None]:
# Install all necessary packages for the project
%pip install pandas numpy scikit-learn matplotlib seaborn

In [2]:
import os
import urllib.request

# Create a 'data' directory if it doesn't already exist
data_dir = 'data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    print(f"Created directory: {data_dir}")

# URL of the dataset and the local path to save it
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00547/Algerian_forest_fires_dataset_UPDATE.csv'
file_path = os.path.join(data_dir, 'Algerian_forest_fires.csv')

# Download the file from the URL
urllib.request.urlretrieve(url, file_path)
print(f"Dataset downloaded and saved to {file_path}")

Dataset downloaded and saved to data\Algerian_forest_fires.csv


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Load the dataset from the file we downloaded
df = pd.read_csv('data/Algerian_forest_fires.csv', header=1)

print("Dataset loaded successfully. First 5 rows:")
df.head()

Dataset loaded successfully. First 5 rows:


Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


In [4]:
# Create a 'Region' column to distinguish the two datasets inside the file
df.loc[:122, 'Region'] = 0 # Bejaia region
df.loc[124:, 'Region'] = 1 # Sidi-Bel Abbes region

# Clean up the dataframe by dropping null/unwanted rows
df.dropna(how='all', inplace=True)
if 123 in df.index:
    df.drop(123, inplace=True)

# Strip whitespace from column names and the 'Classes' column
df.columns = df.columns.str.strip()
df['Classes'] = df['Classes'].str.strip()

# Convert other columns to numeric
for col in ['DC', 'FWI', 'Temperature', 'RH', 'Ws']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Map the target variable 'Classes' to 0 and 1
df['Classes'] = df['Classes'].map({'not fire': 0, 'fire': 1})

# --- FIX IS HERE ---
# Drop rows where 'Classes' is missing (NaN), as they can't be used for training
df.dropna(subset=['Classes'], inplace=True)
# Fill any remaining missing values in other columns with the median
df.fillna(df.median(numeric_only=True), inplace=True)

# Ensure the 'Classes' column is integer type
df['Classes'] = df['Classes'].astype(int)

print("Data cleaning complete.")
df.info()

Data cleaning complete.
<class 'pandas.core.frame.DataFrame'>
Index: 243 entries, 0 to 245
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   day          243 non-null    object 
 1   month        243 non-null    object 
 2   year         243 non-null    object 
 3   Temperature  243 non-null    float64
 4   RH           243 non-null    float64
 5   Ws           243 non-null    float64
 6   Rain         243 non-null    object 
 7   FFMC         243 non-null    object 
 8   DMC          243 non-null    object 
 9   DC           243 non-null    float64
 10  ISI          243 non-null    object 
 11  BUI          243 non-null    object 
 12  FWI          243 non-null    float64
 13  Classes      243 non-null    int64  
 14  Region       243 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 30.4+ KB


In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set plot style
sns.set(style="whitegrid")

# Visualize the relationship between temperature, humidity, and fire occurence
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Temperature', y='RH', hue='Classes', palette='coolwarm')
plt.title('Temperature vs. Relative Humidity by Fire Class')
plt.xlabel('Temperature (Celsius)')
plt.ylabel('Relative Humidity (%)')
plt.legend(title='Fire Occurence', labels=['Not Fire', 'Fire'])
plt.show()

# Correlation heatmap
plt.figure(figsize=(12, 8))
# FIX: Automatically select only numeric columns for the correlation matrix
numeric_df = df.select_dtypes(include=np.number)
sns.heatmap(numeric_df.corr(), annot=True, cmap='viridis', fmt=".2f")
plt.title('Correlation Matrix of Features')
plt.show()

NameError: name 'df' is not defined

<Figure size 1000x600 with 0 Axes>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define our features (X) and target (y)
# We drop the original date components and the target variable 'Classes'
X = df.drop(columns=['Classes', 'day', 'month', 'year'])
y = df['Classes']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Scale the features. This is important for model performance.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set shape:", X_train_scaled.shape)
print("Testing set shape:", X_test_scaled.shape)