In [1]:
import pandas as pd
import numpy as np
import pymongo
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
firecase = pd.read_csv("fire_cases_in_uk_last_3_years.csv")
firecase.head()

Unnamed: 0,IncidentNumber,DateOfCall,CalYear,TimeOfCall,HourOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,...,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpHoursRoundUp,Notional Cost (£),NumCalls
0,000006-01012019,01 Jan 2019,2019,00:01:45,0,Special Service,Special Service,Lift Release,Dwelling,Purpose Built Flats/Maisonettes - 4 to 9 storeys,...,,,,,1.0,1.0,1.0,1.0,333.0,2.0
1,000019-01012019,01 Jan 2019,2019,00:04:33,0,Fire,Secondary Fire,,Outdoor,Tree scrub,...,357.0,Edmonton,,,1.0,1.0,1.0,1.0,333.0,1.0
2,000020-01012019,01 Jan 2019,2019,00:04:39,0,False Alarm,False alarm - Good intent,,Outdoor,Domestic garden (vegetation not equipment),...,318.0,Southgate,,,1.0,1.0,1.0,1.0,333.0,1.0
3,000021-01012019,01 Jan 2019,2019,00:04:44,0,False Alarm,AFA,,Dwelling,Stately Home (part not open to public),...,210.0,Kensington,,,1.0,1.0,1.0,1.0,333.0,1.0
4,000024-01012019,01 Jan 2019,2019,00:05:00,0,Special Service,Special Service,Lift Release,Dwelling,Purpose Built Flats/Maisonettes - 4 to 9 storeys,...,329.0,Bethnal Green,,,1.0,1.0,1.0,1.0,333.0,1.0


In [5]:
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["fire_db"]
collection = db["fire_data"]

In [8]:
data_dict = firecase.to_dict("records")
collection.insert_many(data_dict)
print(f"Inserted {len(data_dict)} records into MongoDB.")

Inserted 331570 records into MongoDB.


In [4]:
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/", serverSelectionTimeoutMS=3000)

try:
    info = client.server_info()
    print("✔️ Terkoneksi ke MongoDB!")
    print("Versi MongoDB:", info["version"])
except pymongo.errors.ServerSelectionTimeoutError as err:
    print("❌ Tidak bisa konek:", err)


✔️ Terkoneksi ke MongoDB!
Versi MongoDB: 8.0.4


In [6]:
# Ambil kembali data dari MongoDB
data = pd.DataFrame(list(collection.find()))

# Drop kolom _id jika ada
if '_id' in data.columns:
    data.drop('_id', axis=1, inplace=True)

data.head()

Unnamed: 0,IncidentNumber,DateOfCall,CalYear,TimeOfCall,HourOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,...,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpHoursRoundUp,Notional Cost (£),NumCalls
0,000006-01012019,01 Jan 2019,2019,00:01:45,0,Special Service,Special Service,Lift Release,Dwelling,Purpose Built Flats/Maisonettes - 4 to 9 storeys,...,,,,,1.0,1.0,1.0,1.0,333.0,2.0
1,000019-01012019,01 Jan 2019,2019,00:04:33,0,Fire,Secondary Fire,,Outdoor,Tree scrub,...,357.0,Edmonton,,,1.0,1.0,1.0,1.0,333.0,1.0
2,000020-01012019,01 Jan 2019,2019,00:04:39,0,False Alarm,False alarm - Good intent,,Outdoor,Domestic garden (vegetation not equipment),...,318.0,Southgate,,,1.0,1.0,1.0,1.0,333.0,1.0
3,000021-01012019,01 Jan 2019,2019,00:04:44,0,False Alarm,AFA,,Dwelling,Stately Home (part not open to public),...,210.0,Kensington,,,1.0,1.0,1.0,1.0,333.0,1.0
4,000024-01012019,01 Jan 2019,2019,00:05:00,0,Special Service,Special Service,Lift Release,Dwelling,Purpose Built Flats/Maisonettes - 4 to 9 storeys,...,329.0,Bethnal Green,,,1.0,1.0,1.0,1.0,333.0,1.0


In [9]:
# Shape
print("Jumlah baris dan kolom:", data.shape)

# Tipe data per kolom
print("\nTipe data:")
print(data.dtypes)

# Statistik deskriptif
print("\nStatistik deskriptif:")
print(data.describe(include='all'))

# Cek missing values
print("\nMissing values:")
print(data.isnull().sum())

# Cek duplikasi
print("\nJumlah duplikat:", data.duplicated().sum())


Jumlah baris dan kolom: (331570, 39)

Tipe data:
IncidentNumber                             object
DateOfCall                                 object
CalYear                                     int64
TimeOfCall                                 object
HourOfCall                                  int64
IncidentGroup                              object
StopCodeDescription                        object
SpecialServiceType                         object
PropertyCategory                           object
PropertyType                               object
AddressQualifier                           object
Postcode_full                              object
Postcode_district                          object
UPRN                                        int64
USRN                                        int64
IncGeo_BoroughCode                         object
IncGeo_BoroughName                         object
ProperCase                                 object
IncGeo_WardCode                            object
I

In [8]:
data.drop(["IncidentNumber", "DateOfCall", "TimeOfCall", "UPRN", "USRN", "Postcode_full", "ProperCase", "IncGeo_WardNameNew", "CalYear"], axis=1, inplace=True)

In [15]:

# Cek kolom numerik
numerical_cols = firecase.select_dtypes(include=['float64', 'int64']).columns

# Isi missing value dengan median untuk setiap kolom numerik
for col in numerical_cols:
    if firecase[col].isnull().sum() > 0:
        median_val = firecase[col].median()
        firecase[col].fillna(median_val, inplace=True)
        print(f"{col} diisi dengan median: {median_val}")


Easting_m diisi dengan median: 530853.0
Northing_m diisi dengan median: 180978.0
Latitude diisi dengan median: 51.51277601445
Longitude diisi dengan median: -0.11606502855
FirstPumpArriving_AttendanceTime diisi dengan median: 291.0
SecondPumpArriving_AttendanceTime diisi dengan median: 363.0
NumStationsWithPumpsAttending diisi dengan median: 1.0
NumPumpsAttending diisi dengan median: 1.0
PumpCount diisi dengan median: 1.0
PumpHoursRoundUp diisi dengan median: 1.0
Notional Cost (£) diisi dengan median: 346.0
NumCalls diisi dengan median: 1.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  firecase[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  firecase[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [16]:
data.drop_duplicates(inplace=True)

In [10]:
# Hitung IQR untuk kolom 'Notional Cost (£)'
Q1 = data['Notional Cost (£)'].quantile(0.25)
Q3 = data['Notional Cost (£)'].quantile(0.75)
IQR = Q3 - Q1

# Deteksi outlier
outliers = data[(data['Notional Cost (£)'] < (Q1 - 1.5 * IQR)) | 
                (data['Notional Cost (£)'] > (Q3 + 1.5 * IQR))]
print(f"Jumlah outlier Notional Cost (£): {len(outliers)}")

# Hapus outlier (opsional)
data = data[~((data['Notional Cost (£)'] < (Q1 - 1.5 * IQR)) | 
              (data['Notional Cost (£)'] > (Q3 + 1.5 * IQR)))]


Jumlah outlier Notional Cost (£): 45899


In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Daftar kolom yang akan di-encode
categorical_cols = [
    'IncidentGroup',
    'StopCodeDescription',
    'SpecialServiceType',
    'PropertyCategory',
    'PropertyType',
    'AddressQualifier',
    'Postcode_district',
    'IncGeo_BoroughCode',
    'IncGeo_BoroughName',
    'IncGeo_WardCode',
    'IncGeo_WardName',
    'IncGeo_WardNameNew',
    'FRS',
    'IncidentStationGround',
    'FirstPumpArriving_DeployedFromStation',
    'SecondPumpArriving_DeployedFromStation'
]

# Tangani missing value dulu dengan fillna (sementara aja supaya bisa di-encode)
for col in categorical_cols:
    data[col] = data[col].fillna('Missing')
    data[col] = le.fit_transform(data[col])


In [12]:
from sklearn.preprocessing import MinMaxScaler

# Daftar kolom numerik yang akan dinormalisasi
num_cols = [
    'FirstPumpArriving_AttendanceTime',
    'SecondPumpArriving_AttendanceTime',
    'NumStationsWithPumpsAttending',
    'NumPumpsAttending',
    'PumpCount',
    'PumpHoursRoundUp',
    'Notional Cost (£)',
    'NumCalls'
]

# Tangani NaN dulu, misalnya isi dengan median
data[num_cols] = data[num_cols].fillna(data[num_cols].median())

# Normalisasi
scaler = MinMaxScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])


In [13]:
data.to_csv("firecase_prepared.csv", index=False)