In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
df = pd.read_csv('/content/Facility Management Unified Classification Database (FMUCD).csv')

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
print(df.columns)


Index(['UniversityID', 'Country', 'State/Province', 'BuildingID',
       'BuildingName', 'Size', 'Type', 'BuiltYear',
       'FCI (facility condition index)', 'CRV (current replacement value)',
       'DMC (deferred maintenance cost)', 'SystemCode', 'SystemDescription',
       'SubsystemCode', 'SubsystemDescription', 'DescriptiveCode',
       'ComponentDescription', 'WOID', 'WODescription', 'WOPriority',
       'WOStartDate', 'WOEndDate', 'WODuration', 'PPM/UPM', 'LaborCost',
       'MaterialCost', 'OtherCost', 'TotalCost', 'LaborHours', 'MinTemp.(°C)',
       'MaxTemp.(°C)', 'Atmospheric pressure(hPa)', 'Humidity(%)',
       'WindSpeed(m/s)', 'WindDegree', 'Precipitation(mm)', 'Snow(mm)',
       'Cloudness(%)'],
      dtype='object')


In [None]:
# 1. Data Preprocessing
# Handling Missing Values
numerical_cols = ['Size', 'BuiltYear', 'FCI (facility condition index)', 'CRV (current replacement value)', 'DMC (deferred maintenance cost)', 'WODuration', 'LaborCost', 'MaterialCost', 'OtherCost', 'TotalCost', 'LaborHours', 'MinTemp.(°C)', 'MaxTemp.(°C)', 'Atmospheric pressure(hPa)', 'Humidity(%)', 'WindSpeed(m/s)', 'WindDegree', 'Precipitation(mm)', 'Snow(mm)', 'Cloudness(%)']
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].mean())

categorical_cols = ['UniversityID', 'Country', 'BuildingID', 'State/Province', 'PPM/UPM' , 'BuildingName', 'Type', 'SystemCode', 'SystemDescription', 'SubsystemDescription', 'SubsystemCode', 'DescriptiveCode', 'ComponentDescription', 'WOID', 'WODescription', 'WOPriority']
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:

df['WOPriority'] = pd.to_numeric(df['WOPriority'], errors='coerce')
df['WOPriority_High'] = (df['WOPriority'] > 90).astype(int)

In [None]:
# 2. Feature Engineering
df['TaskFrequency'] = df.groupby('WOID')['WOID'].transform('count')
df['WOsPerBuilding'] = df.groupby('BuildingID')['WOID'].transform('count')
df['WOStartMonth'] = pd.to_datetime(df['WOStartDate'], dayfirst=True).dt.month
df['WOStartYear'] = pd.to_datetime(df['WOStartDate'], dayfirst=True).dt.year
df['WOEndMonth'] = pd.to_datetime(df['WOEndDate'], dayfirst=True).dt.month
df['WOEndYear'] = pd.to_datetime(df['WOEndDate'], dayfirst=True).dt.year

In [None]:
# 3. Drop Columns
columns_to_drop = ['UniversityID', 'BuildingID', 'BuildingName', 'SystemCode', 'SubsystemCode', 'DescriptiveCode', 'ComponentDescription', 'WOID', 'WODescription', 'Country', 'State/Province', 'SystemDescription', 'SubsystemDescription','WOStartDate','WOEndDate']  
df = df.drop(columns=columns_to_drop, axis=1)

In [None]:
# 4. Handling Class Imbalances with SMOTE
# print(df.columns)
# df['WOPriority_High'] = (df['WOPriority'] > 90).astype(int)
# print(df['WOPriority_High'])
X = df.drop(['WOPriority'], axis=1)  
y = df['WOPriority']  
print(X.columns)

Index(['Size', 'Type', 'BuiltYear', 'FCI (facility condition index)',
       'CRV (current replacement value)', 'DMC (deferred maintenance cost)',
       'WODuration', 'PPM/UPM', 'LaborCost', 'MaterialCost', 'OtherCost',
       'TotalCost', 'LaborHours', 'MinTemp.(°C)', 'MaxTemp.(°C)',
       'Atmospheric pressure(hPa)', 'Humidity(%)', 'WindSpeed(m/s)',
       'WindDegree', 'Precipitation(mm)', 'Snow(mm)', 'Cloudness(%)',
       'WOPriority_High', 'TaskFrequency', 'WOsPerBuilding', 'WOStartMonth',
       'WOStartYear', 'WOEndMonth', 'WOEndYear'],
      dtype='object')


In [None]:
# one-hot coding
categorical_features = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.isnull().sum())

Size                                0
BuiltYear                           0
FCI (facility condition index)      0
CRV (current replacement value)     0
DMC (deferred maintenance cost)     0
WODuration                          0
LaborCost                           0
MaterialCost                        0
OtherCost                           0
TotalCost                           0
LaborHours                          0
MinTemp.(°C)                        0
MaxTemp.(°C)                        0
Atmospheric pressure(hPa)           0
Humidity(%)                         0
WindSpeed(m/s)                      0
WindDegree                          0
Precipitation(mm)                   0
Snow(mm)                            0
Cloudness(%)                        0
WOPriority_High                     0
TaskFrequency                       0
WOsPerBuilding                      0
WOStartMonth                        0
WOStartYear                         0
WOEndMonth                         97
WOEndYear   

In [None]:
date_cols = ['WOStartMonth', 'WOStartYear', 'WOEndMonth', 'WOEndYear']

imputer_date = SimpleImputer(strategy='most_frequent')

X_train[date_cols] = imputer_date.fit_transform(X_train[date_cols])
X_test[date_cols] = imputer_date.transform(X_test[date_cols])

In [None]:
print(X_train.isnull().sum())

Size                               0
BuiltYear                          0
FCI (facility condition index)     0
CRV (current replacement value)    0
DMC (deferred maintenance cost)    0
WODuration                         0
LaborCost                          0
MaterialCost                       0
OtherCost                          0
TotalCost                          0
LaborHours                         0
MinTemp.(°C)                       0
MaxTemp.(°C)                       0
Atmospheric pressure(hPa)          0
Humidity(%)                        0
WindSpeed(m/s)                     0
WindDegree                         0
Precipitation(mm)                  0
Snow(mm)                           0
Cloudness(%)                       0
WOPriority_High                    0
TaskFrequency                      0
WOsPerBuilding                     0
WOStartMonth                       0
WOStartYear                        0
WOEndMonth                         0
WOEndYear                          0
T

In [None]:
smote = SMOTE(random_state=42, k_neighbors=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
print(X_train_resampled)

                 Size    BuiltYear  FCI (facility condition index)  \
0        70640.000000  1999.000000                        0.378876   
1       107080.000000  1971.000000                        0.561959   
2        32011.000000  1957.000000                        0.396217   
3        94270.000000  1967.000000                        0.691328   
4        33771.000000  1923.000000                        0.387749   
...               ...          ...                             ...   
185227  148327.623730  1960.704300                        0.379552   
185228  116451.431776  1959.835131                        0.363232   
185229   28172.567160  1957.428026                        0.318033   
185230  106364.244860  1959.560083                        0.358067   
185231  200920.503662  1962.138353                        0.406479   

        CRV (current replacement value)  DMC (deferred maintenance cost)  \
0                          1.387513e+07                     5.256952e+06   
1      

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

rf_classifier = RandomForestClassifier(random_state=42)  # You can adjust hyperparameters here

rf_classifier.fit(X_train_resampled, y_train_resampled)

y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_test, y_pred))

Accuracy: 0.6618097220521612
              precision    recall  f1-score   support

           0       0.15      0.03      0.05        65
           1       0.47      0.50      0.48      1580
           2       0.52      0.61      0.56      1964
           3       0.90      0.96      0.93      2913
           4       0.96      0.80      0.87        84
           5       0.73      0.54      0.62       520
           6       0.45      0.28      0.34        18
           7       0.39      0.24      0.30       732
          14       0.50      0.25      0.33       141
          21       0.40      0.19      0.26        31
          30       0.43      0.44      0.43       110
          92       0.00      0.00      0.00         2
          96       1.00      1.00      1.00         6
          97       1.00      1.00      1.00         1

    accuracy                           0.66      8167
   macro avg       0.57      0.49      0.51      8167
weighted avg       0.65      0.66      0.65      81

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
