In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve

%matplotlib inline

# warning control
import warnings
warnings.filterwarnings("ignore")



In [34]:
outages_df= pd.read_csv('kplc_planned_outages.csv')
outages_df.head()

Unnamed: 0,area,date,start_time,end_time,affected_customers
0,Part Of Lunga Lunga Road And Likoni Road,2026-01-18,08:00,17:00,"House of Manji, Wheatabix, Part of Likoni Rd, ..."
1,Whole Of Garage Road,2026-01-20,09:00,17:00,Whole of Garage Rd & adjacent customers.
2,Whole Of Mpweke Lane,2026-01-22,09:00,17:00,"Total South B, Part of Kapiti Rd, Whole of Mpw..."
3,Ongata Rongai,2026-01-21,09:00,17:00,"Whole of Gataka Rd from Masai Mall, Kisumu Ndo..."
4,Parts Of Kuresoi North,2026-01-21,09:00,17:00,"Murundu Mkt, Kibaraa Mkt, Kipkoris Mkt, Kipkew..."


In [35]:
outages_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   area                13 non-null     object
 1   date                13 non-null     object
 2   start_time          13 non-null     object
 3   end_time            13 non-null     object
 4   affected_customers  13 non-null     object
dtypes: object(5)
memory usage: 652.0+ bytes


In [36]:
weather_df=pd.read_csv('nairobi_weather_2007_2008.csv')
weather_df.head()

Unnamed: 0,date,tmax,tmin,prcp,wspd_max
0,2007-01-01,24.7,13.1,1.0,17.6
1,2007-01-02,23.2,13.7,0.1,17.4
2,2007-01-03,22.6,14.8,0.9,20.0
3,2007-01-04,21.5,14.9,2.3,21.4
4,2007-01-05,23.3,12.9,0.0,18.5


In [37]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      731 non-null    object 
 1   tmax      731 non-null    float64
 2   tmin      731 non-null    float64
 3   prcp      731 non-null    float64
 4   wspd_max  731 non-null    float64
dtypes: float64(4), object(1)
memory usage: 28.7+ KB


In [38]:
outage_df=pd.read_csv('kplc_daily_schedule.csv')
outage_df.head()

Unnamed: 0,date,scheduled_outage_today,n_scheduled_events,total_scheduled_minutes
0,2026-01-18,1,3,1380.0
1,2026-01-20,1,4,1950.0
2,2026-01-21,1,2,960.0
3,2026-01-22,1,3,1380.0
4,2026-01-23,1,1,450.0


In [39]:
outage_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   date                     5 non-null      object 
 1   scheduled_outage_today   5 non-null      int64  
 2   n_scheduled_events       5 non-null      int64  
 3   total_scheduled_minutes  5 non-null      float64
dtypes: float64(1), int64(2), object(1)
memory usage: 292.0+ bytes


In [40]:
power_df=pd.read_csv('power_multi_household_daily.csv')
power_df.head()

Unnamed: 0,meter_id,date,daily_mean_power,daily_std_power,daily_min_power,daily_max_power,voltage_mean,voltage_std,intensity_mean
0,MTR_001,2006-12-16,2.617495,0.882098,0.222336,6.66046,236.243763,2.922896,13.082828
1,MTR_001,2006-12-17,2.018833,1.032528,0.097898,6.024453,240.087028,4.051467,9.999028
2,MTR_001,2006-12-18,1.311082,0.862724,0.065486,5.25853,241.231694,3.719576,6.421667
3,MTR_001,2006-12-19,0.992268,1.063872,0.020072,6.75039,241.999313,3.069492,4.926389
4,MTR_001,2006-12-20,1.325185,1.134982,0.026983,5.161571,242.308063,3.345704,6.467361


In [41]:
power_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14420 entries, 0 to 14419
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   meter_id          14420 non-null  object 
 1   date              14420 non-null  object 
 2   daily_mean_power  14330 non-null  float64
 3   daily_std_power   14330 non-null  float64
 4   daily_min_power   14330 non-null  float64
 5   daily_max_power   14330 non-null  float64
 6   voltage_mean      14330 non-null  float64
 7   voltage_std       14330 non-null  float64
 8   intensity_mean    14330 non-null  float64
dtypes: float64(7), object(2)
memory usage: 1014.0+ KB


In [42]:
lead_df=pd.read_csv('lead1.0-small.csv')
lead_df.head()

Unnamed: 0,building_id,timestamp,meter_reading,anomaly
0,1,2016-01-01 00:00:00,,0
1,32,2016-01-01 00:00:00,,0
2,41,2016-01-01 00:00:00,,0
3,55,2016-01-01 00:00:00,,0
4,69,2016-01-01 00:00:00,,0


In [43]:
lead_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1749494 entries, 0 to 1749493
Data columns (total 4 columns):
 #   Column         Dtype  
---  ------         -----  
 0   building_id    int64  
 1   timestamp      object 
 2   meter_reading  float64
 3   anomaly        int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 53.4+ MB


In [44]:
df=pd.read_csv('KPLC_Inspection_Report_2007_2008.csv')
df.head()

Unnamed: 0,meter_id,risk_level,risk_score,total_anomalies,percent_anomalous,max_streak_days,worst_anomaly_score,last_anomaly_date,alert_message
0,MTR_003,High,100.0,14,0.019178,1,-0.05824,2008-11-23,ALERT: Meter MTR_003 classified as High risk ...
1,MTR_009,High,69.2,15,0.020548,2,-0.049901,2008-11-23,ALERT: Meter MTR_009 classified as High risk ...
2,MTR_006,Medium,58.2,15,0.020548,2,-0.047639,2008-11-23,ALERT: Meter MTR_006 classified as Medium ris...
3,MTR_010,Medium,44.1,15,0.020548,2,-0.044759,2008-11-23,ALERT: Meter MTR_010 classified as Medium ris...
4,MTR_002,Medium,43.5,14,0.019178,2,-0.044645,2008-11-23,ALERT: Meter MTR_002 classified as Medium ris...


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   meter_id             10 non-null     object 
 1   risk_level           10 non-null     object 
 2   risk_score           10 non-null     float64
 3   total_anomalies      10 non-null     int64  
 4   percent_anomalous    10 non-null     float64
 5   max_streak_days      10 non-null     int64  
 6   worst_anomaly_score  10 non-null     float64
 7   last_anomaly_date    10 non-null     object 
 8   alert_message        10 non-null     object 
dtypes: float64(3), int64(2), object(4)
memory usage: 852.0+ bytes
