In [5]:
!pip install numpy




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\tharu\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np
import os

In [3]:
# Pipeline configuration: input CSV path, output folder, and output CSV path.
CLEANED_DATA_DIR = 'data'
RAW_DATA_FILE = 'raw_data/SME-divya-3p-1ms-influxdata_1apr2021-31dec2021.csv'
# The cleaned file always lives inside CLEANED_DATA_DIR.
CLEANED_DATA_FILE = os.path.join(
    CLEANED_DATA_DIR,
    'cleaned_bangalore_data.csv',
)

In [4]:
# Read the raw CSV export into `df`; text following a '#' is treated as a
# comment by the parser (comment='#').
print(f"Loading raw data from {RAW_DATA_FILE}...")
try:
    df = pd.read_csv(RAW_DATA_FILE, comment='#')
except FileNotFoundError:
    print(f"ERROR: Raw data file not found at {RAW_DATA_FILE}")
    print("Please make sure the file is in the 'raw_data' directory.")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down, while an exception simply stops execution at this
    # cell and keeps the traceback visible.
    raise

Loading raw data from raw_data/SME-divya-3p-1ms-influxdata_1apr2021-31dec2021.csv...


In [None]:
# Peek at the first five rows of `df`.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,result,table,_start,_stop,_time,_value,_field,_measurement
0,,,0,2020-12-31T18:30:00Z,2022-12-31T18:29:59Z,2021-04-06T16:00:00.246Z,0.87,pf,Phase3
1,,,0,2020-12-31T18:30:00Z,2022-12-31T18:29:59Z,2021-04-06T16:00:07.203Z,0.87,pf,Phase3
2,,,0,2020-12-31T18:30:00Z,2022-12-31T18:29:59Z,2021-04-06T16:00:14.167Z,0.88,pf,Phase3
3,,,0,2020-12-31T18:30:00Z,2022-12-31T18:29:59Z,2021-04-06T16:00:21.245Z,0.88,pf,Phase3
4,,,0,2020-12-31T18:30:00Z,2022-12-31T18:29:59Z,2021-04-06T16:00:29.223Z,0.87,pf,Phase3


In [5]:
# Keep only the four columns the rest of the pipeline uses, and fail loudly
# (naming the absent columns) if the CSV does not provide all of them.
required_columns = ['_time', '_value', '_field', '_measurement']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print("ERROR: The CSV file is missing one of the required columns: '_time', '_value', '_field', '_measurement'")
    # Raise instead of exit(): exit() asks the notebook kernel to shut down,
    # whereas an exception halts execution here and leaves state inspectable.
    raise KeyError(f"Missing required columns: {missing_columns}")

df = df[required_columns]
print("Successfully loaded and selected relevant columns.")

print(f"Raw data shape: {df.shape}")
print(df.head())

Successfully loaded and selected relevant columns.
Raw data shape: (5717164, 4)
                      _time  _value _field _measurement
0  2021-04-06T16:00:00.246Z    0.87     pf       Phase3
1  2021-04-06T16:00:07.203Z    0.87     pf       Phase3
2  2021-04-06T16:00:14.167Z    0.88     pf       Phase3
3  2021-04-06T16:00:21.245Z    0.88     pf       Phase3
4  2021-04-06T16:00:29.223Z    0.87     pf       Phase3


In [2]:
# NOTE(review): this cell repeats the shape/head printout that ends the
# column-selection cell and requires `df` to exist already; running it on a
# kernel where `df` has not been loaded raises NameError. Consider deleting it.
print(f"Raw data shape: {df.shape}")
print(df.head())

NameError: name 'df' is not defined

In [7]:
# Reshape from long format (one row per reading) to wide format: one row per
# timestamp and one column per (_measurement, _field) pair.
print("Pivoting data (long to wide format)... This may take a moment.")
try:
    df_pivoted = df.pivot_table(
        index='_time',
        columns=['_measurement', '_field'],
        values='_value',
        # Spell out the default aggregation: readings that share the same
        # time, measurement and field are averaged rather than rejected.
        aggfunc='mean',
    )
except Exception as e:
    print(f"ERROR during pivoting: {e}")
    print("Check that '_value' is numeric; duplicate entries (same time, measurement, and field) are averaged, not rejected.")
    # Re-raise rather than exit(): exit() asks the notebook kernel to shut
    # down and discards the traceback needed to diagnose the failure.
    raise

print("Pivot complete.")
print(f"Pivoted data shape: {df_pivoted.shape}")
print(df_pivoted.head())

Pivoting data (long to wide format)... This may take a moment.
Pivot complete.
Pivoted data shape: (1733295, 6)
_measurement              Phase2            Phase3                   
_field                   current voltage frequency  pf  power voltage
_time                                                                
2021-04-06T15:00:02.187Z    0.75   245.1       NaN NaN    NaN     NaN
2021-04-06T15:00:04.433Z     NaN     NaN      50.0 NaN  332.8   243.0
2021-04-06T15:00:09.044Z    0.75   245.1       NaN NaN    NaN     NaN
2021-04-06T15:00:11.408Z     NaN     NaN      50.0 NaN  333.1   243.1
2021-04-06T15:00:15.901Z    0.74   245.1       NaN NaN    NaN     NaN


In [8]:
print("Cleaning column names (e.g., ('Phase1', 'pf') -> 'Phase1_pf')...")

# Join only tuple labels (the two-level header produced by the pivot). Labels
# that are already plain strings are kept unchanged, so re-running this cell
# does not turn 'Phase2_current' into 'P_h_a_s_e_2_...' by joining characters.
df_pivoted.columns = [
    '_'.join(col) if isinstance(col, tuple) else col
    for col in df_pivoted.columns.values
]
print(f"New columns: {df_pivoted.columns.tolist()[:5]}...")

Cleaning column names (e.g., ('Phase1', 'pf') -> 'Phase1_pf')...
New columns: ['Phase2_current', 'Phase2_voltage', 'Phase3_frequency', 'Phase3_pf', 'Phase3_power']...


In [10]:
# Parse the string timestamps in the index into datetimes so the frame can be
# resampled by time.
print("Converting index to datetime (using ISO8601 format)...")

try:
    df_pivoted.index = pd.to_datetime(df_pivoted.index, format='ISO8601')
except Exception as e:
    print(f"ERROR converting datetime index: {e}")
    print("If this error persists, try: df_pivoted.index = pd.to_datetime(df_pivoted.index, format='mixed')")
    # Re-raise rather than exit(): exit() asks the notebook kernel to shut
    # down, while an exception stops here with the full traceback.
    raise

Converting index to datetime (using ISO8601 format)...


In [11]:
print("Resampling data to 1-hour intervals and imputing missing values...")
df_final = df_pivoted.resample('1H').mean()

df_final = df_final.ffill()

df_final = df_final.bfill() # Back-fill to handle NaNs at the start

print(f"Final cleaned data shape: {df_final.shape}")
print(df_final.head())

Resampling data to 1-hour intervals and imputing missing values...
Final cleaned data shape: (3188, 6)
                           Phase2_current  Phase2_voltage  Phase3_frequency  \
_time                                                                         
2021-04-06 15:00:00+00:00        0.279607      245.913626         49.989815   
2021-04-06 16:00:00+00:00        0.748005      244.346244         49.936300   
2021-04-06 17:00:00+00:00        0.272659      237.977802         49.819121   
2021-04-06 18:00:00+00:00        0.375438      237.161598         49.954103   
2021-04-06 19:00:00+00:00        0.478763      238.679474         49.999736   

                           Phase3_pf  Phase3_power  Phase3_voltage  
_time                                                               
2021-04-06 15:00:00+00:00   0.909368    361.782176      243.481019  
2021-04-06 16:00:00+00:00   0.909368    332.292037      242.144028  
2021-04-06 17:00:00+00:00   0.946044    211.166154      235.926374 

  df_final = df_pivoted.resample('1H').mean()


In [12]:
# Write the final frame to CSV, creating the output folder if it is missing.
print("Saving cleaned, model-ready data...")
os.makedirs(CLEANED_DATA_DIR, exist_ok=True)
df_final.to_csv(CLEANED_DATA_FILE)

# Two status lines, emitted with a single print call.
print(
    f"--- Pipeline Complete ---",
    f"Cleaned data saved to: {CLEANED_DATA_FILE}",
    sep="\n",
)

Saving cleaned, model-ready data...
--- Pipeline Complete ---
Cleaned data saved to: data/cleaned_bangalore_data.csv


In [None]:
# Peek at the first five rows of `df_final`.
df_final.head(n=5)

Unnamed: 0_level_0,Phase2_current,Phase2_voltage,Phase3_frequency,Phase3_pf,Phase3_power,Phase3_voltage
_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-04-06 15:00:00+00:00,0.279607,245.913626,49.989815,0.909368,361.782176,243.481019
2021-04-06 16:00:00+00:00,0.748005,244.346244,49.9363,0.909368,332.292037,242.144028
2021-04-06 17:00:00+00:00,0.272659,237.977802,49.819121,0.946044,211.166154,235.926374
2021-04-06 18:00:00+00:00,0.375438,237.161598,49.954103,0.953615,246.339487,235.372564
2021-04-06 19:00:00+00:00,0.478763,238.679474,49.999736,0.96248,224.555673,237.166491


: 