In [1]:
import pandas as pd

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so the
# dataset stored on Drive can be opened through an ordinary file path.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# **Sampling data: 50 unique customer LCLids from each of the Std and ToU tariffs**

In [8]:


filename = '/content/drive/MyDrive/LCL-FullData/CC_LCL-FullData.csv'
chunksize = 10**6  # Adjust based on available memory
N_IDS_PER_TARIFF = 50  # number of households (LCLid) sampled for each tariff

# --- Pass 1: choose the households -------------------------------------------
# Take the first N_IDS_PER_TARIFF distinct LCLid values encountered for each
# tariff, in file order. Keeping first-seen order (instead of slicing a set)
# makes the sample identical on every run. Only the two columns needed for the
# selection are parsed, and no row data is accumulated in this pass.
ids_by_tariff = {'Std': [], 'ToU': []}
for chunk in pd.read_csv(filename, chunksize=chunksize, usecols=['LCLid', 'stdorToU']):
    for tariff, picked in ids_by_tariff.items():
        remaining = N_IDS_PER_TARIFF - len(picked)
        if remaining <= 0:
            continue
        # Series.unique() returns values in order of first appearance.
        seen = chunk.loc[chunk['stdorToU'] == tariff, 'LCLid'].dropna().unique()
        fresh = [lclid for lclid in seen if lclid not in picked]
        picked.extend(fresh[:remaining])

    # Stop scanning as soon as both tariffs have their full quota of IDs.
    if all(len(picked) >= N_IDS_PER_TARIFF for picked in ids_by_tariff.values()):
        break

std_ids = ids_by_tariff['Std']
tou_ids = ids_by_tariff['ToU']

# --- Pass 2: collect every row for the selected households --------------------
# The whole file is read again because a household's rows may also occur in
# chunks that pass 1 never reached or had not yet selected the household for.
# NOTE(review): the KWH column name carries a trailing space and the Std sample
# appears to hold non-numeric entries — confirm and convert before analysis.
std_parts = []
tou_parts = []
for chunk in pd.read_csv(filename, chunksize=chunksize):
    std_parts.append(chunk[chunk['LCLid'].isin(std_ids) & (chunk['stdorToU'] == 'Std')])
    tou_parts.append(chunk[chunk['LCLid'].isin(tou_ids) & (chunk['stdorToU'] == 'ToU')])

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
std_data = pd.concat(std_parts) if std_parts else pd.DataFrame()
tou_data = pd.concat(tou_parts) if tou_parts else pd.DataFrame()

print("STD Data:")
print(std_data.head())
print("\nTOU Data:")
print(tou_data.head())

STD Data:
       LCLid stdorToU                     DateTime KWH/hh (per half hour) 
0  MAC000002      Std  2012-10-12 00:30:00.0000000                      0 
1  MAC000002      Std  2012-10-12 01:00:00.0000000                      0 
2  MAC000002      Std  2012-10-12 01:30:00.0000000                      0 
3  MAC000002      Std  2012-10-12 02:00:00.0000000                      0 
4  MAC000002      Std  2012-10-12 02:30:00.0000000                      0 

TOU Data:
               LCLid stdorToU                     DateTime  \
134148703  MAC000005      ToU  2012-06-01 10:30:00.0000000   
134148704  MAC000005      ToU  2012-06-01 11:00:00.0000000   
134148705  MAC000005      ToU  2012-06-01 11:30:00.0000000   
134148706  MAC000005      ToU  2012-06-01 12:00:00.0000000   
134148707  MAC000005      ToU  2012-06-01 12:30:00.0000000   

          KWH/hh (per half hour)   
134148703                  0.095   
134148704                  0.051   
134148705                  0.098   
134148706   

In [9]:
# Per-column summary of the sampled Std-tariff rows (used to confirm the number of unique LCLid values).
std_data.describe()

Unnamed: 0,LCLid,stdorToU,DateTime,KWH/hh (per half hour)
count,1782397,1782397,1782397,1782397
unique,50,1,39106,4938
top,MAC000018,Std,2012-12-21 00:00:00.0000000,0
freq,39082,1782397,100,88707


In [10]:
# Per-column summary of the sampled ToU-tariff rows (used to confirm the number of unique LCLid values).
tou_data.describe()

Unnamed: 0,LCLid,stdorToU,DateTime,KWH/hh (per half hour)
count,1852623,1852623,1852623,1852623.0
unique,50,1,39755,2793.0
top,MAC000147,ToU,2012-11-20 00:00:00.0000000,0.058
freq,39752,1852623,100,11575.0


**Both data frames, `std_data` and `tou_data`, contain 50 unique LCLids.**

## **Finding the common interval for the Std and ToU data frames**

In [26]:
# Parse the DateTime strings into real timestamps; values that cannot be
# parsed become NaT and the rows carrying them are then discarded.
std_data = (
    std_data
    .assign(DateTime=pd.to_datetime(std_data['DateTime'], errors='coerce'))
    .dropna(subset=['DateTime'])
)
tou_data = (
    tou_data
    .assign(DateTime=pd.to_datetime(tou_data['DateTime'], errors='coerce'))
    .dropna(subset=['DateTime'])
)


In [40]:
def _reading_span(frame):
    """Return one row per LCLid with its earliest ('min') and latest ('max') DateTime."""
    per_household = frame.groupby('LCLid')['DateTime']
    return per_household.agg(['min', 'max']).reset_index()


# First and last reading timestamp of every sampled household, per tariff
std_dates = _reading_span(std_data)
tou_dates = _reading_span(tou_data)

print("STD Dates:", std_dates.head(), sep="\n")
print("\nTOU Dates:", tou_dates.head(), sep="\n")

STD Dates:
       LCLid                 min        max
0  MAC000002 2012-10-12 00:30:00 2014-02-28
1  MAC000003 2012-02-20 13:00:00 2014-02-28
2  MAC000004 2012-05-08 13:00:00 2014-02-28
3  MAC000007 2012-09-24 12:00:00 2014-02-28
4  MAC000008 2012-05-05 09:00:00 2013-10-30

TOU Dates:
       LCLid                 min        max
0  MAC000005 2012-06-01 10:30:00 2014-02-28
1  MAC000014 2012-10-15 11:30:00 2014-02-28
2  MAC000015 2011-12-06 13:00:00 2014-02-28
3  MAC000017 2011-12-06 15:30:00 2014-02-28
4  MAC000031 2011-12-07 12:00:00 2014-02-28


In [19]:
# Save the per-household date ranges for the Std sample to the working directory.
# With to_csv's default index=True the row index is written as an extra, unnamed first column.
std_dates.to_csv('std_dates.csv')

In [14]:
# Save the per-household date ranges for the ToU sample to the working directory.
# With to_csv's default index=True the row index is written as an extra, unnamed first column.
tou_dates.to_csv('tou_dates.csv')

In [41]:
# Give the aggregated columns descriptive names: 'min' -> 'start', 'max' -> 'end'
std_dates = std_dates.rename(columns={'min': 'start', 'max': 'end'})

In [42]:
# Give the aggregated columns descriptive names: 'min' -> 'start', 'max' -> 'end'
tou_dates = tou_dates.rename(columns={'min': 'start', 'max': 'end'})

In [43]:
# Stack the Std and ToU date ranges into one table with a fresh 0..n-1 index
date_frames = (std_dates, tou_dates)
combined_dates = pd.concat(date_frames, ignore_index=True)

In [44]:
# Display the stacked start/end table for both tariff samples.
combined_dates

Unnamed: 0,LCLid,start,end
0,MAC000002,2012-10-12 00:30:00,2014-02-28
1,MAC000003,2012-02-20 13:00:00,2014-02-28
2,MAC000004,2012-05-08 13:00:00,2014-02-28
3,MAC000007,2012-09-24 12:00:00,2014-02-28
4,MAC000008,2012-05-05 09:00:00,2013-10-30
...,...,...,...
95,MAC000288,2012-03-01 15:00:00,2014-02-28
96,MAC000290,2012-03-02 09:00:00,2014-02-28
97,MAC000292,2012-03-02 10:00:00,2014-02-28
98,MAC000293,2012-03-02 10:00:00,2014-02-28


In [45]:
# The window shared by every household opens at the latest first reading
# and closes at the earliest last reading.
common_start, common_end = combined_dates['start'].max(), combined_dates['end'].min()

# A shared window exists only when it opens no later than it closes.
print(
    f"Common interval: Start = {common_start}, End = {common_end}"
    if common_start <= common_end
    else "No common interval found."
)

Common interval: Start = 2012-10-22 11:00:00, End = 2013-01-16 00:00:00


## **Common interval: Start = 2012-10-22 11:00:00, End = 2013-01-16 00:00:00**

## **Next: find the combination of LCLids that gives the longest possible common interval**

*   **Visualize `std_data` and `tou_data`**
*   **Handle missing values**
