In [3]:
pip install pandas ctgan scikit-learn


Collecting ctgan
  Downloading ctgan-0.10.2-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.11.0 (from ctgan)
  Downloading rdt-1.13.2-py3-none-any.whl.metadata (10 kB)
Collecting torch>=2.2.0 (from ctgan)
  Downloading torch-2.5.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting Faker>=17 (from rdt>=1.11.0->ctgan)
  Downloading Faker-33.3.1-py3-none-any.whl.metadata (15 kB)
Collecting sympy==1.13.1 (from torch>=2.2.0->ctgan)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading ctgan-0.10.2-py3-none-any.whl (23 kB)
Downloading rdt-1.13.2-py3-none-any.whl (66 kB)
Downloading torch-2.5.1-cp312-cp312-win_amd64.whl (203.0 MB)
   ---------------------------------------- 0.0/203.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/203.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/203.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/203.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/203.0 MB 

In [2]:
import pandas as pd
from ctgan import CTGAN
from sklearn.model_selection import train_test_split

In [6]:
weather_file = "puneWeather.csv"  
weather_data = pd.read_csv(weather_file)

In [7]:
weather_data = weather_data[['datetime', 'temp', 'humidity', 'solarenergy', 'conditions']]
weather_data.rename(columns={
    'datetime': 'date',
    'temp': 'avg_temperature',
    'humidity': 'avg_humidity',
    'solar_energy_kWh_m2': 'solar_energy_kWh_m2',
    'conditions': 'weather_conditions'
}, inplace=True)

In [8]:
weather_data['date'] = pd.to_datetime(weather_data['date'])

In [9]:
energy_file = "updated_union_processed_dataset.csv"  
energy_data = pd.read_csv(energy_file)

In [10]:
print(energy_data.columns)


Index(['Date', 'Temp (°C)', 'Humidity (%)', 'Season', 'Solar Energy (kWh)',
       'Precipitation (mm)', 'Population', 'Total Usage (kWh)',
       'Urban Usage (kWh)', 'Rural Usage (kWh)'],
      dtype='object')


In [11]:
energy_data.rename(columns={'Date': 'date'}, inplace=True)


In [12]:
print(energy_data.head())


         date  Temp (°C)  Humidity (%)  Season  Solar Energy (kWh)  \
0  2018-01-01       19.4          62.6  Winter                18.6   
1  2018-01-02       19.4          64.0  Winter                18.7   
2  2018-01-03       19.5          66.6  Winter                18.8   
3  2018-01-04       20.1          66.6  Winter                18.4   
4  2018-01-05       20.0          66.7  Winter                17.5   

   Precipitation (mm)  Population  Total Usage (kWh)  Urban Usage (kWh)  \
0                 0.0   3130000.0             2870.0             2009.0   
1                 0.0   3130000.0             2875.0             2012.5   
2                 0.0   3130000.0             2890.0             2023.0   
3                 0.0   3130000.0             2930.0             2051.0   
4                 0.0   3130000.0             2875.0             2012.5   

   Rural Usage (kWh)  
0              861.0  
1              862.5  
2              867.0  
3              879.0  
4            

In [14]:
energy_data['date'] = pd.date_range(start="2019-01-01", periods=len(energy_data), freq='D')


In [17]:
energy_data['date'] = pd.to_datetime(energy_data['date'])
energy_data['Total Usage (kWh)'] = energy_data['Total Usage (kWh)'].fillna(58800000)



In [18]:
print(energy_data.columns)


Index(['date', 'Temp (°C)', 'Humidity (%)', 'Season', 'Solar Energy (kWh)',
       'Precipitation (mm)', 'Population', 'Total Usage (kWh)',
       'Urban Usage (kWh)', 'Rural Usage (kWh)'],
      dtype='object')


In [19]:
energy_data = energy_data.loc[:, ~energy_data.columns.duplicated()]


In [20]:
print(energy_data.columns)

Index(['date', 'Temp (°C)', 'Humidity (%)', 'Season', 'Solar Energy (kWh)',
       'Precipitation (mm)', 'Population', 'Total Usage (kWh)',
       'Urban Usage (kWh)', 'Rural Usage (kWh)'],
      dtype='object')


In [21]:
energy_data['residential_kWh'] = energy_data['Total Usage (kWh)'] * 0.35
energy_data['commercial_kWh'] = energy_data['Total Usage (kWh)'] * 0.25
energy_data['industrial_kWh'] = energy_data['Total Usage (kWh)'] * 0.3
energy_data['agricultural_kWh'] = energy_data['Total Usage (kWh)'] * 0.1

In [22]:
def add_covid_effect(date):
    if date < pd.Timestamp("2020-03-01"):
        return 0  # Pre-COVID
    elif pd.Timestamp("2020-03-01") <= date <= pd.Timestamp("2021-06-30"):
        return 1  # Lockdown
    else:
        return 2  # Post-COVID

energy_data['covid_effect'] = energy_data['date'].apply(add_covid_effect)

In [23]:
def adjust_energy(row):
    if row['covid_effect'] == 1:  # Lockdown
        row['residential_kWh'] *= 1.2  
        row['commercial_kWh'] *= 0.5  
        row['industrial_kWh'] *= 0.7  
    return row

energy_data = energy_data.apply(adjust_energy, axis=1)

In [24]:
merged_data = pd.merge(energy_data, weather_data, on='date', how='inner')

In [25]:
categorical_columns = ['weather_conditions', 'covid_effect']  # Categorical columns
continuous_columns = ['avg_temperature', 'avg_humidity', 'solar_energy_kWh_m2', 
                      'Total Usage (kWh)', 'residential_kWh', 'commercial_kWh', 
                      'industrial_kWh', 'agricultural_kWh']

# Split the data into training and testing sets
train_data, test_data = train_test_split(merged_data, test_size=0.2, random_state=42)

In [29]:
print(energy_data.columns)

Index(['Temp (°C)', 'Humidity (%)', 'Season', 'Solar Energy (kWh)',
       'Precipitation (mm)', 'Population', 'Total Usage (kWh)',
       'Urban Usage (kWh)', 'Rural Usage (kWh)', 'residential_kWh',
       'commercial_kWh', 'industrial_kWh', 'agricultural_kWh', 'covid_effect',
       'year', 'month', 'day'],
      dtype='object')


In [30]:
categorical_columns = ['Temp (°C)', 'Humidity (%)', 'Season', 'Solar Energy (kWh)',
       'Precipitation (mm)', 'Population', 'Total Usage (kWh)',
       'Urban Usage (kWh)', 'Rural Usage (kWh)', 'residential_kWh',
       'commercial_kWh', 'industrial_kWh', 'agricultural_kWh', 'covid_effect',
       'year', 'month', 'day']

In [27]:
energy_data['year'] = energy_data['date'].dt.year
energy_data['month'] = energy_data['date'].dt.month
energy_data['day'] = energy_data['date'].dt.day

In [35]:
train_data.columns = train_data.columns.str.strip()  
train_data.columns = train_data.columns.str.lower()  
print(train_data.columns)


Index(['date', 'temp (°c)', 'humidity (%)', 'season', 'solar energy (kwh)',
       'precipitation (mm)', 'population', 'total usage (kwh)',
       'urban usage (kwh)', 'rural usage (kwh)', 'residential_kwh',
       'commercial_kwh', 'industrial_kwh', 'agricultural_kwh', 'covid_effect',
       'avg_temperature', 'avg_humidity', 'solarenergy', 'weather_conditions'],
      dtype='object')


In [36]:
train_data.head()


Unnamed: 0,date,temp (°c),humidity (%),season,solar energy (kwh),precipitation (mm),population,total usage (kwh),urban usage (kwh),rural usage (kwh),residential_kwh,commercial_kwh,industrial_kwh,agricultural_kwh,covid_effect,avg_temperature,avg_humidity,solarenergy,weather_conditions
30,2019-01-31,20.7,47.9,Winter,21.4,0.0,3130000.0,3140.0,2198.0,942.0,1099.0,785.0,942.0,314.0,0,19.8,54.9,21.0,Clear
1178,2022-03-24,26.5,55.8,Summer,20.9,0.1,3130000.0,3506.819147,2001.325,850.53,1227.386702,876.704787,1052.045744,350.681915,2,27.8,56.5,17.1,"Rain, Partially cloudy"
1628,2023-06-17,26.8,74.1,Monsoon,16.4,,,3625.93809,2538.156663,1087.781427,1269.078332,906.484523,1087.781427,362.593809,2,27.4,73.4,20.3,"Rain, Partially cloudy"
764,2021-02-03,21.6,70.7,Winter,18.4,0.0,3130000.0,3229.726703,2001.08,850.432,1356.485215,403.715838,678.242608,322.97267,1,20.5,52.5,21.1,Clear
1317,2022-08-10,24.2,85.3,Monsoon,15.1,0.7,3130000.0,2851.694,2001.21,850.484,998.0929,712.9235,855.5082,285.1694,2,22.9,92.1,10.9,"Rain, Overcast"


In [51]:

if 'year' not in train_data.columns:
    print("Expected column 'year' not found. Check the dataset.")
else:
    train_data['year'] = train_data['year'].astype(str)
    train_data['month'] = train_data['month'].astype(str)
    train_data['day'] = train_data['day'].astype(str)


categorical_columns = ['season','weather_conditions', 'covid_effect']  

Expected column 'year' not found. Check the dataset.


In [52]:
print(train_data.columns)


Index(['date', 'temp (°c)', 'humidity (%)', 'season', 'solar energy (kwh)',
       'precipitation (mm)', 'population', 'total usage (kwh)',
       'urban usage (kwh)', 'rural usage (kwh)', 'residential_kwh',
       'commercial_kwh', 'industrial_kwh', 'agricultural_kwh', 'covid_effect',
       'avg_temperature', 'avg_humidity', 'solarenergy', 'weather_conditions'],
      dtype='object')


In [53]:
print(train_data.columns)
print(categorical_columns)


Index(['date', 'temp (°c)', 'humidity (%)', 'season', 'solar energy (kwh)',
       'precipitation (mm)', 'population', 'total usage (kwh)',
       'urban usage (kwh)', 'rural usage (kwh)', 'residential_kwh',
       'commercial_kwh', 'industrial_kwh', 'agricultural_kwh', 'covid_effect',
       'avg_temperature', 'avg_humidity', 'solarenergy', 'weather_conditions'],
      dtype='object')
['season', 'weather_conditions', 'covid_effect']


In [54]:
ctgan = CTGAN(epochs=300, batch_size=500)
ctgan.fit(train_data, discrete_columns=categorical_columns)


TypeError: Cannot cast DatetimeArray to dtype float64

In [49]:
print(train_data.dtypes)


date                  datetime64[ns]
temp (°c)                    float64
humidity (%)                 float64
season                        object
solar energy (kwh)           float64
precipitation (mm)           float64
population                   float64
total usage (kwh)            float64
urban usage (kwh)            float64
rural usage (kwh)            float64
residential_kwh              float64
commercial_kwh               float64
industrial_kwh               float64
agricultural_kwh             float64
covid_effect                   int64
avg_temperature              float64
avg_humidity                 float64
solarenergy                  float64
weather_conditions            object
dtype: object


In [56]:
train_data.drop('date', axis=1, errors='ignore')


Unnamed: 0,temp (°c),humidity (%),season,solar energy (kwh),precipitation (mm),population,total usage (kwh),urban usage (kwh),rural usage (kwh),residential_kwh,commercial_kwh,industrial_kwh,agricultural_kwh,covid_effect,avg_temperature,avg_humidity,solarenergy,weather_conditions
30,20.7,47.9,Winter,21.4,0.0,3130000.0,3140.000000,2198.000000,942.000000,1099.000000,785.000000,942.000000,314.000000,0,19.8,54.9,21.0,Clear
1178,26.5,55.8,Summer,20.9,0.1,3130000.0,3506.819147,2001.325000,850.530000,1227.386702,876.704787,1052.045744,350.681915,2,27.8,56.5,17.1,"Rain, Partially cloudy"
1628,26.8,74.1,Monsoon,16.4,,,3625.938090,2538.156663,1087.781427,1269.078332,906.484523,1087.781427,362.593809,2,27.4,73.4,20.3,"Rain, Partially cloudy"
764,21.6,70.7,Winter,18.4,0.0,3130000.0,3229.726703,2001.080000,850.432000,1356.485215,403.715838,678.242608,322.972670,1,20.5,52.5,21.1,Clear
1317,24.2,85.3,Monsoon,15.1,0.7,3130000.0,2851.694000,2001.210000,850.484000,998.092900,712.923500,855.508200,285.169400,2,22.9,92.1,10.9,"Rain, Overcast"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,24.2,87.1,Monsoon,9.1,,,3496.676327,2447.673429,1049.002898,1223.836715,874.169082,1049.002898,349.667633,2,23.6,94.5,3.6,"Rain, Overcast"
1095,20.6,70.2,Winter,17.5,0.0,3130000.0,3375.384807,2001.030000,850.412000,1181.384682,843.846202,1012.615442,337.538481,2,19.9,77.0,17.2,Partially cloudy
1130,20.7,55.2,Winter,21.1,0.0,3130000.0,3150.081193,2001.035000,850.414000,1102.528418,787.520298,945.024358,315.008119,2,19.7,66.7,19.6,Partially cloudy
1294,24.2,89.8,Monsoon,7.4,10.0,3130000.0,2851.694000,2001.210000,850.484000,998.092900,712.923500,855.508200,285.169400,2,23.1,92.1,5.6,"Rain, Overcast"


In [58]:
train_data['year'] = train_data['date'].dt.year
train_data['month'] = train_data['date'].dt.month
train_data['day'] = train_data['date'].dt.day
train_data['weekday'] = train_data['date'].dt.weekday
train_data.drop('date', axis=1, inplace=True)


In [59]:
ctgan = CTGAN(epochs=300, batch_size=500)
ctgan.fit(train_data, discrete_columns=categorical_columns)


In [68]:
from ctgan import CTGAN
from ctgan.data_transformer import DataTransformer

transformer = DataTransformer()
transformer.fit(train_data, discrete_columns=categorical_columns)
transformed_data = transformer.transform(train_data)

print("Transformed data shape:", transformed_data.shape)


Transformed data shape: (1676, 165)


In [70]:
from ctgan import CTGAN

ctgan = CTGAN()
ctgan.fit(transformed_data, epochs=300)


In [71]:
num_samples = len(train_data)
synthetic_data = ctgan.sample(num_samples)
print("Synthetic data shape:", synthetic_data.shape)

Synthetic data shape: (1676, 165)


In [75]:

if synthetic_data.shape[1] != transformed_data.shape[1]:
    print("Shape mismatch detected.")
    raise ValueError(f"Synthetic data columns: {synthetic_data.shape[1]}, "
                     f"Expected: {transformed_data.shape[1]}")



In [80]:
print("Train data shape:", train_data.shape)
print("Synthetic data shape:", synthetic_data.shape)


Train data shape: (1676, 22)
Synthetic data shape: (1676, 165)


In [81]:
recovered_data = transformer.inverse_transform(synthetic_data)


ValueError: Shape of passed values is (1676, 2), indices imply (1676, 3)

In [79]:
import pandas as pd
synthetic_df = pd.DataFrame(synthetic_data, columns=train_data.columns)

synthetic_df.to_csv("synthetic_energy_data.csv", index=False)

print("Synthetic Data Generated and Saved as synthetic_energy_data.csv")


ValueError: Shape of passed values is (1676, 165), indices imply (1676, 22)

In [76]:
recovered_data = transformer.inverse_transform(synthetic_data)
print("Recovered data shape:", recovered_data.shape)

ValueError: Shape of passed values is (1676, 2), indices imply (1676, 3)

In [67]:
print("Train data shape:", train_data.shape)
print("Discrete columns:", categorical_columns)


Train data shape: (1676, 22)
Discrete columns: ['season', 'weather_conditions', 'covid_effect']
