In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print every file path found,
# in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/predict-energy-behavior-of-prosumers/client.csv
/kaggle/input/predict-energy-behavior-of-prosumers/gas_prices.csv
/kaggle/input/predict-energy-behavior-of-prosumers/electricity_prices.csv
/kaggle/input/predict-energy-behavior-of-prosumers/weather_station_to_county_mapping.csv
/kaggle/input/predict-energy-behavior-of-prosumers/public_timeseries_testing_util.py
/kaggle/input/predict-energy-behavior-of-prosumers/historical_weather.csv
/kaggle/input/predict-energy-behavior-of-prosumers/county_id_to_name_map.json
/kaggle/input/predict-energy-behavior-of-prosumers/train.csv
/kaggle/input/predict-energy-behavior-of-prosumers/forecast_weather.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/sample_submission.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/client.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/gas_prices.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/electricity

In [3]:
def _expand_daily_to_hourly(df_daily, date_column):
    """Repeat every row of a daily table 24 times, once per hour.

    Returns a new frame in which `date_column` is replaced by a `datetime`
    column (appended last) equal to the parsed date plus 0..23 hours. The
    input frame is not modified.
    """
    hour_offsets = pd.DataFrame({'_hour_offset': range(24)})
    hourly = df_daily.merge(hour_offsets, how='cross')
    hourly['datetime'] = (
        pd.to_datetime(hourly[date_column])
        + pd.to_timedelta(hourly['_hour_offset'], unit='h')
    )
    return hourly.drop(columns=[date_column, '_hour_offset'])


def _attach_county(df_weather, county_by_coords):
    """Return a copy of `df_weather` with a trailing `county` column.

    The county is looked up by the exact (latitude, longitude) pair; rows whose
    coordinates are not in `county_by_coords` are dropped.
    """
    tagged = df_weather.copy()
    tagged['county'] = [
        county_by_coords.get(coords, -1)
        for coords in zip(tagged['latitude'], tagged['longitude'])
    ]
    # Explicit copy so that later column assignments do not hit a filtered view.
    return tagged[tagged['county'] != -1].copy()


def generate_featuers(
    df_data,
    df_client,
    df_gas_prices,
    df_electricity_prices,
    df_forecast_weather,
    df_historical_weather,
    df_weather_station_to_county_mapping,
    train_start_timestep = '2021-09-01 11:00:00',
    gas_end_date = '2022-01-10 23:00:00'
    
    ):
    """Left-join client, price and weather tables onto the hourly target table.

    None of the input frames is modified.

    Parameters
    ----------
    df_data : DataFrame
        Hourly rows with at least `county`, `product_type`, `is_business`
        and `datetime`.
    df_client : DataFrame
        Daily client table (`date`, `data_block_id`, join keys, client columns);
        each day is expanded to 24 hourly rows before joining.
    df_gas_prices : DataFrame
        Daily gas prices (`forecast_date`, `origin_date`, `data_block_id`, price
        columns); expanded to hourly rows and joined on `datetime`.
    df_electricity_prices : DataFrame
        Hourly electricity prices; `forecast_date` is used as the join `datetime`.
    df_forecast_weather : DataFrame
        Forecast grid with `latitude`, `longitude`, `origin_datetime`,
        `hours_ahead` and value columns.
    df_historical_weather : DataFrame
        Observed weather grid with `latitude`, `longitude`, `datetime`,
        `data_block_id` and value columns.
    df_weather_station_to_county_mapping : DataFrame
        Grid point to county mapping with `latitude`, `longitude`, `county`;
        rows containing any null are ignored.
    train_start_timestep : str
        Historical weather rows whose `datetime` compares lower than this value
        are discarded.
    gas_end_date : str
        Not used. Kept only so existing calls keep working; the cutoff is always
        the last hourly timestamp derived from `df_gas_prices`.

    Returns
    -------
    DataFrame
        Rows of `df_data` up to the last timestamp covered by both the client
        and the gas tables, extended with client, gas, electricity, forecast
        weather (`<column>_<bucket>`, bucket = (hours_ahead - 1) // 24 + 1,
        missing values filled with 0) and county-averaged historical weather
        columns.
    """
    # (latitude, longitude) rounded to one decimal -> county
    stations = df_weather_station_to_county_mapping.dropna().sort_values(by="county")
    county_by_coords = dict(zip(
        zip(stations['latitude'].round(1), stations['longitude'].round(1)),
        stations['county'],
    ))

    # Merge df_data and df_client
    client_hourly = _expand_daily_to_hourly(df_client, 'date').drop(columns=['data_block_id'])

    # Work on a copy: the caller's frame must keep its original `datetime` column.
    data = df_data.copy()
    data['datetime'] = pd.to_datetime(data['datetime'])
    data = data.merge(
        client_hourly,
        on=['county', 'product_type', 'is_business', 'datetime'],
        how='left',
    )
    data = data[data['datetime'] <= client_hourly['datetime'].max()]

    # Merge df_data and df_gas_prices
    gas_hourly = _expand_daily_to_hourly(df_gas_prices, 'forecast_date').drop(
        columns=['origin_date', 'data_block_id']
    )
    data = data[data['datetime'] <= gas_hourly['datetime'].max()]
    data = data.merge(gas_hourly, on=['datetime'], how='left')

    # Merge df_data and df_electricity_prices
    electricity = df_electricity_prices.drop(columns=['origin_date', 'data_block_id'])
    electricity['forecast_date'] = pd.to_datetime(electricity['forecast_date'])
    electricity = electricity.rename(columns={'forecast_date': 'datetime'})
    data = data.merge(electricity, on=['datetime'], how='left')

    # Merge df_data and df_forecast_weather
    forecast = _attach_county(df_forecast_weather, county_by_coords)
    origin = pd.to_datetime(forecast['origin_datetime'])
    # Every forecast run is treated as issued at 02:00 on its origin date.
    origin = pd.to_datetime(origin.dt.date.astype(str) + ' 02:00:00')
    forecast['forecast_datetime'] = origin + pd.to_timedelta(forecast['hours_ahead'], unit='h')
    forecast = forecast.drop('origin_datetime', axis=1)
    forecast = forecast.sort_values(by=['latitude', 'longitude', 'forecast_datetime', 'hours_ahead'])
    # Horizon bucket: 1 for hours_ahead 1..24, 2 for 25..48, and so on.
    forecast['cumcount'] = (forecast['hours_ahead'] - 1) // 24 + 1

    forecast_keys = ['county', 'forecast_datetime', 'cumcount']
    forecast_skip = {'latitude', 'longitude', 'hours_ahead', *forecast_keys}
    forecast_values = [col for col in forecast.columns if col not in forecast_skip]
    # Average only the value columns; the group keys are not aggregated, so no
    # helper columns have to be dropped afterwards whatever buckets are present.
    forecast = forecast.groupby(forecast_keys)[forecast_values].mean().unstack(level=-1)
    forecast.columns = [f'{col[0]}_{col[1]}' for col in forecast.columns]
    forecast = forecast.reset_index().rename(columns={'forecast_datetime': 'datetime'})
    forecast = forecast.fillna(0)

    data = data.merge(forecast, on=['county', 'datetime'], how='left')

    # Merge df_data and df_historical_weather
    recent_history = df_historical_weather[df_historical_weather['datetime'] >= train_start_timestep]
    historical = _attach_county(recent_history, county_by_coords)
    historical['datetime'] = pd.to_datetime(historical['datetime'])
    historical = historical.sort_values(by=['latitude', 'longitude', 'datetime'])
    historical_skip = {'latitude', 'longitude', 'datetime', 'county', 'data_block_id'}
    historical_values = [col for col in historical.columns if col not in historical_skip]
    historical = (
        historical.groupby(['county', 'datetime'])[historical_values]
        .mean()
        .reset_index()
    )

    data = data.merge(historical, on=['county', 'datetime'], how='left')

    return data

In [4]:
DATA_DIR = "/kaggle/input/predict-energy-behavior-of-prosumers/"
# Load the competition tables with pandas' default parsing. No date column is
# converted here; generate_featuers calls pd.to_datetime on the columns it needs.
df_data = pd.read_csv(f"{DATA_DIR}train.csv")
df_client = pd.read_csv(f"{DATA_DIR}client.csv")
df_historical_weather = pd.read_csv(f"{DATA_DIR}historical_weather.csv")
df_forecast_weather = pd.read_csv(f"{DATA_DIR}forecast_weather.csv")
df_electricity_prices = pd.read_csv(f"{DATA_DIR}electricity_prices.csv")
df_gas_prices = pd.read_csv(f"{DATA_DIR}gas_prices.csv")
df_weather_station_to_county_mapping = pd.read_csv(f"{DATA_DIR}weather_station_to_county_mapping.csv")


In [5]:
# Build the merged feature table; the two optional arguments keep their defaults.
combined_df = generate_featuers(
    df_data=df_data,
    df_client=df_client,
    df_gas_prices=df_gas_prices,
    df_electricity_prices=df_electricity_prices,
    df_forecast_weather=df_forecast_weather,
    df_historical_weather=df_historical_weather,
    df_weather_station_to_county_mapping=df_weather_station_to_county_mapping,
)

In [6]:
# Write the merged feature table to the Kaggle working directory, without the index column.
combined_df.to_csv('/kaggle/working/combined_df.csv', index=False)

In [7]:
# Display the column names of the merged table returned by generate_featuers.
combined_df.columns

Index(['county', 'is_business', 'product_type', 'target', 'is_consumption',
       'datetime', 'data_block_id', 'row_id', 'prediction_unit_id',
       'eic_count', 'installed_capacity', 'lowest_price_per_mwh',
       'highest_price_per_mwh', 'euros_per_mwh', 'temperature_1',
       'temperature_2', 'dewpoint_1', 'dewpoint_2', 'cloudcover_high_1',
       'cloudcover_high_2', 'cloudcover_low_1', 'cloudcover_low_2',
       'cloudcover_mid_1', 'cloudcover_mid_2', 'cloudcover_total_1',
       'cloudcover_total_2', '10_metre_u_wind_component_1',
       '10_metre_u_wind_component_2', '10_metre_v_wind_component_1',
       '10_metre_v_wind_component_2', 'data_block_id_1', 'data_block_id_2',
       'direct_solar_radiation_1', 'direct_solar_radiation_2',
       'surface_solar_radiation_downwards_1',
       'surface_solar_radiation_downwards_2', 'snowfall_1', 'snowfall_2',
       'total_precipitation_1', 'total_precipitation_2', 'temperature',
       'dewpoint', 'rain', 'snowfall', 'surface_pressu

In [8]:
# Group the merged table by county, business flag, product type and consumption flag.
grp = combined_df.groupby(['county','is_business','product_type','is_consumption'])

In [9]:
# Show the leading rows of every group (GroupBy.head with its default row count).
grp.head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,eic_count,...,surface_pressure,cloudcover_total,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation,diffuse_radiation
0,0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0,108.0,...,,,,,,,,,,
1,0,0,1,96.590,1,2021-09-01 00:00:00,0,1,0,108.0,...,,,,,,,,,,
2,0,0,2,0.000,0,2021-09-01 00:00:00,0,2,1,17.0,...,,,,,,,,,,
3,0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1,17.0,...,,,,,,,,,,
4,0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2,688.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1338977,14,1,2,22.009,1,2022-11-01 02:00:00,426,1338977,68,5.0,...,1009.833333,99.666667,100.0,6.000000,26.333333,4.111111,224.333333,0.0,0.0,0.0
1339114,14,1,2,0.000,0,2022-11-01 03:00:00,426,1339114,68,5.0,...,1009.566667,99.000000,100.0,2.000000,35.333333,3.805556,218.333333,0.0,0.0,0.0
1339115,14,1,2,25.434,1,2022-11-01 03:00:00,426,1339115,68,5.0,...,1009.566667,99.000000,100.0,2.000000,35.333333,3.805556,218.333333,0.0,0.0,0.0
1339252,14,1,2,0.000,0,2022-11-01 04:00:00,426,1339252,68,5.0,...,1009.333333,92.000000,100.0,0.333333,6.333333,3.472222,213.000000,0.0,0.0,0.0


In [10]:
# Per-group summary statistics for the remaining columns of the merged table.
grp.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,target,target,target,target,target,target,target,target,datetime,datetime,...,direct_solar_radiation,direct_solar_radiation,diffuse_radiation,diffuse_radiation,diffuse_radiation,diffuse_radiation,diffuse_radiation,diffuse_radiation,diffuse_radiation,diffuse_radiation
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,mean,min,25%,50%,75%,max,std,count,mean,...,max,std,count,mean,min,25%,50%,75%,max,std
county,is_business,product_type,is_consumption,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
0,0,1,0,15260.0,220.606349,0.000,0.18300,1.5010,112.36225,3445.013,509.590282,15264,2022-07-15 23:30:00,...,685.0,130.358260,15253.0,41.033097,0.0,0.0,0.666667,73.0,339.5,60.169203
0,0,1,1,15260.0,442.797622,22.972,185.90725,379.2575,602.03550,1638.667,316.116769,15264,2022-07-15 23:30:00,...,685.0,130.358260,15253.0,41.033097,0.0,0.0,0.666667,73.0,339.5,60.169203
0,0,2,0,15260.0,8.372791,0.000,0.00000,0.0010,5.07525,113.911,18.232197,15264,2022-07-15 23:30:00,...,685.0,130.358260,15253.0,41.033097,0.0,0.0,0.666667,73.0,339.5,60.169203
0,0,2,1,15260.0,23.173754,0.644,10.93075,21.3120,33.35375,76.821,14.296245,15264,2022-07-15 23:30:00,...,685.0,130.358260,15253.0,41.033097,0.0,0.0,0.666667,73.0,339.5,60.169203
0,0,3,0,15260.0,759.214344,0.000,1.64175,8.6195,466.78150,11209.014,1664.822219,15264,2022-07-15 23:30:00,...,685.0,130.358260,15253.0,41.033097,0.0,0.0,0.666667,73.0,339.5,60.169203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,1,0,1,13077.0,331.619296,23.526,178.26700,244.6720,474.40200,1030.669,213.190981,13080,2022-08-30 11:30:00,...,710.6,131.992968,13080.0,45.685336,0.0,0.0,1.400000,80.8,364.0,65.918644
15,1,1,0,15260.0,41.582167,0.000,0.00000,0.0230,26.43675,474.244,86.664974,15264,2022-07-15 23:30:00,...,710.6,125.435838,15253.0,43.485295,0.0,0.0,0.800000,75.6,364.0,63.893546
15,1,1,1,15260.0,79.601392,0.485,32.20950,55.8570,99.91600,426.108,69.265675,15264,2022-07-15 23:30:00,...,710.6,125.435838,15253.0,43.485295,0.0,0.0,0.800000,75.6,364.0,63.893546
15,1,3,0,15260.0,94.133136,0.000,0.00000,0.7130,53.52350,1421.238,205.638481,15264,2022-07-15 23:30:00,...,710.6,125.435838,15253.0,43.485295,0.0,0.0,0.800000,75.6,364.0,63.893546


In [11]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Feature set and label for the linear-regression baseline
main_columns = ['is_business', 'county', 'product_type', 'eic_count','cloudcover_total','direct_solar_radiation']
target_column = 'target'

# Keep only the rows in which the label and every selected feature are present
combined_df = combined_df.dropna(subset=[target_column, *main_columns])

# Feature matrix and label vector
X = combined_df.loc[:, main_columns]
y = combined_df.loc[:, target_column]

# Report how many NaNs are left (none are expected)
print(f'Number of NaNs in target variable y: {y.isna().sum()}')
print(f'Number of NaNs in feature variables X:\n{X.isna().sum()}')

# Stop right here if anything slipped through the filter above
if y.isna().any() or X.isna().any().any():
    raise ValueError("There are still NaNs in the data after attempting to drop them.")

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Report the size of each part
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of y_test: {y_test.shape}')

# Learn the scaling on the training rows only, then apply it to both parts
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the linear model on the scaled training features
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test_scaled)

# Score the predictions
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Number of NaNs in target variable y: 0
Number of NaNs in feature variables X:
is_business               0
county                    0
product_type              0
eic_count                 0
cloudcover_total          0
direct_solar_radiation    0
dtype: int64
Shape of X_train: (1583795, 6)
Shape of X_test: (395949, 6)
Shape of y_train: (1583795,)
Shape of y_test: (395949,)
Mean Squared Error: 702038.7355248864
R^2 Score: 0.16274737608969336


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Feature set and label for the random-forest model
main_columns = ['is_business', 'county', 'product_type', 'eic_count','cloudcover_total','direct_solar_radiation']
target_column = 'target'

# Keep only the rows in which the label and every selected feature are present
combined_df = combined_df.dropna(subset=[target_column, *main_columns])

# Feature matrix and label vector
X = combined_df.loc[:, main_columns]
y = combined_df.loc[:, target_column]

# Report how many NaNs are left (none are expected)
print(f'Number of NaNs in target variable y: {y.isna().sum()}')
print(f'Number of NaNs in feature variables X:\n{X.isna().sum()}')

# Stop right here if anything slipped through the filter above
if y.isna().any() or X.isna().any().any():
    raise ValueError("There are still NaNs in the data after attempting to drop them.")

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Report the size of each part
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of y_test: {y_test.shape}')

# Learn the scaling on the training rows only, then apply it to both parts
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest: 100 trees, fixed seed, scored on the held-out rows
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)
mse_rf, r2_rf = mean_squared_error(y_test, y_pred_rf), r2_score(y_test, y_pred_rf)


print(f'Mean Squared Error RF: {mse_rf}')
print(f'R^2 Score RF: {r2_rf}')

Number of NaNs in target variable y: 0
Number of NaNs in feature variables X:
is_business               0
county                    0
product_type              0
eic_count                 0
cloudcover_total          0
direct_solar_radiation    0
dtype: int64
Shape of X_train: (1583795, 6)
Shape of X_test: (395949, 6)
Shape of y_train: (1583795,)
Shape of y_test: (395949,)
Mean Squared Error RF: 802762.5352803902
R^2 Score RF: 0.042623996327092795


**Auto regression**

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error, r2_score

# Autoregressive baseline: fit a lag-1 AutoReg on the training targets and score
# its out-of-sample forecast against the held-out targets.

# Define the main determining columns and the target variable
main_columns = ['is_business', 'county', 'product_type', 'eic_count','cloudcover_total','direct_solar_radiation'] 
target_column = 'target'

# Remove rows with missing values in the target variable
combined_df = combined_df.dropna(subset=[target_column])

# Remove rows with missing values in the main determining columns
combined_df = combined_df.dropna(subset=main_columns)

# Split the dataset into features and target variable
X = combined_df[main_columns]
y = combined_df[target_column]

# Debugging step to check for any remaining NaNs
print(f'Number of NaNs in target variable y: {y.isna().sum()}')
print(f'Number of NaNs in feature variables X:\n{X.isna().sum()}')

# Ensure no NaNs remain in the data
if y.isna().sum() > 0 or X.isna().sum().sum() > 0:
    raise ValueError("There are still NaNs in the data after attempting to drop them.")

# Split the dataset into training and testing sets
# NOTE(review): train_test_split is called without shuffle=False, so the rows are
# shuffled before splitting; y_train and y_test are therefore not in time order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Debugging step to check the shapes of training and testing sets
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of y_test: {y_test.shape}')

# Standardize the features
# NOTE(review): the scaled features are not used by the AutoReg model below.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Replace the shuffled row labels with 0..n-1 so AutoReg sees a plain integer index
y_train_ar = y_train.reset_index(drop=True)
# NOTE(review): the "series" given to AutoReg is the shuffled training target with
# rows from every county/product/consumption group interleaved, so lag 1 is not the
# previous hour of the same series. The forecast is then compared element by element
# with the shuffled y_test. The R^2 printed by this cell is ~0; consider a
# chronological split per series before drawing conclusions from this model.
ar_model = AutoReg(y_train_ar, lags=1).fit()
# Forecast len(y_test) steps past the end of the training series
y_pred_ar = ar_model.predict(start=len(y_train_ar), end=len(y_train_ar) + len(y_test) - 1)
mse_ar = mean_squared_error(y_test, y_pred_ar)
r2_ar = r2_score(y_test, y_pred_ar)

print(f'Mean Squared Error: {mse_ar}')
print(f'R^2 Score: {r2_ar}')

Number of NaNs in target variable y: 0
Number of NaNs in feature variables X:
is_business               0
county                    0
product_type              0
eic_count                 0
cloudcover_total          0
direct_solar_radiation    0
dtype: int64
Shape of X_train: (1583795, 6)
Shape of X_test: (395949, 6)
Shape of y_train: (1583795,)
Shape of y_test: (395949,)
Mean Squared Error: 838502.9285783482
R^2 Score: -5.9232931004515876e-08


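**Random Forest without `eic_count`** (SARIMA is not implemented yet; the cell below refits the Random Forest on a reduced feature set)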

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Feature set (no eic_count) and label for the second random-forest model
main_columns = ['is_business', 'county', 'product_type','cloudcover_total','direct_solar_radiation']
target_column = 'target'

# Keep only the rows in which the label and every selected feature are present
combined_df = combined_df.dropna(subset=[target_column, *main_columns])

# Feature matrix and label vector
X = combined_df.loc[:, main_columns]
y = combined_df.loc[:, target_column]

# Report how many NaNs are left (none are expected)
print(f'Number of NaNs in target variable y: {y.isna().sum()}')
print(f'Number of NaNs in feature variables X:\n{X.isna().sum()}')

# Stop right here if anything slipped through the filter above
if y.isna().any() or X.isna().any().any():
    raise ValueError("There are still NaNs in the data after attempting to drop them.")

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Report the size of each part
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of y_test: {y_test.shape}')

# Learn the scaling on the training rows only, then apply it to both parts
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest: 100 trees, fixed seed, scored on the held-out rows
rf_model_dif = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)
y_pred_rf_dif = rf_model_dif.predict(X_test_scaled)
mse_rf_dif, r2_rf_dif = mean_squared_error(y_test, y_pred_rf_dif), r2_score(y_test, y_pred_rf_dif)

print(f'Mean Squared Error RF: {mse_rf_dif}')
print(f'R^2 Score RF: {r2_rf_dif}')

Number of NaNs in target variable y: 0
Number of NaNs in feature variables X:
is_business               0
county                    0
product_type              0
cloudcover_total          0
direct_solar_radiation    0
dtype: int64
Shape of X_train: (1583795, 5)
Shape of X_test: (395949, 5)
Shape of y_train: (1583795,)
Shape of y_test: (395949,)
Mean Squared Error RF: 664868.4905432126
R^2 Score RF: 0.2070766752686446
