In [93]:
import pandas as pd
import numpy as np

In [94]:
traindf = pd.read_csv('train.csv')
testdf = pd.read_csv('test.csv')

print(f"Train shape: {traindf.shape}, Test shape: {testdf.shape}")

Train shape: (50400, 14), Test shape: (21600, 13)


In [95]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50400 entries, 0 to 50399
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   uid                            50400 non-null  int64  
 1   day                            44921 non-null  object 
 2   hour                           44787 non-null  float64
 3   minute                         44930 non-null  float64
 4   C_motion                       44883 non-null  float64
 5   feed_water_motion              44803 non-null  float64
 6   faucet_hole                    44834 non-null  float64
 7   vapour_pressure                44921 non-null  float64
 8   vapour_enthalpy                44963 non-null  float64
 9   vapour_pressure_at_division    44923 non-null  float64
 10  vapour_motion                  44923 non-null  float64
 11  feed_water_enth                44904 non-null  float64
 12  vapour_temperature             44883 non-null 

In [96]:
traindf.columns

Index(['uid', 'day', 'hour', 'minute', 'C_motion', 'feed_water_motion',
       'faucet_hole', 'vapour_pressure', 'vapour_enthalpy',
       'vapour_pressure_at_division', 'vapour_motion', 'feed_water_enth',
       'vapour_temperature', 'output_electricity_generation'],
      dtype='object')

In [97]:
print(traindf['day'].value_counts())

day
Saturday    23612
Friday      21309
Name: count, dtype: int64


In [98]:
# mode impute day
traindf['day'] = traindf['day'].fillna(traindf['day'].mode()[0])
testdf['day'] = testdf['day'].fillna(testdf['day'].mode()[0])

# From memory there were 10% nulls in both train and test

In [99]:
# using XgBoost imputer for all numerical columns

from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer

xgb = XGBRegressor()
si = SimpleImputer(strategy='mean')

traindf[['hour', 'minute', 'C_motion', 'feed_water_motion', 'faucet_hole', 'vapour_pressure', 'vapour_enthalpy', 
         'vapour_pressure_at_division', 'vapour_motion', 'feed_water_enth',
         'vapour_temperature']] = si.fit_transform(traindf[['hour', 'minute', 'C_motion', 'feed_water_motion', 'faucet_hole', 'vapour_pressure', 'vapour_enthalpy', 
         'vapour_pressure_at_division', 'vapour_motion', 'feed_water_enth', 'vapour_temperature']])
testdf[['hour', 'minute', 'C_motion', 'feed_water_motion', 'faucet_hole', 'vapour_pressure', 'vapour_enthalpy', 
         'vapour_pressure_at_division', 'vapour_motion', 'feed_water_enth',
         'vapour_temperature']] = si.fit_transform(testdf[['hour', 'minute', 'C_motion', 'feed_water_motion', 'faucet_hole', 'vapour_pressure', 'vapour_enthalpy', 
         'vapour_pressure_at_division', 'vapour_motion', 'feed_water_enth', 'vapour_temperature']])


In [100]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# one hot encoding 
le = LabelEncoder()
traindf['day'] = le.fit_transform(traindf['day'])
testdf['day'] = le.transform(testdf['day'])

# Standard Scaling 
ss = StandardScaler()
traindf[['hour', 'minute', 'C_motion', 'feed_water_motion', 'faucet_hole', 'vapour_pressure', 'vapour_enthalpy', 
         'vapour_pressure_at_division', 'vapour_motion', 'feed_water_enth',
         'vapour_temperature']] = ss.fit_transform(traindf[['hour', 'minute', 'C_motion', 'feed_water_motion', 'faucet_hole', 'vapour_pressure', 'vapour_enthalpy', 
         'vapour_pressure_at_division', 'vapour_motion', 'feed_water_enth',
         'vapour_temperature']])

testdf[['hour', 'minute', 'C_motion', 'feed_water_motion', 'faucet_hole', 'vapour_pressure', 'vapour_enthalpy', 
         'vapour_pressure_at_division', 'vapour_motion', 'feed_water_enth',
         'vapour_temperature']] = ss.transform(testdf[['hour', 'minute', 'C_motion', 'feed_water_motion', 'faucet_hole', 'vapour_pressure', 'vapour_enthalpy', 
         'vapour_pressure_at_division', 'vapour_motion', 'feed_water_enth',
         'vapour_temperature']])

# Polynomial Features on traindf
pf = PolynomialFeatures(degree=2)
traindf = pd.concat([traindf, pd.DataFrame(pf.fit_transform(traindf[['hour', 'minute', 'C_motion', 'feed_water_motion', 'faucet_hole', 'vapour_pressure', 'vapour_enthalpy', 
         'vapour_pressure_at_division', 'vapour_motion', 'feed_water_enth',
         'vapour_temperature']]))], axis=1)

# Polynomial Features on testdf
testdf = pd.concat([testdf, pd.DataFrame(pf.transform(testdf[['hour', 'minute', 'C_motion', 'feed_water_motion', 'faucet_hole', 'vapour_pressure', 'vapour_enthalpy', 
         'vapour_pressure_at_division', 'vapour_motion', 'feed_water_enth',
         'vapour_temperature']]))], axis=1)

In [101]:
traindf.head()

Unnamed: 0,uid,day,hour,minute,C_motion,feed_water_motion,faucet_hole,vapour_pressure,vapour_enthalpy,vapour_pressure_at_division,...,68,69,70,71,72,73,74,75,76,77
0,0,1,-0.858735,0.0,0.731529,0.928694,0.661587,0.613071,-0.754919,0.627823,...,0.394161,0.435992,0.149127,0.200315,0.482261,0.164953,0.221573,0.056421,0.075787,0.101801
1,1,1,0.019945,-0.075448,-1.149833,-1.612888,0.788187,-1.244842,2.670613,0.0,...,0.0,-0.0,-0.0,0.0,1.739762,0.466046,-0.517105,0.124844,-0.138521,0.153698
2,2,0,0.547152,-0.994029,0.112431,0.261686,-0.776872,0.239487,-0.201321,0.237293,...,0.056308,0.0,0.026328,0.080641,0.0,0.0,0.0,0.01231,0.037705,0.115488
3,3,1,0.371417,0.0,-0.946324,-1.575552,0.81348,-1.280101,0.376943,-1.292974,...,1.671783,1.664304,0.467363,0.172289,1.656859,0.465272,0.171519,0.130656,0.048165,0.017756
4,4,1,1.07436,-0.810312,-1.59065,-1.892631,1.408782,-1.676231,0.326588,-1.695164,...,2.873583,2.59805,0.910009,5.257501,2.348936,0.822753,4.753386,0.288182,1.66495,9.619112


In [102]:
# ensembling XgBoost and CatBoost
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Assuming you have your data loaded into a Pandas DataFrame called 'df'
# and your target variable is in a column called 'target'
# and your features are in the remaining columns.

# 1. Separate features and target
X = traindf.drop(['output_electricity_generation', 'uid'], axis=1)
y = traindf['output_electricity_generation']

# 2. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Initialize CatBoost Regressor
catboost_model = CatBoostRegressor(
    iterations=1000,  # Adjust as needed
    learning_rate=0.05, # Adjust as needed
    depth=6,         # Adjust as needed
    loss_function='RMSE', # Or another appropriate loss function
    verbose=0, # Set to True to see training progress
    random_state=42
)

# 4. Initialize XGBoost Regressor
xgboost_model = XGBRegressor(
    n_estimators=1000, # Adjust as needed
    learning_rate=0.05, # Adjust as needed
    max_depth=6,        # Adjust as needed
    random_state=42,
    objective='reg:squarederror' # Explicitly set the objective
)

# 5. Train the models
catboost_model.fit(X_train, y_train)
xgboost_model.fit(X_train, y_train)

# 6. Create the VotingRegressor ensemble
ensemble = VotingRegressor(estimators=[('catboost', catboost_model), ('xgboost', xgboost_model)])

# 7. Train the ensemble
ensemble.fit(X_train, y_train)

# 8. Make predictions
catboost_predictions = catboost_model.predict(X_test)
xgboost_predictions = xgboost_model.predict(X_test)
ensemble_predictions = ensemble.predict(X_test)

# 8.a. Make predictions on the test set
catboost_test = catboost_model.predict(testdf.drop('uid', axis=1))
xgboost_test = xgboost_model.predict(testdf.drop('uid', axis=1))
ensemble_test = ensemble.predict(testdf.drop('uid', axis=1))

# 9. Calculate RMSE for each model and the ensemble
catboost_rmse = np.sqrt(mean_squared_error(y_test, catboost_predictions))
xgboost_rmse = np.sqrt(mean_squared_error(y_test, xgboost_predictions))
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_predictions))

# 10. Print RMSE scores
print(f"CatBoost RMSE: {catboost_rmse}")
print(f"XGBoost RMSE: {xgboost_rmse}")
print(f"Ensemble RMSE: {ensemble_rmse}")

# Optional: Get predictions from the individual models within the ensemble.
# individual_predictions = ensemble.transform(X_test) #Get predictions from each model
# print(individual_predictions)

CatBoost RMSE: 4.083713858735628
XGBoost RMSE: 4.691265465755194
Ensemble RMSE: 3.7954958679777433


In [None]:
# ensembling XgBoost and LightGBM
from lightgbm import LGBMRegressor

# 1. Separate features and target
X = traindf.drop(['output_electricity_generation', 'uid'], axis=1)
y = traindf['output_electricity_generation']

# 2. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Initialize LightGBM Regressor
lgbm_model = LGBMRegressor(
    n_estimators=1000,  # Adjust as needed
    learning_rate=0.05, # Adjust as needed
    max_depth=6,        # Adjust as needed
    random_state=42,
    objective='regression', # Explicitly set the objective
    verbose=-1 # Suppress verbose output
)

# 4. Initialize XGBoost Regressor
xgboost_model = XGBRegressor(
    n_estimators=1000, # Adjust as needed
    learning_rate=0.05, # Adjust as needed
    max_depth=6,        # Adjust as needed
    random_state=42,
    objective='reg:squarederror' # Explicitly set the objective
)

# 5. Train the models
lgbm_model.fit(X_train, y_train)
xgboost_model.fit(X_train, y_train)

# 6. Create the VotingRegressor ensemble
ensemble = VotingRegressor(estimators=[('lgbm', lgbm_model), ('xgboost', xgboost_model)])

# 7. Train the ensemble
ensemble.fit(X_train, y_train)

# 8. Make predictions
lgbm_predictions = lgbm_model.predict(X_test)
xgboost_predictions = xgboost_model.predict(X_test)
ensemble_predictions = ensemble.predict(X_test)

# 8.a. Make predictions on the test set
lgbm_test = lgbm_model.predict(testdf.drop('uid', axis=1))
xgboost_test = xgboost_model.predict(testdf.drop('uid', axis=1))
ensemble_test = ensemble.predict(testdf.drop('uid', axis=1))

# 9. Calculate RMSE for each model and the ensemble
lgbm_rmse = np.sqrt(mean_squared_error(y_test, lgbm_predictions))
xgboost_rmse = np.sqrt(mean_squared_error(y_test, xgboost_predictions))
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_predictions))

# 10. Print RMSE scores
print(f"LightGBM RMSE: {lgbm_rmse}")
print(f"XGBoost RMSE: {xgboost_rmse}")
print(f"Ensemble RMSE: {ensemble_rmse}")

LightGBM RMSE: 3.9434333918901276
XGBoost RMSE: 4.691265465755194
Ensemble RMSE: 3.819214363461345


In [104]:
# 1. Separate features and target
X = traindf.drop(['output_electricity_generation', 'uid'], axis=1)
y = traindf['output_electricity_generation']

# 2. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Initialize LightGBM Regressor
lgbm_model = LGBMRegressor(
    n_estimators=1000,  # Adjust as needed
    learning_rate=0.05, # Adjust as needed
    max_depth=6,        # Adjust as needed
    random_state=42,
    objective='regression', # Explicitly set the objective
    verbose=-1 # Suppress verbose output
)

# 4. Initialize CatBoost Regressor
catboost_model = CatBoostRegressor(
    iterations=1000,  # Adjust as needed
    learning_rate=0.05, # Adjust as needed
    depth=6,         # Adjust as needed
    loss_function='RMSE', # Or another appropriate loss function
    verbose=0, # Set to True to see training progress
    random_state=42
)

# 5. Train the models
lgbm_model.fit(X_train, y_train)
catboost_model.fit(X_train, y_train)

# 6. Create the VotingRegressor ensemble
ensemble = VotingRegressor(estimators=[('lgbm', lgbm_model), ('catboost', catboost_model)])

# 7. Train the ensemble
ensemble.fit(X_train, y_train)

# 8. Make predictions
lgbm_predictions = lgbm_model.predict(X_test)
catboost_predictions = catboost_model.predict(X_test)
ensemble_predictions = ensemble.predict(X_test)

# 8.a. Make predictions on the test set
lgbm_test = lgbm_model.predict(testdf.drop('uid', axis=1))
catboost_test = catboost_model.predict(testdf.drop('uid', axis=1))
ensemble_test = ensemble.predict(testdf.drop('uid', axis=1))

# 9. Calculate RMSE for each model and the ensemble
lgbm_rmse = np.sqrt(mean_squared_error(y_test, lgbm_predictions))
catboost_rmse = np.sqrt(mean_squared_error(y_test, catboost_predictions))
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_predictions))

# 10. Print RMSE scores
print(f"LightGBM RMSE: {lgbm_rmse}")
print(f"CatBoost RMSE: {catboost_rmse}")
print(f"Ensemble RMSE: {ensemble_rmse}")

LightGBM RMSE: 3.9434333918901276
CatBoost RMSE: 4.083713858735628
Ensemble RMSE: 3.69074347099389
