In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split # sklearn - ML
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the wind-generation dataset. Set the WIND_DATASET_PATH environment
# variable to run the notebook on another machine; the fallback is the path the
# notebook was originally written against.
DATA_PATH = os.environ.get(
    "WIND_DATASET_PATH",
    "C:/Users/pandi/Desktop/PROJECTS/greenai project/project-wind-generation/final_dataset.csv",
)

df = pd.read_csv(DATA_PATH)

# One-hot encode the categorical 'Location' column; drop_first=True omits the
# indicator for the first category.
df = pd.get_dummies(df, columns=['Location'], drop_first=True)

# Cast any boolean columns to integers so they are stored as 0/1.
df = df.astype({col: int for col in df.select_dtypes(bool).columns})

# Names of the numeric columns, captured before 'Time' is removed.
# NOTE(review): not referenced by any later cell shown here -- confirm it is still needed.
numerical_columns = df.select_dtypes(include=["number"]).columns

# 'Time' is not used as a model feature; rebind instead of mutating in place.
df = df.drop(columns='Time')

In [3]:
# Inspect the preprocessed frame (Location one-hot encoded, 'Time' dropped).
df

Unnamed: 0,temperature_2m,relativehumidity_2m,dewpoint_2m,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,Power,Location_Location2,Location_Location3,Location_Location4
0,28.5,85,24.5,1.44,1.26,146,162,1.4,0.1635,0,0,0
1,28.4,86,24.7,2.06,3.99,151,158,4.4,0.1424,0,0,0
2,26.8,91,24.5,1.30,2.78,148,150,3.2,0.1214,0,0,0
3,27.4,88,24.3,1.30,2.69,58,105,1.6,0.1003,0,0,0
4,27.3,88,24.1,2.47,4.43,58,84,4.0,0.0793,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
175195,25.5,94,24.0,4.01,6.66,4,8,8.2,0.3417,0,0,1
175196,25.3,92,23.2,4.70,7.52,1,5,7.9,0.3342,0,0,1
175197,24.9,90,22.5,4.61,7.57,4,8,7.9,0.3267,0,0,1
175198,24.9,89,22.1,4.60,7.44,2,6,7.8,0.3192,0,0,1


In [4]:
# Hold-out configuration: 10% of the rows are reserved for testing (a 90/10
# split), and the seed is fixed so the split is identical on every run.
TEST_SIZE = 0.1
RANDOM_STATE = 42

# Separate the predictors from the regression target.
X = df.drop(columns='Power')  # features
y = df['Power']  # target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Standardise every feature column (this includes the one-hot Location columns).
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Baseline model: ordinary linear regression on the scaled features.
model1 = LinearRegression()
model1.fit(X_train, y_train)

In [8]:
# Score the linear-regression baseline (model1) on the held-out test split.
y_pred1 = model1.predict(X_test)

# Compute MAE, MSE and R^2 in one pass over the metric functions.
mae, mse, r2 = (
    metric(y_test, y_pred1)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report each metric on its own line.
for label, value in (
    ('Mean Absolute Error', mae),
    ('Mean Squared Error', mse),
    ('R Squared Score', r2),
):
    print(f'{label}: {value}')

Mean Absolute Error: 0.13770257172514175
Mean Squared Error: 0.03248994443591342
R Squared Score: 0.5122261500969816


In [10]:
from sklearn.neighbors import KNeighborsRegressor

# Second candidate: k-nearest-neighbours regression with the library's default
# settings, trained on the scaled training split.
model2 = KNeighborsRegressor()
model2.fit(X=X_train, y=y_train)


In [14]:
# Evaluate the k-nearest-neighbours regressor (model2) on the held-out test split.
y_pred2 = model2.predict(X_test)

# The metrics carry a `_knn` suffix and the printed labels say "KNN" because
# model2 is a KNeighborsRegressor, not a random forest.
mae_knn = mean_absolute_error(y_test, y_pred2)
mse_knn = mean_squared_error(y_test, y_pred2)
r2_knn = r2_score(y_test, y_pred2)

print(f'KNN MAE: {mae_knn}')
print(f'KNN MSE: {mse_knn}')
print(f'KNN R2 Score: {r2_knn}')

RF MAE: 0.10320378652968036
RF MSE: 0.022078732355000002
RF R2 Score: 0.6685304186032255


In [16]:
from sklearn.ensemble import AdaBoostRegressor

# Third candidate: AdaBoost with 100 boosting rounds. AdaBoostRegressor draws on
# a random number generator while fitting, so random_state is pinned to make the
# fitted model (and the metrics reported for it) reproducible across runs.
model3 = AdaBoostRegressor(n_estimators=100, random_state=42)
model3.fit(X_train, y_train)

In [18]:
# Evaluate the AdaBoost regressor (model3) on the held-out test split.
y_pred_3 = model3.predict(X_test)

# The metrics carry an `_ada` suffix and the printed labels say "AdaBoost"
# because model3 is an AdaBoostRegressor, not a random forest.
mae_ada = mean_absolute_error(y_test, y_pred_3)
mse_ada = mean_squared_error(y_test, y_pred_3)
r2_ada = r2_score(y_test, y_pred_3)

print(f'AdaBoost MAE: {mae_ada}')
print(f'AdaBoost MSE: {mse_ada}')
print(f'AdaBoost R2 Score: {r2_ada}')

RF MAE: 0.15170634524004395
RF MSE: 0.0352896595203501
RF R2 Score: 0.47019382812545785


In [20]:
import lightgbm as lgb

# Fourth candidate: LightGBM gradient-boosted trees with the library's default
# hyperparameters, trained on the scaled training split.
model4 = lgb.LGBMRegressor()
model4.fit(X=X_train, y=y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1872
[LightGBM] [Info] Number of data points in the train set: 157680, number of used features: 11
[LightGBM] [Info] Start training from score 0.303189


In [22]:
# Evaluate the LightGBM regressor (model4) on the held-out test split.
y_pred_4 = model4.predict(X_test)

# The metrics carry an `_lgbm` suffix and the printed labels say "LightGBM"
# because model4 is an LGBMRegressor, not a random forest.
mae_lgbm = mean_absolute_error(y_test, y_pred_4)
mse_lgbm = mean_squared_error(y_test, y_pred_4)
r2_lgbm = r2_score(y_test, y_pred_4)

print(f'LightGBM MAE: {mae_lgbm}')
print(f'LightGBM MSE: {mse_lgbm}')
print(f'LightGBM R2 Score: {r2_lgbm}')

RF MAE: 0.11791753133767617
RF MSE: 0.025456312089531273
RF R2 Score: 0.6178225734815933


In [24]:
# XGBoost: exhaustive hyperparameter search followed by a test-set evaluation.
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; the grid search tries every
# combination (3 * 3 * 3 * 2 * 2 = 108 settings).
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# 3-fold cross-validated search, scored by (negated) mean absolute error and
# run on all available CPU cores.
grid_search = GridSearchCV(
    XGBRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=3,
)
grid_search.fit(X_train, y_train)

# best_score_ is the negated MAE, so flip the sign to report a positive error.
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best MAE: {-grid_search.best_score_}')

# Score the winning configuration on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

mae_tuned, mse_tuned, r2_tuned = (
    metric(y_test, y_pred_tuned)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

for label, value in (
    ('Tuned Model Mean Absolute Error (MAE)', mae_tuned),
    ('Tuned Model Mean Squared Error (MSE)', mse_tuned),
    ('Tuned Model R^2 Score', r2_tuned),
):
    print(f'{label}: {value}')

Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}
Best MAE: 0.11339136975717788
Tuned Model Mean Absolute Error (MAE): 0.11316843974248476
Tuned Model Mean Squared Error (MSE): 0.023851872165811076
Tuned Model R^2 Score: 0.641910144332162
