In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the annual wind-speed table and discard the 'Block' column.
wind_data = pd.read_csv('./Wind_Speed_Annual/Wind_Speed_Annual.csv').drop(columns=['Block'])

# Pairwise correlations between all remaining columns.
corr_matrix = wind_data.corr()

# Absolute correlation above which a column is treated as redundant.
threshold = 0.90

# A column is flagged when its absolute correlation with ANY column located
# to its left in the matrix exceeds the threshold (upper-triangle scan), so
# the left-most member of each correlated group is the one that survives.
abs_corr = corr_matrix.abs()
columns_to_drop = {
    name
    for position, name in enumerate(corr_matrix.columns)
    if (abs_corr.iloc[:position, position] > threshold).any()
}

# Remove the flagged columns; wind_data itself keeps every column.
reduced_wind_data = wind_data.drop(columns=columns_to_drop)

print(f"Original dataframe had {wind_data.shape[1]} columns. Reduced dataframe has {reduced_wind_data.shape[1]} columns.")
print(reduced_wind_data.head())


# Features and target are taken from the reduced frame.
feature_columns = ['Lon', 'Lat', 'Wind Speed', 'Hour 10 Weibull K', 'Hour 11 Weibull K']
X = reduced_wind_data[feature_columns]
y = reduced_wind_data['Weibull K']

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)




Original dataframe had 77 columns. Reduced dataframe has 6 columns.
       Lon      Lat  Wind Speed  Weibull K  Hour 10 Weibull K  \
0 -83.0406  23.8024        7.32       2.50               2.38   
1 -83.0086  23.8029        7.32       2.50               2.36   
2 -83.4677  23.8398        7.34       2.48               2.40   
3 -83.4203  23.8405        7.33       2.48               2.38   
4 -83.3729  23.8413        7.33       2.48               2.36   

   Hour 11 Weibull K  
0               2.30  
1               2.28  
2               2.32  
3               2.34  
4               2.34  


## Let's get some descriptive statistics

In [10]:

# Descriptive statistics for every column of the correlation-reduced frame.
statistics_df = reduced_wind_data

# Central tendency.
mean_values = statistics_df.mean()
median_values = statistics_df.median()
mode_values = statistics_df.mode().iloc[0]  # first listed mode of each column

# Spread.
std_dev = statistics_df.std()
variance = statistics_df.var()
range_values = statistics_df.max() - statistics_df.min()
quartiles = statistics_df.quantile([0.25, 0.75])
iqr = quartiles.loc[0.75] - quartiles.loc[0.25]

# Print each statistic under its own heading, followed by a blank line.
summary_sections = [
    ("Mean", mean_values),
    ("Median", median_values),
    ("Mode", mode_values),
    ("Standard Deviation", std_dev),
    ("Variance", variance),
    ("Range", range_values),
    ("Interquartile Range", iqr),
]
for heading, values in summary_sections:
    print(f"{heading}: \n{values}\n")


Mean: 
Lon                 -88.817263
Lat                  27.347716
Wind Speed            6.968373
Weibull K             2.151248
Hour 10 Weibull K     2.148891
Hour 11 Weibull K     2.138107
dtype: float64

Median: 
Lon                 -88.3718
Lat                  27.3475
Wind Speed            6.9600
Weibull K             2.1200
Hour 10 Weibull K     2.1400
Hour 11 Weibull K     2.1400
dtype: float64

Mode: 
Lon                 -87.0030
Lat                  26.4757
Wind Speed            6.9200
Weibull K             2.0400
Hour 10 Weibull K     2.1400
Hour 11 Weibull K     2.1400
Name: 0, dtype: float64

Standard Deviation: 
Lon                  4.132522
Lat                  1.366301
Wind Speed           0.402487
Weibull K            0.135068
Hour 10 Weibull K    0.070551
Hour 11 Weibull K    0.084922
dtype: float64

Variance: 
Lon                  17.077734
Lat                   1.866778
Wind Speed            0.161996
Weibull K             0.018243
Hour 10 Weibull K     0.004977
Hou

# Let's do some training!

## Data Preprocessing

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# This cell reuses `wind_data` from the first cell (the CSV with the 'Block'
# column removed); the file is not read again here.

# Exploratory Data Analysis (EDA)
print(wind_data.describe())
print(wind_data.isnull().sum())

# Feature selection: 'Wind Speed' is the regression target and every other
# column of wind_data is used as a feature.
TARGET_COLUMN = 'Wind Speed'

# Rows without a target value cannot be used to train or score a supervised
# model, so they are dropped instead of being given an imputed target.
# wind_data itself is left unmodified so this cell can be re-run safely.
modelling_data = wind_data.dropna(subset=[TARGET_COLUMN])
X = modelling_data.drop(columns=[TARGET_COLUMN])
y = modelling_data[TARGET_COLUMN]

# Splitting the dataset (70% train / 30% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Handling missing feature values (if any): the fill values are the column
# means of the TRAINING split only, so no statistic computed from the test
# rows leaks into training. With no missing values this is a no-op.
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_test = X_test.fillna(train_feature_means)

# Scaling features: the scaler is fitted on the training split and the same
# transformation is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


                Lon           Lat    Wind Speed     Weibull C     Weibull K  \
count  29101.000000  29101.000000  29101.000000  29101.000000  29101.000000   
mean     -88.817263     27.347716      6.968373      7.865366      2.151248   
std        4.132522      1.366301      0.402487      0.453983      0.135068   
min      -97.214200     23.802400      5.830000      6.580000      1.900000   
25%      -92.365300     26.319400      6.810000      7.690000      2.060000   
50%      -88.371800     27.347500      6.960000      7.860000      2.120000   
75%      -85.211100     28.432800      7.180000      8.110000      2.240000   
max      -81.194300     30.267000      8.260000      9.290000      2.680000   

       Hour 00 Wind Speed  Hour 00 Weibull C  Hour 00 Weibull K  \
count        29101.000000       29101.000000       29101.000000   
mean             7.254403           8.184387           2.231133   
std              0.478671           0.533617           0.206458   
min              6.2

## Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: linear regression fitted on the scaled training features,
# then used to predict the held-out test split.
linear_model = LinearRegression().fit(X_train_scaled, y_train)
linear_predictions = linear_model.predict(X_test_scaled)

# Score the test-set predictions (mean squared error and R^2).
linear_r2 = r2_score(y_test, linear_predictions)
linear_mse = mean_squared_error(y_test, linear_predictions)
print('Linear Regression - MSE:', linear_mse, 'R2 Score:', linear_r2)


Linear Regression - MSE: 8.759340110167196e-06 R2 Score: 0.999947351448233


## SVR (Support Vector Regression)

In [15]:
from sklearn.svm import SVR

# Baseline: support vector regression with scikit-learn's default settings,
# fitted on the scaled training features.
svr_model = SVR().fit(X_train_scaled, y_train)
svr_predictions = svr_model.predict(X_test_scaled)

# Score the test-set predictions (mean squared error and R^2).
svr_r2 = r2_score(y_test, svr_predictions)
svr_mse = mean_squared_error(y_test, svr_predictions)
print('SVR - MSE:', svr_mse, 'R2 Score:', svr_r2)


SVR - MSE: 0.0027382268686884067 R2 Score: 0.9835417192125421


## Decision Tree Regression

In [18]:
from sklearn.tree import DecisionTreeRegressor

# Decision Tree Regression
# The tree is seeded so the reported metrics are reproducible: scikit-learn
# permutes the candidate features at every split, so an unseeded tree can
# differ between runs when several splits tie. 42 matches the seed used for
# the train/test split and the randomized searches in this notebook.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train_scaled, y_train)
dt_predictions = dt_model.predict(X_test_scaled)

# Evaluation on the held-out test split (mean squared error and R^2)
dt_mse = mean_squared_error(y_test, dt_predictions)
dt_r2 = r2_score(y_test, dt_predictions)
print('Decision Tree - MSE:', dt_mse, 'R2 Score:', dt_r2)


Decision Tree - MSE: 2.5518268239605944e-05 R2 Score: 0.9998466208813084


## kNN regression

In [21]:
from sklearn.neighbors import KNeighborsRegressor

# Baseline: k-nearest-neighbours regression with scikit-learn's default
# settings, fitted on the scaled training features.
knn_model = KNeighborsRegressor().fit(X_train_scaled, y_train)
knn_predictions = knn_model.predict(X_test_scaled)

# Score the test-set predictions (mean squared error and R^2).
knn_r2 = r2_score(y_test, knn_predictions)
knn_mse = mean_squared_error(y_test, knn_predictions)
print('kNN Regression - MSE:', knn_mse, 'R2 Score:', knn_r2)


kNN Regression - MSE: 3.368686290230227e-05 R2 Score: 0.9997975230413394


## Hyperparameter tuning for kNN (Randomized Search)

In [25]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV

# Candidate kNN settings: neighbour counts 1 through 29, both weighting
# schemes, and every neighbour-search algorithm listed below.
param_distributions = dict(
    n_neighbors=range(1, 30),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

# Sample 100 candidates from that space and rank them by mean R^2 over
# 5-fold cross-validation on the training split, using all available cores.
knn_random_search = RandomizedSearchCV(
    estimator=KNeighborsRegressor(),
    param_distributions=param_distributions,
    n_iter=100,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)
knn_random_search.fit(X_train_scaled, y_train)

# Report the winning combination and its cross-validated score.
print("Best Parameters for kNN Regression:", knn_random_search.best_params_)
print("Best Score for kNN Regression:", knn_random_search.best_score_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters for kNN Regression: {'weights': 'distance', 'n_neighbors': 9, 'algorithm': 'ball_tree'}
Best Score for kNN Regression: 0.9997792610548089


## Hyperparameter tuning for SVR (Randomized Search)

In [28]:
import numpy as np

# Imported here so the cell does not depend on names that only exist because
# earlier cells happened to import them.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR

# Define the parameter space for SVR
param_distributions = {
    'C': np.logspace(-2, 2, 20),
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf', 'linear']
}

# Every entry above is a finite list, so the space holds only
# 20 * 2 * 2 = 80 distinct combinations. Requesting n_iter=100 made
# scikit-learn warn and silently evaluate just those 80, so the budget is
# capped at the size of the space to state what actually runs.
# NOTE(review): scikit-learn documents `gamma` as a coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels, so the two gamma values paired with
# kernel='linear' presumably yield duplicate fits - confirm before trimming.
svr_n_candidates = int(np.prod([len(values) for values in param_distributions.values()]))
svr_n_iter = min(100, svr_n_candidates)

# Randomized Search: 5-fold cross-validation on the training split, ranked by R^2
svr_random_search = RandomizedSearchCV(SVR(), param_distributions, n_iter=svr_n_iter, cv=5, scoring='r2', verbose=2, random_state=42, n_jobs=-1)
svr_random_search.fit(X_train_scaled, y_train)

# Best Parameters and Score
print("Best Parameters for SVR:", svr_random_search.best_params_)
print("Best Score for SVR:", svr_random_search.best_score_)

Fitting 5 folds for each of 80 candidates, totalling 400 fits




Best Parameters for SVR: {'kernel': 'rbf', 'gamma': 'auto', 'C': 0.4832930238571752}
Best Score for SVR: 0.9827971194154774


## Hyperparameter tuning for Decision Tree Regression (Grid Search)

In [30]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Decision Tree Regression
# (5 depths x 3 split minimums x 3 leaf minimums = 45 candidates)
param_grid = {
    'max_depth': [None, 3, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid Search for Decision Tree Regression: 5-fold cross-validation on the
# training split, ranked by R^2. The estimator is seeded so the selected
# parameters and score are reproducible; an unseeded tree can break ties
# between equally good splits differently on each run, and candidates here
# differ only in the fourth decimal of R^2.
dt_grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid, cv=5, scoring='r2', verbose=1)
dt_grid_search.fit(X_train_scaled, y_train)

# Best Parameters and Score
print("Best Parameters for Decision Tree Regression:", dt_grid_search.best_params_)
print("Best Score for Decision Tree Regression:", dt_grid_search.best_score_)


Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best Parameters for Decision Tree Regression: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best Score for Decision Tree Regression: 0.9998815669156119


In [36]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over a small SVR grid (3 x 2 x 2 = 12 candidates),
# ranked by mean R^2 over 5-fold cross-validation on the training split.
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf', 'linear'],
}
svr_grid_search = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    verbose=1,
)
svr_grid_search.fit(X_train_scaled, y_train)

# Keep the winning combination under its own name for later cells.
svr_best_params = svr_grid_search.best_params_

print("Best Parameters for SVR:", svr_best_params)
print("Best Score for SVR:", svr_grid_search.best_score_)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters for SVR: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score for SVR: 0.9827172988305009


### Retraining the models

In [37]:
from sklearn.svm import SVR

# Rebuild the SVR from the tuned hyperparameters held in svr_best_params
# (expected to be the best_params_ of the SVR grid search) and fit it on
# the scaled training split.
tuned_svr_kwargs = {key: svr_best_params[key] for key in ('C', 'gamma', 'kernel')}
svr_model = SVR(**tuned_svr_kwargs)
svr_model.fit(X_train_scaled, y_train)

# Evaluate this model on the test data
