# Playground

## Import necessary packages

In [68]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import tensorflow as tf
from keras.layers import Dense, LeakyReLU, BatchNormalization
from keras.models import Sequential
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Lasso

## Data Ingestion

In [None]:
# Location of the raw dataset, relative to this notebook
raw_csv_path = "../data/raw/raw.csv"

# Read the CSV into a DataFrame
data = pd.read_csv(raw_csv_path)

# Preview the top rows to get a feel for the available columns
data.head()

## Exploratory Data Analysis (EDA)

In [None]:
# Set the style of the visualization
sns.set(style="whitegrid")

# Columns whose value distribution is plotted.
# The story-point column is spelled "Story_Point" (singular) in every other
# cell of this notebook; "Story_Points" is not a column and raised KeyError.
cols_to_visualize = ["Type", "Priority", "Story_Point"]

# One axis per column, stacked vertically
fig, axs = plt.subplots(nrows=len(cols_to_visualize), figsize=(12, 15))

# Horizontal count plot per column, bars ordered by value_counts()
# (most frequent value first)
for i, col in enumerate(cols_to_visualize):
    sns.countplot(y=data[col], ax=axs[i], order=data[col].value_counts().index)
    axs[i].set_title(f'Distribution of {col}')
    axs[i].set_xlabel('Count')
    axs[i].set_ylabel(col)

# Adjust layout so titles and labels do not overlap
plt.tight_layout()
plt.show()

In [None]:
# Inspect the column labels of the loaded DataFrame
data.columns

In [None]:
# Point the third entry at the story-point column
cols_to_visualize[2] = "Story_Point"

# One subplot row per column
fig, axs = plt.subplots(nrows=len(cols_to_visualize), figsize=(12, 15))

# Draw a horizontal count plot on each axis, bars ordered by frequency
for ax, col in zip(axs, cols_to_visualize):
    frequency_order = data[col].value_counts().index
    sns.countplot(y=data[col], ax=ax, order=frequency_order)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel('Count')
    ax.set_ylabel(col)

# Tidy up spacing and render
plt.tight_layout()
plt.show()


## Data Preparation

### Data Cleaning

In [None]:
# Number of null entries per column (computed once, reused below)
null_counts = data.isnull().sum()

# Absolute count of missing values, largest first
missing_data = null_counts.sort_values(ascending=False)

# Share of rows that are missing, expressed in percent, largest first
missing_percentage = (null_counts / data.shape[0]).sort_values(ascending=False) * 100

# Side-by-side summary table
missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage (%)': missing_percentage,
})

# Keep only the columns that actually have gaps
has_missing = missing_df['Missing Count'] > 0
missing_df[has_missing]

In [None]:
# Binary flag: 1 when the issue has a Description_Code value, 0 when it is null
data['Has_Description_Code'] = data['Description_Code'].notna().astype(int)

# Remove the columns with 100% missing values together with the raw
# Description_Code column, which the flag above replaces
data.drop(columns=['Timespent', 'Pull_Request_URL', 'Description_Code'], inplace=True)

# Preview the modified dataset
data.head()

In [None]:
# Impute missing values.
# Each column is re-assigned instead of using
# ``data[col].fillna(..., inplace=True)``: that chained in-place call works on
# a temporary Series, triggers a FutureWarning on recent pandas and does not
# modify ``data`` at all once Copy-on-Write is enabled.

# Story_Point: impute with the median
data['Story_Point'] = data['Story_Point'].fillna(data['Story_Point'].median())

# Priority: impute with the mode (most frequent value)
data['Priority'] = data['Priority'].fillna(data['Priority'].mode()[0])

# Estimation_Date: parse to datetime first so the column keeps a single
# dtype, then impute with the median date
estimation_dates = pd.to_datetime(data['Estimation_Date'])
data['Estimation_Date'] = estimation_dates.fillna(estimation_dates.median())

# Check for remaining missing values
remaining_missing = data.isnull().sum().sort_values(ascending=False)
remaining_missing_df = pd.DataFrame({'Missing Count': remaining_missing})
remaining_missing_df[remaining_missing_df['Missing Count'] > 0]

In [None]:
# Impute missing values for Description and Description_Text with "No Description".
# The result is assigned back to the column: a chained
# ``data[col].fillna(..., inplace=True)`` acts on a temporary Series and is
# deprecated / ineffective on recent pandas (Copy-on-Write).
for text_col in ('Description', 'Description_Text'):
    data[text_col] = data[text_col].fillna("No Description")

# Check for remaining missing values
remaining_missing = data.isnull().sum().sort_values(ascending=False)
remaining_missing_df = pd.DataFrame({'Missing Count': remaining_missing})
remaining_missing_df[remaining_missing_df['Missing Count'] > 0]


In [None]:
# Parse both timestamp columns into pandas datetimes
for date_col in ('Creation_Date', 'Estimation_Date'):
    data[date_col] = pd.to_datetime(data[date_col])

# Whole days elapsed between issue creation and estimation
estimation_lag = data['Estimation_Date'] - data['Creation_Date']
data['Time_To_Estimate'] = estimation_lag.dt.days

# Preview the two dates next to the derived feature
data[['Creation_Date', 'Estimation_Date', 'Time_To_Estimate']].head()


In [None]:
# Categorical columns to expand into indicator (dummy) columns
dummy_source_cols = ['Type', 'Priority']

# drop_first=True omits the first level of each column
data_encoded = pd.get_dummies(data, columns=dummy_source_cols, drop_first=True)

# Preview the encoded dataset
data_encoded.head()


### Feature Selection

In [None]:
# Columns excluded from modelling: identifiers, free text, raw dates and
# repository / project names
columns_to_drop = [
    'ID', 'Jira_ID', 'Issue_Key', 'URL',
    'Title', 'Description', 'Description_Text',
    'Creation_Date', 'Estimation_Date', 'Resolution_Date', 'Last_Updated',
    'Repository_Name', 'Poject_Name',
]
data_selected = data_encoded.drop(columns_to_drop, axis=1)

# Preview what is left
data_selected.head()


In [None]:
# Generate a correlation matrix over the numeric and boolean (dummy) columns.
# Non-numeric columns are filtered out explicitly: since pandas 2.0
# DataFrame.corr() no longer drops them silently and raises ValueError.
# NOTE(review): 'Status' and 'Resolution' are presumably text columns still
# present in data_selected at this point — confirm.
corr_input = data_selected.select_dtypes(include=['number', 'bool'])
correlation_matrix = corr_input.corr()

# Get correlations with the target variable 'Story_Point', strongest positive first
story_point_corr = correlation_matrix['Story_Point'].sort_values(ascending=False)

# Display correlations
story_point_corr


In [None]:
# seaborn's heatmap needs 2-D input, so turn the correlation Series into a
# single-column DataFrame
story_point_corr_data = story_point_corr.to_frame()

# Draw the annotated heatmap
plt.figure(figsize=(30, 10))
heatmap_options = dict(annot=True, cmap='coolwarm', cbar=True, square=True, linewidths=0.5)
sns.heatmap(story_point_corr_data, **heatmap_options)
plt.title('Correlation with Story_Point')
plt.show()

In [None]:
# Target (y): the story-point value
y = data_selected['Story_Point']

# Features (X): every column except the target, 'Status' and 'Resolution'
non_feature_cols = ['Story_Point', 'Status', 'Resolution']
X = data_selected.drop(non_feature_cols, axis=1)

# 80% training / 20% testing, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape


## Baseline Model

In [None]:
# Train the linear regression model.
# NOTE(review): later cells one-hot encode text columns and impute NaNs in
# 'Assignee_ID' / 'Sprint_ID'; if X_train still holds either at this point,
# fit() will raise — confirm the intended cell order.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict on the test set
y_pred = lr_model.predict(X_test)

# Evaluate the model.
# RMSE is computed as sqrt(MSE): the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, rmse, mae, r2

In [None]:
# Names of the training features stored with the generic object dtype
object_dtype_features = X_train.select_dtypes(include='object')
non_numeric_columns = object_dtype_features.columns
non_numeric_columns


In [None]:
# One-hot encode the 'Repository_Name' and 'Poject_Name' columns.
# Only columns that are still present are encoded: the feature-selection cell
# lists both names in `columns_to_drop`, and pd.get_dummies raises KeyError
# when asked to encode a column that does not exist.
categorical_features = [
    name for name in ['Repository_Name', 'Poject_Name'] if name in X_train.columns
]

if categorical_features:
    X_train_encoded = pd.get_dummies(X_train, columns=categorical_features, drop_first=True)
    # The test set keeps every level (no drop_first) and is then projected
    # onto the training columns below. Encoding it with drop_first=True on its
    # own could drop a *different* reference level than the training set did,
    # silently mis-encoding those rows.
    X_test_encoded = pd.get_dummies(X_test, columns=categorical_features)
else:
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()

# Ensure consistent columns between train and test sets after encoding:
# levels unseen in the test set become all-zero columns, levels unseen in
# training are discarded, and the column order follows the training set.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Train the linear regression model again
lr_model.fit(X_train_encoded, y_train)

# Predict on the test set
y_pred = lr_model.predict(X_test_encoded)

# Evaluate the model (RMSE = sqrt(MSE); `squared=False` was removed from
# mean_squared_error in scikit-learn 1.6)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, rmse, mae, r2


In [None]:
# Columns holding at least one NaN
has_nan = X_train_encoded.isnull().any()

# Columns in which not every entry is finite
has_non_finite = ~np.isfinite(X_train_encoded).all()

# Training columns flagged by either check
nan_inf_columns = X_train_encoded.columns[has_nan | has_non_finite]
nan_inf_columns


In [None]:
# Impute NaN values with a placeholder value (-1) for 'Assignee_ID' and 'Sprint_ID'
id_columns = ['Assignee_ID', 'Sprint_ID']
X_train_encoded[id_columns] = X_train_encoded[id_columns].fillna(-1)
X_test_encoded[id_columns] = X_test_encoded[id_columns].fillna(-1)

# Train the linear regression model again
lr_model.fit(X_train_encoded, y_train)

# Predict on the test set
y_pred = lr_model.predict(X_test_encoded)

# Evaluate the model.
# RMSE is sqrt(MSE): mean_squared_error's `squared=False` flag was deprecated
# in scikit-learn 1.4 and removed in 1.6.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, rmse, mae, r2


In [None]:
# Plot actual vs. predicted values
plt.figure(figsize=(10, 6))
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so scatterplot(y_test, y_pred) raises TypeError there.
sns.scatterplot(x=y_test, y=y_pred)
# Dashed identity line: a point on it is a perfect prediction
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--', color='red')
plt.xlabel('Actual Story Points')
plt.ylabel('Predicted Story Points')
plt.title('Actual vs. Predicted Story Points')
plt.show()


## Model Development

### Random Forest Regressor

In [None]:
# Initialize and train the Random Forest model (100 trees, fixed seed)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_encoded, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test_encoded)

# Evaluate the Random Forest model.
# RMSE is sqrt(MSE): mean_squared_error's `squared=False` flag was deprecated
# in scikit-learn 1.4 and removed in 1.6.
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

mse_rf, rmse_rf, mae_rf, r2_rf


In [None]:
# Plot actual vs. predicted values for Random Forest Regressor
plt.figure(figsize=(10, 6))
# Keyword x / y: seaborn >= 0.12 rejects positional x, y with a TypeError
sns.scatterplot(x=y_test, y=y_pred_rf)
# Dashed identity line: a point on it is a perfect prediction
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--', color='red')
plt.xlabel('Actual Story Points')
plt.ylabel('Predicted Story Points')
plt.title('Actual vs. Predicted Story Points (Random Forest)')
plt.show()


### Gradient Boosting Regressor

In [None]:
# Initialize and train the Gradient Boosting model (100 stages, fixed seed)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train_encoded, y_train)

# Predict on the test set
y_pred_gb = gb_model.predict(X_test_encoded)

# Evaluate the Gradient Boosting model.
# RMSE is sqrt(MSE): mean_squared_error's `squared=False` flag was deprecated
# in scikit-learn 1.4 and removed in 1.6.
mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

mse_gb, rmse_gb, mae_gb, r2_gb


In [None]:
# Plot actual vs. predicted values for Gradient Boosting Regressor
plt.figure(figsize=(10, 6))
# Keyword x / y: seaborn >= 0.12 rejects positional x, y with a TypeError
sns.scatterplot(x=y_test, y=y_pred_gb)
# Dashed identity line: a point on it is a perfect prediction
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--', color='red')
plt.xlabel('Actual Story Points')
plt.ylabel('Predicted Story Points')
plt.title('Actual vs. Predicted Story Points (Gradient Boosting)')
plt.show()


## Generating Synthetic Data using GANs

In [None]:
# Small feature subset (three effort/time features plus the target) used for the GAN
selected_features = ['In_Progress_Minutes', 'Total_Effort_Minutes', 'Time_To_Estimate', 'Story_Point']
gan_data = data_selected.loc[:, selected_features]

# Min-max scale every selected column
scaler = MinMaxScaler()
gan_data_normalized = scaler.fit(gan_data).transform(gan_data)

# Wrap the scaled array in a DataFrame again so the column names are kept
gan_data_normalized_df = pd.DataFrame(data=gan_data_normalized, columns=selected_features)

gan_data_normalized_df.head()


In [None]:
# GAN Parameters
input_dim = len(selected_features)   # size of the generator's noise input
generator_output_dim = input_dim     # one generated value per selected feature
discriminator_output_dim = 1         # single real/fake probability
hidden_dim = 128                     # width of every hidden Dense layer

# Build the generator.
# The output layer uses a sigmoid so generated values lie in (0, 1), matching
# the MinMaxScaler-normalized training data. The previous tanh output spanned
# (-1, 1), i.e. it could emit values no real (scaled) sample can take.
generator = Sequential([
    Dense(hidden_dim, input_dim=input_dim),
    LeakyReLU(alpha=0.2),
    BatchNormalization(momentum=0.8),
    Dense(hidden_dim),
    LeakyReLU(alpha=0.2),
    BatchNormalization(momentum=0.8),
    Dense(generator_output_dim, activation='sigmoid')
])

# Build the discriminator: sample in, probability of being real out
discriminator = Sequential([
    Dense(hidden_dim, input_dim=generator_output_dim),
    LeakyReLU(alpha=0.2),
    Dense(hidden_dim),
    LeakyReLU(alpha=0.2),
    Dense(discriminator_output_dim, activation='sigmoid')
])

discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# The generator takes noise as input and generates samples
z = tf.keras.layers.Input(shape=(input_dim,))
sample = generator(z)

# For the combined model, only train the generator
discriminator.trainable = False

# The discriminator takes generated samples as input and determines validity
validity = discriminator(sample)

# The combined model (stacked generator and discriminator) takes
# noise as input => generates samples => determines validity
combined = tf.keras.models.Model(z, validity)
combined.compile(loss='binary_crossentropy', optimizer='adam')

combined.summary()


In [None]:
# Define Generator
def build_generator():
    """Return an uncompiled generator network.

    The network is Dense(128, relu) followed by a sigmoid Dense layer. Its
    input width is the module-level ``noise_dim`` and its output width the
    module-level ``data_dim``; both must be defined before calling.
    """
    return Sequential([
        Dense(128, activation='relu', input_dim=noise_dim),
        Dense(data_dim, activation='sigmoid'),
    ])

# Define Discriminator
def build_discriminator():
    """Return an uncompiled discriminator network.

    The network is Dense(128, relu) followed by a single sigmoid unit. Its
    input width is the module-level ``data_dim``, which must be defined
    before calling.
    """
    return Sequential([
        Dense(128, activation='relu', input_dim=data_dim),
        Dense(1, activation='sigmoid'),
    ])

# Settings this cell depends on. None of them was assigned anywhere in the
# notebook, so the cell failed with NameError.
# Training data: the MinMax-normalized feature matrix without rows that still
# contain NaN (a single NaN in a batch turns the losses into NaN).
real_data = gan_data_normalized[~np.isnan(gan_data_normalized).any(axis=1)]
data_dim = real_data.shape[1]   # number of features per sample
noise_dim = data_dim            # noise width = feature count, as in the GAN built in the previous cell
epochs = 1000                   # number of training iterations (tune as needed)
batch_size = 32                 # samples per update step (tune as needed)

# Compile models.
# The discriminator and the combined model each get their own Adam instance:
# a Keras optimizer is tied to the variables of the first model it updates,
# so sharing one instance between two models fails on recent TensorFlow/Keras.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5)
combined_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5)
discriminator = build_discriminator()
discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
generator = build_generator()

# Combined model (for training the generator): noise -> generator ->
# discriminator with its trainable flag switched off
z = tf.keras.layers.Input(shape=(noise_dim,))
img = generator(z)
discriminator.trainable = False
validity = discriminator(img)
combined = tf.keras.models.Model(z, validity)
combined.compile(loss='binary_crossentropy', optimizer=combined_optimizer)

# Labels are identical for every batch, so build them once
real_labels = np.ones((batch_size, 1))
fake_labels = np.zeros((batch_size, 1))
valid_labels = np.ones((batch_size, 1))

# Training Loop
for epoch in range(epochs):
    # Train Discriminator on one real batch and one generated batch
    idx = np.random.randint(0, real_data.shape[0], batch_size)
    real_samples = real_data[idx]
    noise = np.random.normal(0, 1, (batch_size, noise_dim))
    # verbose=0 suppresses the per-call progress bar
    generated_samples = generator.predict(noise, verbose=0)
    d_loss_real = discriminator.train_on_batch(real_samples, real_labels)
    d_loss_fake = discriminator.train_on_batch(generated_samples, fake_labels)
    # Element-wise mean of the two results
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # Train Generator: ask the combined model to label fresh fakes as real
    noise = np.random.normal(0, 1, (batch_size, noise_dim))
    g_loss = combined.train_on_batch(noise, valid_labels)


### Support Vector Regression (SVR)

In [None]:
# SVR requires feature scaling for better performance
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train_encoded)
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
# The test features are transformed with the statistics fitted on the training set
X_test_scaled = scaler_X.transform(X_test_encoded)

# Train the SVR model (ravel: SVR expects a 1-D target)
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train_scaled, y_train_scaled.ravel())

# Predict on the test set; predictions are on the standardized target scale
y_pred_svr = svr_model.predict(X_test_scaled)

# Inverse transform to the original scale. StandardScaler.inverse_transform
# needs a 2-D array, so the 1-D prediction vector is reshaped first (passing
# it directly raises ValueError). `y_pred_svr` itself is deliberately left on
# the scaled axis because the following cell reshapes and inverse-transforms
# it again.
y_pred_svr_original = scaler_y.inverse_transform(y_pred_svr.reshape(-1, 1))

# Evaluate the SVR model (RMSE = sqrt(MSE); `squared=False` was removed from
# mean_squared_error in scikit-learn 1.6)
mse_svr = mean_squared_error(y_test, y_pred_svr_original)
rmse_svr = np.sqrt(mse_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr_original)
r2_svr = r2_score(y_test, y_pred_svr_original)

mse_svr, rmse_svr, mae_svr, r2_svr


In [None]:
# Reshape the predictions to a single column (inverse_transform needs 2-D
# input) and map them back to the original story-point scale
y_pred_svr_reshaped = y_pred_svr.reshape(-1, 1)
y_pred_svr_original = scaler_y.inverse_transform(y_pred_svr_reshaped)

# Evaluate the SVR model again.
# RMSE is sqrt(MSE): mean_squared_error's `squared=False` flag was deprecated
# in scikit-learn 1.4 and removed in 1.6.
mse_svr = mean_squared_error(y_test, y_pred_svr_original)
rmse_svr = np.sqrt(mse_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr_original)
r2_svr = r2_score(y_test, y_pred_svr_original)

mse_svr, rmse_svr, mae_svr, r2_svr

#### Comparison with Previous SVR Model

In [None]:
# Plot actual vs. predicted values for SVR
plt.figure(figsize=(10, 6))
# Keyword x / y: seaborn >= 0.12 rejects positional x, y with a TypeError.
# ravel() flattens the (n, 1) inverse-transformed predictions to 1-D.
sns.scatterplot(x=y_test, y=y_pred_svr_original.ravel())
# Dashed identity line: a point on it is a perfect prediction
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--', color='red')
plt.xlabel('Actual Story Points')
plt.ylabel('Predicted Story Points')
plt.title('Actual vs. Predicted Story Points (Support Vector Regression)')
plt.show()


### Ridge Regression

In [None]:
# Initialize and train the Ridge Regression model (L2 penalty, alpha=1.0)
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_encoded, y_train)

# Predict on the test set
y_pred_ridge = ridge_model.predict(X_test_encoded)

# Evaluate the Ridge model.
# RMSE is sqrt(MSE): mean_squared_error's `squared=False` flag was deprecated
# in scikit-learn 1.4 and removed in 1.6.
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

mse_ridge, rmse_ridge, mae_ridge, r2_ridge


### Lasso Regression

In [None]:
# Initialize and train the Lasso Regression model (L1 penalty, alpha=0.1)
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train_encoded, y_train)

# Predict on the test set
y_pred_lasso = lasso_model.predict(X_test_encoded)

# Evaluate the Lasso model.
# RMSE is sqrt(MSE): mean_squared_error's `squared=False` flag was deprecated
# in scikit-learn 1.4 and removed in 1.6.
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

mse_lasso, rmse_lasso, mae_lasso, r2_lasso


#### Comparison with Previous Lasso Model

In [None]:
# Plot actual vs. predicted values for Lasso Regression
plt.figure(figsize=(10, 6))
# Keyword x / y: seaborn >= 0.12 rejects positional x, y with a TypeError
sns.scatterplot(x=y_test, y=y_pred_lasso)
# Dashed identity line: a point on it is a perfect prediction
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--', color='red')
plt.xlabel('Actual Story Points')
plt.ylabel('Predicted Story Points')
plt.title('Actual vs. Predicted Story Points (Lasso Regression)')
plt.show()


### Neural Network

In [None]:
# Initialize and train a simple neural network: two hidden layers (128 and
# 64 units), at most 1000 iterations, fixed seed
nn_model = MLPRegressor(hidden_layer_sizes=(128, 64), max_iter=1000, random_state=42)
nn_model.fit(X_train_encoded, y_train)

# Predict on the test set
y_pred_nn = nn_model.predict(X_test_encoded)

# Evaluate the neural network model.
# RMSE is sqrt(MSE): mean_squared_error's `squared=False` flag was deprecated
# in scikit-learn 1.4 and removed in 1.6.
mse_nn = mean_squared_error(y_test, y_pred_nn)
rmse_nn = np.sqrt(mse_nn)
mae_nn = mean_absolute_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

mse_nn, rmse_nn, mae_nn, r2_nn


In [None]:
# Plot actual vs. predicted values for Neural Network
plt.figure(figsize=(10, 6))
# Keyword x / y: seaborn >= 0.12 rejects positional x, y with a TypeError
sns.scatterplot(x=y_test, y=y_pred_nn)
# Dashed identity line: a point on it is a perfect prediction
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--', color='red')
plt.xlabel('Actual Story Points')
plt.ylabel('Predicted Story Points')
plt.title('Actual vs. Predicted Story Points (Neural Network)')
plt.show()


## Fuzzy Grey Relational Analysis (GRA)

In [None]:
def grey_relational_coefficient(data, ref_seq, rho=0.5):
    """
    Compute the Grey Relational Coefficient for data against a reference sequence.

    For each position the coefficient is
    ``(min_diff + rho * max_diff) / (|data - ref_seq| + rho * max_diff)``,
    where ``min_diff`` / ``max_diff`` are the smallest / largest absolute
    difference between the two sequences.

    Args:
        data: Sequence to compare (array-like supporting ``data - ref_seq``).
        ref_seq: Reference sequence, broadcastable against ``data``.
        rho: Distinguishing coefficient weighting ``max_diff``.

    Returns:
        Coefficients with the same shape as ``data - ref_seq``. Positions where
        either input is NaN yield NaN; they are ignored when determining
        ``min_diff`` and ``max_diff``. Empty input yields an empty result.
    """
    # Compute the absolute differences once instead of three times
    abs_diff = np.abs(data - ref_seq)
    if abs_diff.size == 0:
        return abs_diff

    # NaN-aware extremes: one missing value must not turn every coefficient
    # into NaN
    min_diff = np.nanmin(abs_diff)
    max_diff = np.nanmax(abs_diff)

    if max_diff == 0:
        # The sequences are identical wherever both are defined. The formula
        # would evaluate 0 / 0 here, so return the maximal coefficient 1.0
        # directly (NaN * 0 keeps NaN positions as NaN).
        return abs_diff * 0 + 1.0

    return (min_diff + rho * max_diff) / (abs_diff + rho * max_diff)

# Normalize the data (min-max). Only numeric columns are used: the min/max
# arithmetic below is undefined for text columns and raises on them.
numeric_data = data_selected.select_dtypes(include=[np.number])
value_range = numeric_data.max() - numeric_data.min()
normalized_data = (numeric_data - numeric_data.min()) / value_range
ref_seq = normalized_data['Story_Point'].values

# Calculate the Grey Relational Grade (mean coefficient) for each feature
grc_values = {}
for column in normalized_data.columns:
    if column == 'Story_Point':
        continue
    # Compare only rows where both the feature and the target are present.
    # Constant columns normalize to NaN (0 / 0) and therefore drop out here.
    pair = normalized_data[[column, 'Story_Point']].dropna()
    if pair.empty:
        continue
    grade = np.mean(
        grey_relational_coefficient(pair[column].values, pair['Story_Point'].values)
    )
    # A NaN grade cannot be ordered and would make the ranking below arbitrary
    if not np.isnan(grade):
        grc_values[column] = grade

# Sort features by Grey Relational Grade, highest first
sorted_grc_values = dict(sorted(grc_values.items(), key=lambda item: item[1], reverse=True))
sorted_grc_values


In [None]:
# Filter out non-numeric columns for normalization
numeric_data = data_selected.select_dtypes(include=[np.number])

# Normalize the numeric data (min-max)
value_range = numeric_data.max() - numeric_data.min()
normalized_data = (numeric_data - numeric_data.min()) / value_range
ref_seq = normalized_data['Story_Point'].values

# Calculate the Grey Relational Grade (mean coefficient) for each feature
grc_values = {}
for column in normalized_data.columns:
    if column == 'Story_Point':
        continue
    # Compare only rows where both the feature and the target are present:
    # a single NaN would otherwise turn the whole grade into NaN. Constant
    # columns normalize to NaN (0 / 0) and drop out here as well.
    pair = normalized_data[[column, 'Story_Point']].dropna()
    if pair.empty:
        continue
    grade = np.mean(
        grey_relational_coefficient(pair[column].values, pair['Story_Point'].values)
    )
    # A NaN grade cannot be ordered; keeping it would make the ranking (and
    # the top-feature selection built on it) arbitrary
    if not np.isnan(grade):
        grc_values[column] = grade

# Sort features by Grey Relational Grade, highest first
sorted_grc_values = dict(sorted(grc_values.items(), key=lambda item: item[1], reverse=True))
sorted_grc_values


In [None]:
# Select the top 10 features based on GRG values
top_features = list(sorted_grc_values.keys())[:10]

# Extract these features (plus the target) from the original dataset
selected_data_top = data_selected[top_features + ['Story_Point']]

# Separate features and target
X_top = selected_data_top.drop(columns=['Story_Point'])
y_top = selected_data_top['Story_Point']

# Use the same split ratio and random state as the earlier models
# (test_size=0.2, random_state=42) so the metrics are comparable; the previous
# value of 0.3 contradicted this and evaluated on a different test set.
X_train_top, X_test_top, y_train_top, y_test_top = train_test_split(
    X_top, y_top, test_size=0.2, random_state=42
)

# Encoding the categorical features. The test set keeps every level and is
# projected onto the training columns afterwards, so both sets share the same
# reference level (encoding each with drop_first=True could drop different ones).
X_train_encoded_top = pd.get_dummies(X_train_top, drop_first=True)
X_test_encoded_top = pd.get_dummies(X_test_top)

# Aligning the test data to the training columns; levels unseen in the test
# set become all-zero columns
X_test_encoded_top = X_test_encoded_top.reindex(
    columns=X_train_encoded_top.columns, fill_value=0
)

# Fill any remaining NaN values with 0
X_train_encoded_top = X_train_encoded_top.fillna(0)
X_test_encoded_top = X_test_encoded_top.fillna(0)

X_train_encoded_top.shape, X_test_encoded_top.shape


### Random Forest Regressor with Fuzzy Grey Relational Analysis (GRA)

In [None]:
# Train the Random Forest Regressor using top-ranked features
rf_model_top = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_top.fit(X_train_encoded_top, y_train_top)

# Predict on the test set
y_pred_rf_top = rf_model_top.predict(X_test_encoded_top)

# Evaluate the Random Forest model.
# RMSE is sqrt(MSE): mean_squared_error's `squared=False` flag was deprecated
# in scikit-learn 1.4 and removed in 1.6.
mse_rf_top = mean_squared_error(y_test_top, y_pred_rf_top)
rmse_rf_top = np.sqrt(mse_rf_top)
mae_rf_top = mean_absolute_error(y_test_top, y_pred_rf_top)
r2_rf_top = r2_score(y_test_top, y_pred_rf_top)

mse_rf_top, rmse_rf_top, mae_rf_top, r2_rf_top


### Gradient Boosting Regression with Fuzzy Grey Relational Analysis (GRA)

In [None]:
# Train the Gradient Boosting Regressor using top-ranked features
gb_model_top = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model_top.fit(X_train_encoded_top, y_train_top)

# Predict on the test set
y_pred_gb_top = gb_model_top.predict(X_test_encoded_top)

# Evaluate the Gradient Boosting model.
# RMSE is sqrt(MSE): mean_squared_error's `squared=False` flag was deprecated
# in scikit-learn 1.4 and removed in 1.6.
mse_gb_top = mean_squared_error(y_test_top, y_pred_gb_top)
rmse_gb_top = np.sqrt(mse_gb_top)
mae_gb_top = mean_absolute_error(y_test_top, y_pred_gb_top)
r2_gb_top = r2_score(y_test_top, y_pred_gb_top)

mse_gb_top, rmse_gb_top, mae_gb_top, r2_gb_top


### Support Vector Machine Regressor with Fuzzy Grey Relational Analysis (GRA)

In [None]:
# Scale the data for SVR using top-ranked features.
# Dedicated scalers are used here: refitting the shared scaler_X / scaler_y
# would overwrite the statistics the earlier SVR cells rely on, so re-running
# those cells afterwards would silently use the wrong scaling.
scaler_X_top = StandardScaler()
scaler_y_top = StandardScaler()
X_train_scaled_top = scaler_X_top.fit_transform(X_train_encoded_top)
y_train_scaled_top = scaler_y_top.fit_transform(y_train_top.values.reshape(-1, 1))
X_test_scaled_top = scaler_X_top.transform(X_test_encoded_top)

# Train the SVR model (ravel: SVR expects a 1-D target)
svr_model_top = SVR(kernel='rbf')
svr_model_top.fit(X_train_scaled_top, y_train_scaled_top.ravel())

# Predict on the test set and map the predictions back to the original scale
# (inverse_transform needs 2-D input, hence the reshape)
y_pred_svr_top = svr_model_top.predict(X_test_scaled_top)
y_pred_svr_top_original = scaler_y_top.inverse_transform(y_pred_svr_top.reshape(-1, 1))

# Evaluate the SVR model (RMSE = sqrt(MSE); `squared=False` was removed from
# mean_squared_error in scikit-learn 1.6)
mse_svr_top = mean_squared_error(y_test_top, y_pred_svr_top_original)
rmse_svr_top = np.sqrt(mse_svr_top)
mae_svr_top = mean_absolute_error(y_test_top, y_pred_svr_top_original)
r2_svr_top = r2_score(y_test_top, y_pred_svr_top_original)

mse_svr_top, rmse_svr_top, mae_svr_top, r2_svr_top


## Hyperparameter Tuning

In [None]:
# Candidate values for each Random Forest hyperparameter; the search
# evaluates every combination
param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 3-fold cross-validation, scored by negative MSE and
# run on all available cores
base_forest = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    base_forest,
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=3,
    verbose=2,
)

# Run the search on the top-feature training data
grid_search.fit(X_train_encoded_top, y_train_top)

# Best-scoring hyperparameter combination
best_params = grid_search.best_params_
best_params


### Random Forest Regressor with Hyperparameter Tuning

In [None]:
# Train the Random Forest model with the best hyperparameters found by the
# grid search
best_rf_model = RandomForestRegressor(**best_params, random_state=42)
best_rf_model.fit(X_train_encoded_top, y_train_top)

# Predict on the test set
y_pred_best_rf = best_rf_model.predict(X_test_encoded_top)

# Evaluate the optimized Random Forest model.
# RMSE is sqrt(MSE): mean_squared_error's `squared=False` flag was deprecated
# in scikit-learn 1.4 and removed in 1.6.
mse_best_rf = mean_squared_error(y_test_top, y_pred_best_rf)
rmse_best_rf = np.sqrt(mse_best_rf)
mae_best_rf = mean_absolute_error(y_test_top, y_pred_best_rf)
r2_best_rf = r2_score(y_test_top, y_pred_best_rf)

mse_best_rf, rmse_best_rf, mae_best_rf, r2_best_rf


## Model Evaluation

In [None]:
# Plot actual vs. predicted values for the optimized Random Forest model
plt.figure(figsize=(10, 6))
# Keyword x / y: seaborn >= 0.12 rejects positional x, y with a TypeError
sns.scatterplot(x=y_test_top, y=y_pred_best_rf)
# Dashed identity line: a point on it is a perfect prediction
plt.plot([min(y_test_top), max(y_test_top)], [min(y_test_top), max(y_test_top)], '--', color='red')
plt.xlabel('Actual Story Points')
plt.ylabel('Predicted Story Points')
plt.title('Actual vs. Predicted Story Points (Optimized Random Forest)')
plt.show()
