In [125]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
# Load the Desharnais dataset. The file carries no header row, so the column
# names are supplied explicitly; text after '%' is treated as a comment and
# '?' is read as a missing value.
columns = [
    'Project', 'TeamExp', 'ManagerExp', 'YearEnd', 'Length', 'Effort',
    'Transactions', 'Entities', 'PointsAdjust', 'Envergure', 'PointsNonAjust',
    'Language',
]
df = pd.read_csv(
    './desharnais.txt',
    names=columns,
    comment='%',
    na_values='?',
    skipinitialspace=True,
    delimiter=',',
)

df.head()

In [None]:
# Column dtypes and non-null counts, followed by the number of missing values
# per column (the cell output is the isnull().sum() result).
df.info()
df.isnull().sum()

In [None]:
# Summary statistics for the dataset: describe() reports count, mean, std,
# min, the 25th/50th/75th percentiles (the 50th is the median) and max.
df.describe()

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Heatmap of the pairwise column correlations (DataFrame.corr), used to judge
# how strongly the variables are related to one another.
sns.heatmap(df.corr())

In [None]:
# One histogram per column (the leading 'Project' column is skipped) to check
# how the values are distributed / whether they are imbalanced.
for feature in columns[1:]:
    df[feature].plot.hist(bins=20, title=f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()


In [None]:
# One large boxplot per column to make outliers visible.
for name in columns:
    fig, ax = plt.subplots(figsize=(15, 15))
    sns.boxplot(y=df[name], ax=ax)
    ax.set_title(f'Boxplot of {name}')
    ax.set_xlabel(name)
    plt.show()

In [None]:
from scipy.stats import trim_mean

# Trimmed mean of every column except the leading 'Project' column: the lowest
# and highest `trim_frac` share of the observations is discarded before
# averaging.
trim_frac=0.10
for col in columns[1:]:
    # df was loaded with na_values='?', so columns may contain NaN. trim_mean
    # does not ignore NaN: missing entries would be counted as observations
    # (ordered after every real value), biasing or invalidating the result.
    observed = df[col].dropna()
    print(f'Trimmed Mean for {col} : {trim_mean(observed, proportiontocut=trim_frac)}')

In [None]:
# Build a frame that holds, for each column, the sorted values that survive a
# symmetric trim of `trim_frac` at both ends; the trimmed positions are NaN.
# Every column is sorted independently, so a row of the result is NOT an
# original record - it is only a per-column view of the retained values.
trim_frac = 0.1

trimmed_columns = {}

for col in df.columns:
    # NaN values are ordered last by np.sort, i.e. they count as upper-tail values.
    sorted_data = np.sort(df[col].to_numpy())

    lower_idx = int(len(sorted_data) * trim_frac)
    upper_idx = int(len(sorted_data) * (1 - trim_frac))

    # An integer array cannot store NaN (the placeholder would be cast to a
    # meaningless integer), so integer columns are widened to float; all other
    # dtypes are kept as they are.
    if np.issubdtype(sorted_data.dtype, np.integer):
        fill_dtype = float
    else:
        fill_dtype = sorted_data.dtype

    trimmed_column = np.full_like(sorted_data, np.nan, dtype=fill_dtype)
    trimmed_column[lower_idx:upper_idx] = sorted_data[lower_idx:upper_idx]

    trimmed_columns[col] = trimmed_column

trimmed_df = pd.DataFrame(trimmed_columns)

print(trimmed_df)


In [None]:
# Trimmed median and trimmed sample standard deviation (ddof=1) per column,
# using the `trim_frac` defined in the cell above.
trimmed_median_dict = {}
trimmed_std_dict = {}

for col in df.columns:
    # Drop missing values first: np.sort places NaN last, so they would
    # otherwise use up upper-tail trim slots and shift the retained window.
    sorted_data = np.sort(df[col].dropna().to_numpy())

    lower_idx = int(len(sorted_data) * trim_frac)
    upper_idx = int(len(sorted_data) * (1 - trim_frac))

    trimmed_data = sorted_data[lower_idx:upper_idx]

    trimmed_median_dict[col] = np.median(trimmed_data)
    trimmed_std_dict[col] = np.std(trimmed_data, ddof=1)

trimmed_median_df = pd.DataFrame(list(trimmed_median_dict.items()), columns=['Column', 'Trimmed Median'])
trimmed_std_df = pd.DataFrame(list(trimmed_std_dict.items()), columns=['Column', 'Trimmed Std Deviation'])

# Row 0 belongs to the first column ('Project') and is left out of the report.
print("Trimmed Medians:")
print(trimmed_median_df[1:])

print("\nTrimmed Standard Deviations:")
print(trimmed_std_df[1:])


COCOMO81


In [None]:
import arff

# Parse the ARFF file and build a DataFrame from its data rows; the column
# names are the first element of each attribute entry.
with open('./cocomo811.arff') as arff_file:
    dataset = arff.load(arff_file)

cocomo_column_names = [attribute[0] for attribute in dataset['attributes']]
cocomo = pd.DataFrame(dataset['data'], columns=cocomo_column_names)

cocomo.head()

In [None]:
# Column dtypes and non-null counts, followed by the number of missing values
# per column (the cell output is the isnull().sum() result).
cocomo.info()
cocomo.isnull().sum()

In [None]:
# Summary statistics for the dataset: describe() reports count, mean, std,
# min, the 25th/50th/75th percentiles (the 50th is the median) and max.
cocomo.describe()

In [None]:
# Number of rows that are exact duplicates of an earlier row
cocomo.duplicated().sum()

In [None]:
# Heatmap of the pairwise column correlations (DataFrame.corr), used to judge
# how strongly the variables are related to one another.
sns.heatmap(cocomo.corr())

In [None]:
# One histogram per column to check how the values are distributed / whether
# they are imbalanced.
for feature in cocomo.columns:
    cocomo[feature].plot.hist(bins=20, title=f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# One large boxplot per column to make outliers visible.
for name in cocomo.columns:
    fig, ax = plt.subplots(figsize=(15, 15))
    sns.boxplot(y=cocomo[name], ax=ax)
    ax.set_title(f'Boxplot of {name}')
    ax.set_xlabel(name)
    plt.show()

In [None]:
# Trimmed mean per column: the lowest and highest 1% of the observations are
# cut before averaging.
trim_frac = 0.01
for name in cocomo.columns:
    column_trimmed_mean = trim_mean(cocomo[name], proportiontocut=trim_frac)
    print(f'Trimmed Mean for {name} : {column_trimmed_mean}')

In [None]:
# Build a frame that holds, for each column, the sorted values that survive a
# symmetric trim of `trim_frac` at both ends; the trimmed positions are NaN.
# Every column is sorted independently, so a row of the result is NOT an
# original record - it is only a per-column view of the retained values.
trim_frac = 0.05

trimmed_columns = {}

for col in cocomo.columns:
    sorted_data = np.sort(cocomo[col].to_numpy())

    lower_idx = int(len(sorted_data) * trim_frac)
    upper_idx = int(len(sorted_data) * (1 - trim_frac))

    # An integer array cannot store NaN (the placeholder would be cast to a
    # meaningless integer and survive a later dropna()), so integer columns
    # are widened to float; all other dtypes are kept as they are.
    if np.issubdtype(sorted_data.dtype, np.integer):
        fill_dtype = float
    else:
        fill_dtype = sorted_data.dtype

    trimmed_column_cocomo = np.full_like(sorted_data, np.nan, dtype=fill_dtype)
    trimmed_column_cocomo[lower_idx:upper_idx] = sorted_data[lower_idx:upper_idx]

    trimmed_columns[col] = trimmed_column_cocomo

trimmed_cocomo = pd.DataFrame(trimmed_columns)

trimmed_cocomo

In [None]:
# Keep only the rows without any NaN, i.e. discard the positions that were
# blanked out by the trimming step.
trim_cocomo_cleaned=trimmed_cocomo.dropna()

trim_cocomo_cleaned

In [None]:

# Median and sample standard deviation (ddof=1) per column after cutting
# `trim_frac` of the sorted values at each end.
# NOTE(review): the input frame has already been trimmed once, so this trims a
# second time - confirm that double trimming is intended.
trimmed_median_dict = {}
trimmed_std_dict = {}

for column_name in trim_cocomo_cleaned.columns:
    ordered = np.sort(trim_cocomo_cleaned[column_name])
    n_obs = len(ordered)

    kept = ordered[int(n_obs * trim_frac):int(n_obs * (1 - trim_frac))]

    trimmed_median_dict[column_name] = np.median(kept)
    trimmed_std_dict[column_name] = np.std(kept, ddof=1)

trimmed_median_cocomo = pd.DataFrame(
    list(trimmed_median_dict.items()),
    columns=['Column', 'Trimmed Median'],
)
trimmed_std_cocomo = pd.DataFrame(
    list(trimmed_std_dict.items()),
    columns=['Column', 'Trimmed Std Deviation'],
)

trimmed_median_cocomo

In [None]:
# Display the per-column trimmed standard deviations computed above
trimmed_std_cocomo

China

In [None]:
import arff

# Parse the ARFF file and build a DataFrame from its data rows; the column
# names are the first element of each attribute entry.
with open('./china.arff') as arff_file:
    dataset = arff.load(arff_file)

china_column_names = [attribute[0] for attribute in dataset['attributes']]
china = pd.DataFrame(dataset['data'], columns=china_column_names)

china.head()

In [None]:
# Column dtypes and non-null counts, followed by the number of missing values
# per column (the cell output is the isnull().sum() result).
china.info()
china.isnull().sum()

In [None]:
# Summary statistics for the dataset: describe() reports count, mean, std,
# min, the 25th/50th/75th percentiles (the 50th is the median) and max.
china.describe()

In [None]:
# Number of rows that are exact duplicates of an earlier row
china.duplicated().sum()

In [None]:
# # Adequacy of data using heatmap
# sns.heatmap(china.corr())

In [None]:
# Inspect the column dtypes and the first rows.
print(china.dtypes)
print(china.head())

# Encountered categorical data that is not in numeric form.


In [None]:
# List the distinct DevType categories before converting them to numbers
print(china['DevType'].unique())


In [None]:
# Encode DevType as integer codes; devtype_mapping holds the distinct labels,
# positioned so that a label's index is its code.
china['devtype_numeric'], devtype_mapping = pd.factorize(china['DevType'])
print(devtype_mapping)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Re-encode DevType with a LabelEncoder; this overwrites the codes that were
# stored in 'devtype_numeric' by the factorize cell. le.classes_ lists the
# labels in code order.
le = LabelEncoder().fit(china['DevType'])
china['devtype_numeric'] = le.transform(china['DevType'])
print(le.classes_)


In [157]:
# Remove the original categorical column now that 'devtype_numeric' replaces
# it. errors='ignore' keeps the cell re-runnable: without it a second
# execution raises KeyError because 'DevType' is already gone.
china = china.drop(columns=['DevType'], errors='ignore')


In [None]:
china.head()
# DevType is now represented by the numeric column 'devtype_numeric'.
# NOTE(review): the original note claimed 0='NewDev' and 1='Maint'; the codes
# in use come from the LabelEncoder cell, so confirm the mapping against the
# printed le.classes_ (index = code).


In [None]:
# Heatmap of the pairwise column correlations (DataFrame.corr), used to judge
# how strongly the variables are related to one another.
# NOTE(review): a later cell reports that 'Resource' is still of dtype object
# at this point - confirm corr() handles it as intended here.
sns.heatmap(china.corr())

In [None]:
# # Checking for class imbalance
# for col in china.columns:
#     china[col].plot(kind='hist', bins=20, title=f'Distribution of {col}')
#     plt.xlabel(col)
#     plt.ylabel('Frequency')
#     # plt.show()

In [161]:
# Problem: 'Resource' looks numeric ({1,2,3,4}) but is stored with dtype
# object, so every column is converted to a numeric dtype. Values that cannot
# be parsed become NaN (errors='coerce').
china_as_numeric = china[china.columns].apply(pd.to_numeric, errors='coerce')
china[china.columns] = china_as_numeric

In [None]:
# One histogram per column to check how the values are distributed / whether
# they are imbalanced.
for feature in china.columns:
    china[feature].plot.hist(bins=20, title=f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# One large boxplot per column to make outliers visible.
for name in china.columns:
    fig, ax = plt.subplots(figsize=(15, 15))
    sns.boxplot(y=china[name], ax=ax)
    ax.set_title(f'Boxplot of {name}')
    ax.set_xlabel(name)
    plt.show()

In [None]:
# Trimmed mean per column: the lowest and highest 10% of the observations are
# cut before averaging.
trim_frac = 0.1
for name in china.columns:
    column_trimmed_mean = trim_mean(china[name], proportiontocut=trim_frac)
    print(f'Trimmed Mean for {name} : {column_trimmed_mean}')

In [None]:
# Build a frame that holds, for each column, the sorted values that survive a
# symmetric trim of `trim_frac` at both ends; the trimmed positions are NaN.
# Every column is sorted independently, so a row of the result is NOT an
# original record - it is only a per-column view of the retained values.
trim_frac=0.01

trimmed_columns = {}

for col in china.columns:
    # NaN values are ordered last by np.sort, i.e. they count as upper-tail values.
    sorted_data = np.sort(china[col].to_numpy())

    lower_idx = int(len(sorted_data) * trim_frac)
    upper_idx = int(len(sorted_data) * (1 - trim_frac))

    # An integer array cannot store NaN (the placeholder would be cast to a
    # meaningless integer and survive a later dropna()), so integer columns
    # are widened to float; all other dtypes are kept as they are.
    if np.issubdtype(sorted_data.dtype, np.integer):
        fill_dtype = float
    else:
        fill_dtype = sorted_data.dtype

    trimmed_column_china = np.full_like(sorted_data, np.nan, dtype=fill_dtype)
    trimmed_column_china[lower_idx:upper_idx] = sorted_data[lower_idx:upper_idx]

    trimmed_columns[col] = trimmed_column_china

trimmed_china = pd.DataFrame(trimmed_columns)

trimmed_china

In [None]:
# Keep only the rows without any NaN, i.e. discard the positions that were
# blanked out by the trimming step.
trim_china_cleaned=trimmed_china.dropna()

trim_china_cleaned

In [None]:
# Median and sample standard deviation (ddof=1) per column after cutting
# `trim_frac` of the sorted values at each end.
# NOTE(review): the input frame has already been trimmed once, so this trims a
# second time - confirm that double trimming is intended.
trimmed_median_dict = {}
trimmed_std_dict = {}

for column_name in trim_china_cleaned.columns:
    ordered = np.sort(trim_china_cleaned[column_name])
    n_obs = len(ordered)

    kept = ordered[int(n_obs * trim_frac):int(n_obs * (1 - trim_frac))]

    trimmed_median_dict[column_name] = np.median(kept)
    trimmed_std_dict[column_name] = np.std(kept, ddof=1)

trimmed_median_china = pd.DataFrame(
    list(trimmed_median_dict.items()),
    columns=['Column', 'Trimmed Median'],
)
trimmed_std_china = pd.DataFrame(
    list(trimmed_std_dict.items()),
    columns=['Column', 'Trimmed Std Deviation'],
)

trimmed_median_china

In [None]:
# Display the per-column trimmed standard deviations computed above
trimmed_std_china

Kitchenham

In [None]:
import arff

# Parse the ARFF file and build a DataFrame from its data rows; the column
# names are the first element of each attribute entry.
with open('./kitchenham.arff') as arff_file:
    dataset = arff.load(arff_file)

kitchenham_column_names = [attribute[0] for attribute in dataset['attributes']]
kitchenham = pd.DataFrame(dataset['data'], columns=kitchenham_column_names)

kitchenham.head()

In [None]:
# Column dtypes and non-null counts, followed by the number of missing values
# per column (the cell output is the isnull().sum() result).
kitchenham.info()
kitchenham.isnull().sum()

In [None]:
# Summary statistics for the dataset: describe() reports count, mean, std,
# min, the 25th/50th/75th percentiles (the 50th is the median) and max.
kitchenham.describe()

In [None]:
# Number of rows that are exact duplicates of an earlier row
kitchenham.duplicated().sum()

In [None]:
# Heatmap of the pairwise column correlations (DataFrame.corr), used to judge
# how strongly the variables are related to one another.
# NOTE(review): corr() needs numeric input - confirm that every kitchenham
# column is numeric at this point.
sns.heatmap(kitchenham.corr())

In [None]:
# One histogram per column to check how the values are distributed / whether
# they are imbalanced.
for feature in kitchenham.columns:
    kitchenham[feature].plot.hist(bins=20, title=f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# One large boxplot per column to make outliers visible.
for name in kitchenham.columns:
    fig, ax = plt.subplots(figsize=(15, 15))
    sns.boxplot(y=kitchenham[name], ax=ax)
    ax.set_title(f'Boxplot of {name}')
    ax.set_xlabel(name)
    plt.show()

In [None]:
# Trimmed mean per column: the lowest and highest 5% of the observations are
# cut before averaging.
trim_frac = 0.05
for name in kitchenham.columns:
    column_trimmed_mean = trim_mean(kitchenham[name], proportiontocut=trim_frac)
    print(f'Trimmed Mean for {name} : {column_trimmed_mean}')

In [None]:
# Build a frame that holds, for each column, the sorted values that survive a
# symmetric trim of `trim_frac` (set in an earlier cell) at both ends; the
# trimmed positions are NaN. Every column is sorted independently, so a row of
# the result is NOT an original record - it is only a per-column view of the
# retained values.
trimmed_columns = {}

for col in kitchenham.columns:
    sorted_data = np.sort(kitchenham[col].to_numpy())

    lower_idx = int(len(sorted_data) * trim_frac)
    upper_idx = int(len(sorted_data) * (1 - trim_frac))

    # An integer array cannot store NaN (the placeholder would be cast to a
    # meaningless integer and survive a later dropna()), so integer columns
    # are widened to float; all other dtypes are kept as they are.
    if np.issubdtype(sorted_data.dtype, np.integer):
        fill_dtype = float
    else:
        fill_dtype = sorted_data.dtype

    trimmed_column_kitchenham = np.full_like(sorted_data, np.nan, dtype=fill_dtype)
    trimmed_column_kitchenham[lower_idx:upper_idx] = sorted_data[lower_idx:upper_idx]

    trimmed_columns[col] = trimmed_column_kitchenham

trimmed_kitchenham = pd.DataFrame(trimmed_columns)

trimmed_kitchenham

In [None]:
# Keep only the rows without any NaN, i.e. discard the positions that were
# blanked out by the trimming step.
trim_kitchenham_cleaned=trimmed_kitchenham.dropna()

trim_kitchenham_cleaned

In [None]:
# Median and sample standard deviation (ddof=1) per column after cutting
# `trim_frac` of the sorted values at each end.
# NOTE(review): the input frame has already been trimmed once, so this trims a
# second time - confirm that double trimming is intended.
trimmed_median_dict = {}
trimmed_std_dict = {}

for column_name in trim_kitchenham_cleaned.columns:
    ordered = np.sort(trim_kitchenham_cleaned[column_name])
    n_obs = len(ordered)

    kept = ordered[int(n_obs * trim_frac):int(n_obs * (1 - trim_frac))]

    trimmed_median_dict[column_name] = np.median(kept)
    trimmed_std_dict[column_name] = np.std(kept, ddof=1)

trimmed_median_kitchenham = pd.DataFrame(
    list(trimmed_median_dict.items()),
    columns=['Column', 'Trimmed Median'],
)
trimmed_std_kitchenham = pd.DataFrame(
    list(trimmed_std_dict.items()),
    columns=['Column', 'Trimmed Std Deviation'],
)

trimmed_median_kitchenham

In [None]:
# Display the per-column trimmed standard deviations computed above
trimmed_std_kitchenham

In [181]:
# Modelling imports, plus a fresh load of the Desharnais file.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# This rebinds `df`. Unlike the first load, na_values is not passed, so '?'
# entries are not converted to NaN by read_csv here.
df=pd.read_csv('./desharnais.txt', names=columns, comment='%', skipinitialspace=True, delimiter=',')



In [None]:

# Quick look at the missing-value counts and at the raw values of the two
# experience columns (missing entries are encoded as '?').
df.isna().sum()

print(df['TeamExp'].unique())
print(df['ManagerExp'].unique())

# For each experience column: turn '?' into NaN, coerce to a numeric dtype and
# replace NaN with the column median (or mean if you prefer).
# The result is assigned back to the frame instead of calling
# df[col].fillna(..., inplace=True): that form mutates a column selection,
# which pandas deprecates and which leaves df unchanged under Copy-on-Write.
for exp_col in ('TeamExp', 'ManagerExp'):
    numeric_values = pd.to_numeric(df[exp_col].replace('?', np.nan), errors='coerce')
    df[exp_col] = numeric_values.fillna(numeric_values.median())



In [None]:
# Print the frame to inspect the result of the imputation step
print(df)

In [None]:
# Replace NaN in 'TeamExp' with the median value. The filled column is
# assigned back to the frame: fillna(inplace=True) on df[col] mutates a column
# selection, which pandas deprecates and which has no effect under
# Copy-on-Write.
df['TeamExp'] = df['TeamExp'].fillna(df['TeamExp'].median())

# Replace NaN in 'ManagerExp' with the median value
df['ManagerExp'] = df['ManagerExp'].fillna(df['ManagerExp'].median())


In [None]:
# Re-check the distinct values of both experience columns after imputation
print(df['TeamExp'].unique())
print(df['ManagerExp'].unique())

In [None]:
# 'Effort' is the target variable. 'Project' is the first, identifier-like
# column of the file; it is excluded from the predictors, consistent with
# every explicit feature list used later in this notebook.
X = df.drop(columns=['Project', 'Effort'])
y = df['Effort']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)


In [187]:
# Predict on the held-out rows and score the predictions.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)  # RMSE is the square root of the MSE
r2 = r2_score(y_test, y_pred)

In [None]:
# Print the evaluation metrics computed on the test split
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

# Print the fitted model parameters (one coefficient per feature column)
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of `model` on the full feature set X and target y.
# The scorer returns negated MSE values, hence the sign flip below.
scores = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')

# Per-fold RMSE
fold_mse = -scores
rmse_scores = np.sqrt(fold_mse)

print(f'Cross-Validation RMSE Scores: {rmse_scores}')
print(f'Mean RMSE: {rmse_scores.mean()}')
print(f'Standard Deviation of RMSE: {rmse_scores.std()}')


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the dataset
data = pd.read_csv(
    './desharnais.txt',
    names=columns,
    comment='%',
    skipinitialspace=True,
    delimiter=',',
)

# Preprocessing: '?' placeholders become NaN; the two experience columns are
# then coerced to numeric and their missing values filled with 0.
data = data.replace('?', np.nan)
for experience_col in ('TeamExp', 'ManagerExp'):
    data[experience_col] = pd.to_numeric(data[experience_col], errors='coerce').fillna(0)

# Features and target variable
feature_names = [
    'TeamExp', 'ManagerExp', 'YearEnd', 'Length', 'Transactions', 'Entities',
    'PointsAdjust', 'Envergure', 'PointsNonAjust', 'Language',
]
X = data[feature_names]
y = data['Effort']

# 70-30 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Single hold-out evaluation: fit on the training rows, predict the test rows
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse_before = mean_squared_error(y_test, y_pred)
mae_before = mean_absolute_error(y_test, y_pred)

# 10-fold cross-validation on the full data; scores are negated MSE values
cv_scores = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')
rmse_cv_scores = np.sqrt(-cv_scores)

# Mean and spread of the per-fold RMSE
mean_rmse_cv = rmse_cv_scores.mean()
std_rmse_cv = rmse_cv_scores.std()

# Report both evaluations
print("Performance Without Cross-Validation:")
print(f"Mean Squared Error: {mse_before:.2f}")
print(f"Mean Absolute Error: {mae_before:.2f}")

print("\nPerformance With 10-Fold Cross-Validation:")
print(f"Mean RMSE: {mean_rmse_cv:.2f}")
print(f"Standard Deviation of RMSE: {std_rmse_cv:.2f}")

# Optional: line plot of the RMSE obtained in each fold
import matplotlib.pyplot as plt

fold_numbers = range(1, 11)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fold_numbers, rmse_cv_scores, marker='o', linestyle='-', color='blue')
ax.set_title('RMSE for Each Fold in 10-Fold Cross-Validation')
ax.set_xlabel('Fold Number')
ax.set_ylabel('RMSE')
ax.set_xticks(fold_numbers)
ax.grid()
plt.show()


In [None]:
# Reduced feature set (without 'YearEnd' and 'PointsNonAjust')
feature_names = [
    'TeamExp', 'ManagerExp', 'Length', 'Transactions', 'Entities',
    'PointsAdjust', 'Envergure', 'Language',
]
X = data[feature_names]
y = data['Effort']

# 80-20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a linear regression and predict the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# R^2 on the single hold-out split, expressed as a percentage
r2_before = r2_score(y_test, y_pred)
accuracy_before = r2_before * 100

print(f'R-squared before K-fold: {accuracy_before:.2f}%')

# Mean R^2 over 10 cross-validation folds, expressed as a percentage
k_fold_r2_scores = cross_val_score(model, X, y, cv=10, scoring='r2')
accuracy_after = k_fold_r2_scores.mean() * 100

print(f'Average R-squared after K-fold: {accuracy_after:.2f}%')

In [None]:
from sklearn.svm import SVR
# Linear-kernel support vector regression on the current train/test split
# (other kernels such as 'rbf' or 'poly' can be substituted).
svr_model = SVR(kernel='linear')
svr_model.fit(X_train, y_train)

# Hold-out predictions and their scores
y_pred = svr_model.predict(X_test)
r2_before = r2_score(y_test, y_pred)
accuracy_before = r2_before * 100  # as a percentage
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'R-squared before K-fold (SVR): {accuracy_before:.2f}%')

# Mean R^2 over 10 cross-validation folds, as a percentage
k_fold_r2_scores = cross_val_score(svr_model, X, y, cv=10, scoring='r2')
accuracy_after = k_fold_r2_scores.mean() * 100

print(f'Average R-squared after K-fold (SVR): {accuracy_after:.2f}%')

# RMSE of the hold-out predictions
print(f'Root Mean Squared Error (SVR): {rmse:.2f}')

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor (fixed seed) on the current train/test split
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Hold-out predictions and their scores
y_pred_dt = dt_model.predict(X_test)
r2_dt_before = r2_score(y_test, y_pred_dt)
accuracy_dt_before = r2_dt_before * 100  # as a percentage
rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))

print(f'R-squared before K-fold (Decision Tree): {accuracy_dt_before:.2f}%')

# Mean R^2 over 10 cross-validation folds, as a percentage
k_fold_r2_dt_scores = cross_val_score(dt_model, X, y, cv=10, scoring='r2')
accuracy_dt_after = k_fold_r2_dt_scores.mean() * 100

print(f'Average R-squared after K-fold (Decision Tree): {accuracy_dt_after:.2f}%')

# RMSE of the hold-out predictions
print(f'Root Mean Squared Error (Decision Tree): {rmse_dt:.2f}')

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Uses the `df` frame prepared in the earlier cells (no reload here).

# Features and target variable
feature_names = [
    'TeamExp', 'ManagerExp', 'Length', 'Transactions', 'Entities',
    'PointsAdjust', 'Envergure', 'Language',
]
X = df[feature_names]
y = df['Effort']

# 80-20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision tree regressor (fixed seed)
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Hold-out predictions and their scores
y_pred_dt = dt_model.predict(X_test)
r2_dt_before = r2_score(y_test, y_pred_dt)
rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))

print(f'R-squared before K-fold (Decision Tree): {r2_dt_before:.4f}')

# Mean R^2 over 10 cross-validation folds
k_fold_r2_dt_scores = cross_val_score(dt_model, X, y, cv=10, scoring='r2')
average_r2_dt_after = k_fold_r2_dt_scores.mean()

print(f'Average R-squared after K-fold (Decision Tree): {average_r2_dt_after:.4f}')

# RMSE of the hold-out predictions
print(f'Root Mean Squared Error (Decision Tree): {rmse_dt:.4f}')


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the Desharnais data. '?' marks a missing value in the experience
# columns; declaring it through na_values makes those columns numeric (NaN for
# missing) instead of strings, which the regressors below cannot be fitted on.
data = pd.read_csv(
    './desharnais.txt',
    names=columns,
    comment='%',
    na_values='?',
    skipinitialspace=True,
    delimiter=',',
)

# Train-test split
X = data[['TeamExp', 'ManagerExp', 'Length', 'Transactions', 'Entities', 'PointsAdjust', 'Envergure', 'Language']]
y = data['Effort']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values with medians learned from the training rows only, so
# that nothing from the held-out rows leaks into training.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Initialize models, including Random Forest, Gradient Boosting, and KNN.
# The randomized estimators get a fixed seed so that a re-run reproduces the
# same numbers.
models = {
    'Linear Regression': LinearRegression(),
    'Support Vector Regression': SVR(kernel='linear'),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'KNN': KNeighborsRegressor()
}

# Train models and evaluate performance
results = {}

for model_name, model in models.items():
    # 10-fold cross-validation on the training split; the scorer returns
    # negated MSE values, hence the sign flip.
    cv_scores = cross_val_score(model, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
    mean_cv_score = -np.mean(cv_scores)

    # Fit on the full training split
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Hold-out performance metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Store results
    results[model_name] = {
        'Mean CV MSE': mean_cv_score,
        'RMSE': rmse,
        'R^2': r2
    }

# Print results
for model_name, metrics in results.items():
    print(f"{model_name}:")
    print(f"  Mean CV MSE: {metrics['Mean CV MSE']:.2f}")
    print(f"  RMSE: {metrics['RMSE']:.2f}")
    print(f"  R^2: {metrics['R^2']:.2f}\n")
