In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the merged measurements and discard 'Filename', which is not used below.
df = pd.read_csv('merged_results.csv').drop(columns=['Filename'])

# Predictor columns and the regression target ('Med_mac').
# NOTE(review): 'Ave_mac' and 'eTime_macos' look closely related to the
# 'Med_mac' target -- confirm they would be available at prediction time.
features = [
    'Ave_mac', 'eTime_macos', 'eTime_windows10', 'ReturnStmt', 'FieldDecl',
    'Class Defs', 'StructDecl', 'Static_Cast<>', 'FloatDecl',
    'MemberRefExpr', 'Max Depth', 'Ave_win', 'Med_win', 'BoolLiteral',
    'letdata',
]
X, y = df[features], df['Med_mac']

# Flag targets lying more than 3 standard deviations away from the mean.
# ddof=0 gives the population standard deviation.
# NOTE(review): the mask is computed on the full target before the split, so
# held-out rows are filtered as well -- confirm this is intended.
y_mean = y.mean()
y_std = y.std(ddof=0)
y_low = y_mean - 3 * y_std
y_high = y_mean + 3 * y_std
outliers = (y < y_low) | (y > y_high)

# Keep only the rows that were not flagged.
X = X.loc[~outliers]
y = y.loc[~outliers]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise with statistics learned from the training rows only, then apply
# the same transform to the test rows. Both results are NumPy arrays.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Display the shapes of the four splits.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((140, 15), (35, 15), (140,), (35,))

In [13]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import GridSearchCV

# Feature selection: keep up to 20 of the highest-scoring features according
# to the univariate F-test. The request is capped at the number of columns in
# X_train because asking SelectKBest for more features than exist is rejected
# with a ValueError by some scikit-learn releases and only warned about by
# others. With fewer than 20 columns this step keeps every feature.
# NOTE(review): the selector is fitted on the whole training set before
# cross-validation; if k ever drops columns, consider moving it into a
# Pipeline so each CV fold selects features on its own training part.
k_requested = 20
selector = SelectKBest(score_func=f_regression, k=min(k_requested, X_train.shape[1]))
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Hyperparameter grid for the Random Forest Regressor
# (3 * 4 * 3 * 3 = 108 candidate combinations).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base model; the fixed seed keeps the forests repeatable.
rf_model = RandomForestRegressor(random_state=42)

# Exhaustive 5-fold cross-validated search over the grid, using all cores.
# No `scoring` is given, so candidates are ranked by the estimator's default
# score.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_selected, y_train)

# Best parameter combination found by the search
best_params = grid_search.best_params_

# GridSearchCV (refit=True by default) has already refitted this estimator on
# the full training set with the best parameters; no extra training is needed.
best_rf_model = grid_search.best_estimator_

# Predict on the held-out test set
y_best_rf_pred = best_rf_model.predict(X_test_selected)

# Evaluate on the test set. MSE is expressed in squared target units, so read
# it together with R^2 rather than on its own.
best_rf_mse = mean_squared_error(y_test, y_best_rf_pred)
best_rf_r2 = r2_score(y_test, y_best_rf_pred)

best_params, best_rf_mse, best_rf_r2




Fitting 5 folds for each of 108 candidates, totalling 540 fits


({'max_depth': None,
  'min_samples_leaf': 4,
  'min_samples_split': 10,
  'n_estimators': 100},
 4.805922025458726e-09,
 0.28108909925272185)