In [4]:
import os 
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np 

In [5]:
# Placeholder results table: no rows yet, one column per reported metric.
# Note: the evaluation cell further down rebuilds `results_df` from a list of
# per-iteration records, so this empty frame only fixes the expected schema.
results_df = pd.DataFrame(
    data=None,
    columns=[
        'Iteration',
        'RMSE',
        'Denormalized_RMSE',
    ],
)

In [6]:
# Evaluate a Random Forest regressor on every pre-generated train/test split.
# For each split: load the two pickles, fit on the training frame, predict on
# the test frame, and record the RMSE on the normalized target together with
# that RMSE rescaled by the training-set standard deviation of 'last_rtt'.
# All rows are then written to model_results/rf_evaluation_results.csv.

N_SPLITS = 30                       # number of train/test pickle pairs to evaluate
PICKLE_DIR = 'train_test_pickles'   # holds train_df_<i>.pickle / test_df_<i>.pickle

# Columns excluded from the feature matrix: the target ('normalizzed_rtt'),
# 'last_rtt' (presumably the un-normalized counterpart of the target -- confirm),
# and 'date' / 'src_names'. Defined once: the list does not change per split.
# The spelling 'normalizzed_rtt' matches the column name in the data; do not fix it here.
cols_dropped = ['date','last_rtt','normalizzed_rtt','src_names']

# One dict per split; converted to a DataFrame in a single step after the loop.
results_list = []

for iteration in range(N_SPLITS):
    # Locate the train and test pickles for the current split
    train_pickle_path = os.path.join(PICKLE_DIR, f'train_df_{iteration}.pickle')
    test_pickle_path = os.path.join(PICKLE_DIR, f'test_df_{iteration}.pickle')

    # NOTE(review): unpickling can execute arbitrary code -- only load pickles
    # produced by a trusted pipeline.
    train_df = pd.read_pickle(train_pickle_path)
    test_df = pd.read_pickle(test_pickle_path)

    # Training-set statistics of the raw RTT. Only the std is used below (to
    # rescale the RMSE); the mean is not used in this cell and is retained in
    # case later cells read it.
    trainrtt_mean = train_df['last_rtt'].mean()
    trainrtt_std = train_df['last_rtt'].std()

    X_train = train_df.drop(columns=cols_dropped)
    y_train = train_df['normalizzed_rtt']

    X_test = test_df.drop(columns=cols_dropped)
    y_test = test_df['normalizzed_rtt']

    # Train a fresh Random Forest model for this split
    rand_forest = RandomForestRegressor(
        n_estimators=10,             # The number of trees in the forest.
        criterion='squared_error',   # Split quality measure (mean squared error).
        max_depth=None,              # None: nodes are expanded until all leaves are pure.
        random_state=42,             # Fixed seed so every run builds the same forest.
        max_leaf_nodes=None,         # None: no cap on the number of leaf nodes.
        min_impurity_decrease=0.0,   # Split if impurity decreases by at least this value.
        bootstrap=True,              # Use bootstrap samples when building trees.
        oob_score=False,             # Do not compute an out-of-bag score.
        n_jobs=None,                 # Number of parallel jobs for fit/predict (None: default).
    )
    rand_forest.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = rand_forest.predict(X_test)

    # RMSE on the normalized target, then rescaled by the training std.
    # NOTE(review): the rescaling assumes 'normalizzed_rtt' was produced by
    # dividing by this same training std (pandas default, ddof=1) -- confirm
    # against the code that built the pickles.
    rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred))
    drmse_rf = rmse_rf * trainrtt_std

    # Record the metrics for this split
    results_list.append({'Iteration': iteration, 'RMSE': rmse_rf, 'Denormalized_RMSE': drmse_rf})


# Convert the list of dictionaries to a DataFrame. The columns are pinned so
# the CSV header is still written if no splits were evaluated.
results_df = pd.DataFrame(results_list, columns=['Iteration', 'RMSE', 'Denormalized_RMSE'])

# Save the results to a CSV file. Create the output folder first: to_csv does
# not create missing parent directories, and failing here would discard the
# work of every fit above.
results_folder = 'model_results'
os.makedirs(results_folder, exist_ok=True)
results_csv_path = os.path.join(results_folder, 'rf_evaluation_results.csv')
results_df.to_csv(results_csv_path, index=False)

print('Evaluation results saved to:', results_csv_path)


Evaluation results saved to: model_results/rf_evaluation_results.csv
