In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Function to evaluate the model
def evaluate_model(X_train, X_test, y_train, y_test, model):
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Make predictions
    
    # Calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return mse, mae, r2

datasets = {
    'StandardScaler_ZScore': 'termproject/preprocessed_data_standardized.csv',
    'Normalizer_ZScore': 'termproject/preprocessed_data_normalized.csv',
    'MinMaxScaler_ZScore': 'termproject/preprocessed_data_2.csv',
    'MinMaxScaler_LabelEncoder': 'termproject/preprocessed_data.csv',
    'Normalizer_IQR': 'termproject/preprocessed_data_normalized_iqr.csv',
    'StandardScaler_IQR': 'termproject/preprocessed_data_standard_iqr.csv',
}

# Define models to be used
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# Initialize a list to store results
results = []

# Evaluate each combination of dataset and model
for dataset_name, filepath in datasets.items():
    try:
        # Load the dataset
        data = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"File {filepath} not found.")
        continue
    
    # Separate features and target variable
    X = data.drop(columns=['salary_in_usd'])
    y = data['salary_in_usd']
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Evaluate each model on the dataset
    for model_name, model in models.items():
        mse, mae, r2 = evaluate_model(X_train, X_test, y_train, y_test, model)
        results.append({
            'Dataset': dataset_name,
            'Model': model_name,
            'MSE': mse,
            'MAE': mae,
            'R^2': r2
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Rank each metric
results_df['MSE_rank'] = results_df['MSE'].rank(ascending=True)  # MSE: lower is better
results_df['MAE_rank'] = results_df['MAE'].rank(ascending=True)  # MAE: lower is better
results_df['R^2_rank'] = results_df['R^2'].rank(ascending=False)  # R^2: higher is better

# Calculate the total rank
results_df['Total_rank'] = results_df[['MSE_rank', 'MAE_rank', 'R^2_rank']].mean(axis=1)

# Select the top 5 combinations by overall rank
top_5_overall = results_df.nsmallest(5, 'Total_rank')

# Print the top 5 combinations
print("Top 5 Combinations by Overall Rank:")
print(top_5_overall)


Top 5 Combinations by Overall Rank:
                  Dataset                      Model           MSE  \
23     StandardScaler_IQR  GradientBoostingRegressor  2.504240e+09   
19         Normalizer_IQR  GradientBoostingRegressor  2.520261e+09   
22     StandardScaler_IQR      RandomForestRegressor  2.523651e+09   
18         Normalizer_IQR      RandomForestRegressor  2.539994e+09   
3   StandardScaler_ZScore  GradientBoostingRegressor  2.662289e+09   

             MAE       R^2  MSE_rank  MAE_rank  R^2_rank  Total_rank  
23  39228.714645  0.271588       1.0       1.0       6.0    2.666667  
19  39340.376000  0.266928       2.0       4.0       9.0    5.000000  
22  39266.381384  0.265942       3.0       3.0      10.0    5.333333  
18  39263.755028  0.261188       4.0       2.0      12.0    6.000000  
3   40652.725491  0.275543       7.0      12.0       1.0    6.666667  
