In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Function to evaluate classification models
def evaluate_classification_model(X_train, X_test, y_train, y_test, model):
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Make predictions

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    return accuracy, precision, recall, f1

# Define dataset paths
datasets = {
    'StandardScaler_ZScore': 'termproject/preprocessed_data_standardized.csv',
    'Normalizer_ZScore': 'termproject/preprocessed_data_normalized.csv',
    'MinMaxScaler_ZScore': 'termproject/preprocessed_data_2.csv',
    'MinMaxScaler_LabelEncoder': 'termproject/preprocessed_data.csv',
    'Normalizer_IQR': 'termproject/preprocessed_data_normalized_iqr.csv',
    'StandardScaler_IQR': 'termproject/preprocessed_data_standard_iqr.csv',
}

# Define models to be used
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier()
}

# Initialize a list to store results
results = []

# Evaluate each combination of dataset and model
for dataset_name, filepath in datasets.items():
    try:
        # Load the dataset
        data = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"File {filepath} not found.")
        continue

    # Separate features and target variable
    X = data.drop(columns=['salary_in_usd'])
    y = data['salary_in_usd']
    
    # Convert target variable to categorical (for classification)
    y = pd.qcut(y, q=3, labels=[0, 1, 2])

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Evaluate each model
    for model_name, model in models.items():
        accuracy, precision, recall, f1 = evaluate_classification_model(X_train, X_test, y_train, y_test, model)
        results.append({
            'Dataset': dataset_name,
            'Model': model_name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Rank each metric
results_df['Accuracy_rank'] = results_df['Accuracy'].rank(ascending=False)  # Accuracy: higher is better
results_df['Precision_rank'] = results_df['Precision'].rank(ascending=False)  # Precision: higher is better
results_df['Recall_rank'] = results_df['Recall'].rank(ascending=False)  # Recall: higher is better
results_df['F1_rank'] = results_df['F1 Score'].rank(ascending=False)  # F1 Score: higher is better

# Calculate total rank
results_df['Total_rank'] = results_df[['Accuracy_rank', 'Precision_rank', 'Recall_rank', 'F1_rank']].mean(axis=1)

# Select the top 5 combinations by overall rank
top_5_overall = results_df.nsmallest(5, 'Total_rank')

# Print the results
print("Top 5 Combinations by Overall Rank:")
print(top_5_overall)



Top 5 Combinations by Overall Rank:
                Dataset                       Model  Accuracy  Precision  \
18       Normalizer_IQR      RandomForestClassifier  0.539919   0.536365   
21   StandardScaler_IQR      DecisionTreeClassifier  0.534506   0.533239   
22   StandardScaler_IQR      RandomForestClassifier  0.535859   0.531831   
17       Normalizer_IQR      DecisionTreeClassifier  0.527740   0.524488   
11  MinMaxScaler_ZScore  GradientBoostingClassifier  0.507363   0.503708   

      Recall  F1 Score  Accuracy_rank  Precision_rank  Recall_rank  F1_rank  \
18  0.539919  0.533849            1.0             1.0          1.0      1.0   
21  0.534506  0.529345            3.0             2.0          3.0      2.0   
22  0.535859  0.527997            2.0             3.0          2.0      3.0   
17  0.527740  0.522468            4.0             4.0          4.0      4.0   
11  0.507363  0.495151            7.5             7.5          7.5     11.5   

    Total_rank  
18         1.0 