In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

file_path = 'clean_upd-2.xlsx'
data = pd.read_excel(file_path)

age_categories = {'18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '55-64': 5, '65 or older': 6}
data['Age'] = data['Age'].map(age_categories)


scaler = StandardScaler()
data['Salary_Z'] = scaler.fit_transform(data[['Salary']])

# remove outliers
data_filtered = data[(np.abs(stats.zscore(data['Salary'])) < 3)]

data_preprocessed = pd.get_dummies(data_filtered, columns=['Industry', 'Country', 'Years job', 'Highest education', 'Gender', 'Race'], drop_first=True)

data_preprocessed_clean = data_preprocessed.dropna()

X_clean = data_preprocessed_clean.drop(['Salary', 'Salary_Z', 'Job Title'], axis=1)
y_clean = data_preprocessed_clean['Salary_Z']


X_train_clean, X_val_clean, y_train_clean, y_val_clean = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

# models to try out
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForest': RandomForestRegressor()
}

# fitting
results = []
for name, model in models.items():
    model.fit(X_train_clean, y_train_clean)
    y_pred_clean = model.predict(X_val_clean)
    mse_clean = mean_squared_error(y_val_clean, y_pred_clean)
    r2_clean = r2_score(y_val_clean, y_pred_clean)
    results.append({'Model': name, 'MSE': mse_clean, 'R2': r2_clean})


results_df = pd.DataFrame(results)
scaler_metrics = MinMaxScaler()
results_df[['MSE', 'R2']] = scaler_metrics.fit_transform(results_df[['MSE', 'R2']])


print(results_df)


              Model       MSE        R2
0  LinearRegression  0.005815  0.994185
1             Ridge  0.000000  1.000000
2             Lasso  1.000000  0.000000
3      RandomForest  0.400120  0.599880


In [12]:
X_train_clean.head()

Unnamed: 0,Age,Job Title Cluster,Industry_AGRICULTURE OR FORESTRY,Industry_ART & DESIGN,Industry_BUSINESS OR CONSULTING,Industry_COMPUTING OR TECH,Industry_EDUCATION (HIGHER EDUCATION),Industry_EDUCATION (PRIMARY/SECONDARY),Industry_ENGINEERING OR MANUFACTURING,Industry_ENTERTAINMENT,...,"Race_Hispanic, Latino, or Spanish origin, White",Race_Middle Eastern or Northern African,"Race_Middle Eastern or Northern African, Native American or Alaska Native, White","Race_Middle Eastern or Northern African, White","Race_Middle Eastern or Northern African, White, Another option not listed here or prefer not to answer",Race_Native American or Alaska Native,"Race_Native American or Alaska Native, White","Race_Native American or Alaska Native, White, Another option not listed here or prefer not to answer",Race_White,"Race_White, Another option not listed here or prefer not to answer"
181,2.0,13,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1072,2.0,35,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
9991,2.0,17,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
9260,2.0,6,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
16507,2.0,34,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
y_train.head()

19726    0.491690
13342   -0.022379
5547    -0.231501
1783     0.395264
19584   -0.356305
Name: Salary, dtype: float64

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

# Load data
file_path = 'clean_upd-2.xlsx'
data = pd.read_excel(file_path)

# Map age categories to numerical values
age_categories = {'18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '55-64': 5, '65 or older': 6}
data['Age'] = data['Age'].map(age_categories)

# Standardize the salary
scaler = StandardScaler()
data['Salary_Z'] = scaler.fit_transform(data[['Salary']])

# Remove outliers
data_filtered = data[(np.abs(stats.zscore(data['Salary'])) < 3)]

# One-hot encode categorical variables excluding 'Race'
data_preprocessed = pd.get_dummies(data_filtered, columns=['Industry', 'Country', 'Years job', 'Highest education', 'Gender'], drop_first=True)

# Drop rows with any missing values
data_preprocessed_clean = data_preprocessed.dropna()

# Ensure the 'Race' column is excluded
columns_to_drop = ['Salary', 'Salary_Z', 'Job Title', 'Race']
X_clean = data_preprocessed_clean.drop(columns=columns_to_drop, errors='ignore')
y_clean = data_preprocessed_clean['Salary_Z']

# Split data into training and validation sets
X_train_clean, X_val_clean, y_train_clean, y_val_clean = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

# Define models
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForest': RandomForestRegressor()
}

# Fit models and evaluate performance
results = []
for name, model in models.items():
    model.fit(X_train_clean, y_train_clean)
    y_pred_clean = model.predict(X_val_clean)
    mse_clean = mean_squared_error(y_val_clean, y_pred_clean)
    r2_clean = r2_score(y_val_clean, y_pred_clean)
    results.append({'Model': name, 'MSE': mse_clean, 'R2': r2_clean})

# Scale results to compare performance
results_df = pd.DataFrame(results)
scaler_metrics = MinMaxScaler()
results_df[['MSE', 'R2']] = scaler_metrics.fit_transform(results_df[['MSE', 'R2']])

# Output results
print(results_df)


              Model       MSE        R2
0  LinearRegression  0.005060  0.994940
1             Ridge  0.000000  1.000000
2             Lasso  1.000000  0.000000
3      RandomForest  0.427989  0.572011


In [None]:
# avoid overfitting by using cross-validation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats


file_path = 'clean_upd-2.xlsx'
data = pd.read_excel(file_path)


age_categories = {'18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '55-64': 5, '65 or older': 6}
data['Age'] = data['Age'].map(age_categories)


scaler = StandardScaler()
data['Salary_Z'] = scaler.fit_transform(data[['Salary']])


data_filtered = data[(np.abs(stats.zscore(data['Salary'])) < 3)]

# exclude Race
data_preprocessed = pd.get_dummies(data_filtered, columns=['Industry', 'Country', 'Years job', 'Highest education', 'Gender'], drop_first=True)


data_preprocessed_clean = data_preprocessed.dropna()


columns_to_drop = ['Salary', 'Salary_Z', 'Job Title', 'Race']
X_clean = data_preprocessed_clean.drop(columns=columns_to_drop, errors='ignore')
y_clean = data_preprocessed_clean['Salary_Z']


models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor()
}

# k-fold validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
results = []
for name, model in models.items():
    mse_scores = cross_val_score(model, X_clean, y_clean, cv=kf, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(model, X_clean, y_clean, cv=kf, scoring='r2')
    results.append({
        'Model': name,
        'MSE': -np.mean(mse_scores),
        'R2': np.mean(r2_scores)
    })

results_df = pd.DataFrame(results)
scaler_metrics = MinMaxScaler()
results_df[['MSE', 'R2']] = scaler_metrics.fit_transform(results_df[['MSE', 'R2']])


print(results_df)


In [17]:
# avoid overfitting by using cross-validation

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats


file_path = 'clean_upd-2.xlsx'
data = pd.read_excel(file_path)

age_categories = {'18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '55-64': 5, '65 or older': 6}
data['Age'] = data['Age'].map(age_categories)

scaler = StandardScaler()
data['Salary_Z'] = scaler.fit_transform(data[['Salary']])

# remove outliers
data_filtered = data[(np.abs(stats.zscore(data['Salary'])) < 3)]

# exclude Race
data_preprocessed = pd.get_dummies(data_filtered, columns=['Industry', 'Country', 'Years job', 'Highest education', 'Gender'], drop_first=True)

data_preprocessed_clean = data_preprocessed.dropna()

columns_to_drop = ['Salary', 'Salary_Z', 'Job Title', 'Race']
X_clean = data_preprocessed_clean.drop(columns=columns_to_drop, errors='ignore')
y_clean = data_preprocessed_clean['Salary_Z']

models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'SVR': SVR(),
    'XGBoost': XGBRegressor()
}

# k-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
results = []
for name, model in models.items():
    mse_scores = cross_val_score(model, X_clean, y_clean, cv=kf, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(model, X_clean, y_clean, cv=kf, scoring='r2')
    results.append({
        'Model': name,
        'MSE': -np.mean(mse_scores),
        'R2': np.mean(r2_scores)
    })


results_df = pd.DataFrame(results)
scaler_metrics = MinMaxScaler()
results_df[['MSE', 'R2']] = scaler_metrics.fit_transform(results_df[['MSE', 'R2']])


print(results_df)


              Model       MSE        R2
0  LinearRegression  0.067076  0.934843
1      RandomForest  1.000000  0.000000
2  GradientBoosting  0.060171  0.940691
3               SVR  0.546657  0.465034
4           XGBoost  0.000000  1.000000


In [None]:
# with feature importance
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats
import matplotlib.pyplot as plt

file_path = 'clean_upd-2.xlsx'
data = pd.read_excel(file_path)


age_categories = {'18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '55-64': 5, '65 or older': 6}
data['Age'] = data['Age'].map(age_categories)


scaler = StandardScaler()
data['Salary_Z'] = scaler.fit_transform(data[['Salary']])

# remove outliers
data_filtered = data[(np.abs(stats.zscore(data['Salary'])) < 3)]

data_preprocessed = pd.get_dummies(data_filtered, columns=['Industry', 'Country', 'Years job', 'Highest education', 'Gender'], drop_first=True)

data_preprocessed_clean = data_preprocessed.dropna()

columns_to_drop = ['Salary', 'Salary_Z', 'Job Title', 'Race']
X_clean = data_preprocessed_clean.drop(columns=columns_to_drop, errors='ignore')
y_clean = data_preprocessed_clean['Salary_Z']


models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'SVR': SVR(),
    'XGBoost': XGBRegressor()
}

# k-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
results = []
feature_importances = {}

for name, model in models.items():
    mse_scores = cross_val_score(model, X_clean, y_clean, cv=kf, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(model, X_clean, y_clean, cv=kf, scoring='r2')
    results.append({
        'Model': name,
        'MSE': -np.mean(mse_scores),
        'R2': np.mean(r2_scores)
    })
    # get feature importance
    model.fit(X_clean, y_clean)
    if hasattr(model, 'feature_importances_'):
        feature_importances[name] = model.feature_importances_


results_df = pd.DataFrame(results)
scaler_metrics = MinMaxScaler()
results_df[['MSE', 'R2']] = scaler_metrics.fit_transform(results_df[['MSE', 'R2']])


print(results_df)

# feature importance graph
for model_name, importances in feature_importances.items():
    indices = np.argsort(importances)[::-1]
    plt.figure(figsize=(10, 6))
    plt.title(f'Feature Importances for {model_name}')
    plt.bar(range(X_clean.shape[1]), importances[indices], align='center')
    plt.xticks(range(X_clean.shape[1]), X_clean.columns[indices], rotation=90)
    plt.tight_layout()
    plt.show()
