<a href="https://colab.research.google.com/github/Reno20/EMPLOYEE_SALARY_PREDICTION/blob/main/Model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing all libraries

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Import all models to be tested
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier


Load and Clean data

In [None]:
try:
    col_names = [
        'age', 'workclass', 'fnlwgt', 'education', 'educational-num', 'marital-status',
        'occupation', 'relationship', 'race', 'gender', 'capital-gain', 'capital-loss',
        'hours-per-week', 'native-country', 'income'
    ]

    data = pd.read_csv('adult.csv', header=None, names=col_names, na_values='?', skipinitialspace=True)

    data.dropna(inplace=True)

    # Drop the 'education' column as 'educational-num' represents the same information numerically
    data.drop('education', axis=1, inplace=True)

    print("Data loaded and cleaned successfully.")
    print(f"Data shape after cleaning: {data.shape}")

except FileNotFoundError:
    print("Error: 'adult.csv' not found.")
    exit()

Data loaded and cleaned successfully.
Data shape after cleaning: (45223, 14)


In [None]:
data

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
1,25,Private,226802,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
2,38,Private,89814,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
3,28,Local-gov,336951,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
4,44,Private,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48838,27,Private,257302,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48839,40,Private,154374,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48840,58,Private,151910,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48841,22,Private,201490,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


Prepare & preprocess data


In [None]:
#Converting columns to their correct data types ---
numerical_cols_to_convert = [
    'age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week'
]

# Remove the header row before converting to numeric
data = data.iloc[1:]

for col in numerical_cols_to_convert:
    data[col] = pd.to_numeric(data[col])

print("\nCorrected column data types.")


Corrected column data types.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = pd.to_numeric(data[col])


In [None]:
X = data.drop('income', axis=1)
y = data['income'].apply(lambda x: 1 if x == '>50K' else 0) # Encode target to 0 and 1

# Automatically identify numerical and categorical features
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"\nIdentified {len(numerical_features)} numerical features: {numerical_features}")
print(f"Identified {len(categorical_features)} categorical features: {categorical_features}")


Identified 6 numerical features: ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Identified 7 categorical features: ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']


In [None]:
# Create the master preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nData split into training and testing sets.")
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")


Data split into training and testing sets.
Training set size: 36177
Testing set size: 9045


Train models

In [None]:

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "MLPClassifier": MLPClassifier(max_iter=1000, random_state=42, early_stopping=True),
    "RandomForestClassifier": RandomForestClassifier(random_state=42),
    "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=42),
    "SVC": SVC(random_state=42)
}

best_model = None
best_accuracy = 0.0

# Loop through each model to build a pipeline, train, and evaluate
for name, model in models.items():
    # Create the full pipeline: Preprocessor -> Classifier
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    print(f"\n--- Training {name} ---")
    # Train the model
    full_pipeline.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = full_pipeline.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f" Accuracy for {name}: {accuracy:.4f}")

    # Print a detailed classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))


    # Check if this is the best model so far
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = full_pipeline
        best_model_name = name

print("\n" + "="*50)
print(f" Best Model Found: {best_model_name}")
print(f" Best Accuracy: {best_accuracy:.4f}")
print("="*50)


--- Training LogisticRegression ---
✅ Accuracy for LogisticRegression: 0.8465
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.93      0.90      6803
           1       0.73      0.60      0.66      2242

    accuracy                           0.85      9045
   macro avg       0.80      0.76      0.78      9045
weighted avg       0.84      0.85      0.84      9045


--- Training KNeighborsClassifier ---
✅ Accuracy for KNeighborsClassifier: 0.8259
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      6803
           1       0.67      0.59      0.62      2242

    accuracy                           0.83      9045
   macro avg       0.77      0.75      0.76      9045
weighted avg       0.82      0.83      0.82      9045


--- Training MLPClassifier ---
✅ Accuracy for MLPClassifier: 0.8489
Classification Report:
              precision    recall  f1-score   

Parameters: { "use_label_encoder" } are not used.



✅ Accuracy for XGBClassifier: 0.8693
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91      6803
           1       0.77      0.67      0.72      2242

    accuracy                           0.87      9045
   macro avg       0.83      0.80      0.82      9045
weighted avg       0.87      0.87      0.87      9045


--- Training GradientBoostingClassifier ---
✅ Accuracy for GradientBoostingClassifier: 0.8587
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      6803
           1       0.78      0.60      0.68      2242

    accuracy                           0.86      9045
   macro avg       0.83      0.77      0.79      9045
weighted avg       0.85      0.86      0.85      9045


--- Training SVC ---
✅ Accuracy for SVC: 0.8503
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.93      0.

save best model

In [None]:
model_filename = 'best_salary_predictor.pkl'
joblib.dump(best_model, model_filename)

print(f"\n✅ Best model pipeline has been saved to '{model_filename}'.")


✅ Best model pipeline has been saved to 'best_salary_predictor.pkl'.
