# Nigerian Graduate Employment Salary Prediction

## Mission
To analyze the factors affecting graduate employment in Nigeria and predict employment outcomes based on education, demographics, and socioeconomic background, helping young Africans make informed career and education decisions.

## Problem Statement
Many Nigerian graduates struggle to find well-paying jobs despite their education. Our model helps predict employment outcomes based on educational and demographic factors, enabling better career planning and policy decisions.

## Solution
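An AI-powered prediction system that estimates graduate salary, helping young Africans make informed education and career choices while identifying factors that improve employment outcomes. Note that this notebook trains salary-regression models on employed graduates only; it does not itself estimate employment probability.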

## 1. Import Required Libraries

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter presumably also hides scikit-learn
# convergence warnings from the SGD fits further down — confirm that is intended.
warnings.filterwarnings('ignore')

# Set plotting style
# 'ggplot' is a built-in matplotlib style sheet; "husl" is a seaborn colour palette.
plt.style.use('ggplot')
sns.set_palette("husl")

## 2. Data Loading and Initial Exploration

In [None]:
# Load the dataset
# The path is relative, so the CSV is looked up in the current working directory.
df = pd.read_csv('Nigerian_Graduate_Survey_with_Salary.csv')

print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
# Bare expression: it is rendered as the cell output only when run in a notebook.
df.head()

In [None]:
# Basic information about the dataset
print("Dataset Info:")
# info() writes its report straight to stdout, so it needs no print().
df.info()
print("\nDataset Description:")
# Bare expression shown as the cell output. By default describe() reports only
# the numeric columns when the frame mixes numeric and non-numeric dtypes.
df.describe()

In [None]:
# Report how many entries are missing in each column
missing_per_column = df.isnull().sum()
print("Missing Values:")
print(missing_per_column)

# List the distinct values found in every object-typed (string) column
print("\nUnique values in categorical columns:")
categorical_cols = df.select_dtypes(include=['object']).columns
for col_name, col_values in df[categorical_cols].items():
    print(f"{col_name}: {col_values.unique()}")

## 3. Data Visualization and Analysis

In [None]:
# Overview figure: the regression target plus the two status/level columns
plt.figure(figsize=(15, 5))

# Panel 1: histogram of the target variable (Net Salary)
plt.subplot(1, 3, 1)
plt.hist(df['Net_Salary'], bins=50, alpha=0.7, color='skyblue')
plt.title('Distribution of Net Salary')
plt.xlabel('Net Salary')
plt.ylabel('Frequency')

# Panels 2 and 3: bar charts of category counts, described as
# (column, title, bar colour) so both panels share one drawing routine.
count_panels = [
    ('Employment_Status', 'Employment Status Distribution', 'lightcoral'),
    ('Salary_Level', 'Salary Level Distribution', 'lightgreen'),
]
for panel_no, (column, panel_title, bar_color) in enumerate(count_panels, start=2):
    plt.subplot(1, 3, panel_no)
    df[column].value_counts().plot(kind='bar', color=bar_color)
    plt.title(panel_title)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Salary analysis by different factors: a 3x3 grid with one box plot per
# categorical factor and a final scatter plot for Age.
plt.figure(figsize=(20, 15))

# (column, panel title) for the eight box plots, in grid order
boxplot_panels = [
    ('Gender', 'Salary by Gender'),
    ('Field_of_Study', 'Salary by Field of Study'),
    ('University_Type', 'Salary by University Type'),
    ('GPA_or_Class_of_Degree', 'Salary by GPA/Class of Degree'),
    ('Has_Postgrad_Degree', 'Salary by Postgraduate Degree'),
    ('Region', 'Salary by Region'),
    ('Urban_or_Rural', 'Salary by Urban/Rural'),
    ('Household_Income_Bracket', 'Salary by Household Income'),
]

for position, (category, panel_title) in enumerate(boxplot_panels, start=1):
    plt.subplot(3, 3, position)
    sns.boxplot(data=df, x=category, y='Net_Salary')
    plt.title(panel_title)
    plt.xticks(rotation=45)

# Last cell of the grid: Age is numeric, so it gets a scatter plot instead
plt.subplot(3, 3, 9)
sns.scatterplot(data=df, x='Age', y='Net_Salary', alpha=0.6)
plt.title('Salary vs Age')

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis
# First, let's create a correlation matrix for numerical columns
# (the target Net_Salary is included so its correlation with each feature is shown)
numerical_cols = ['Age', 'Years_Since_Graduation', 'Net_Salary']
# DataFrame.corr() computes Pearson correlation unless told otherwise
corr_matrix = df[numerical_cols].corr()

plt.figure(figsize=(8, 6))
# center=0 pins the midpoint of the diverging colormap at zero correlation
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numerical Variables')
plt.show()

## 4. Feature Engineering and Data Preprocessing

In [None]:
# Build the modelling frame as an independent copy of df that keeps only the
# rows whose Employment_Status is exactly 'Employed' (the original comment
# notes that unemployed graduates have 0 salary).
is_employed = df['Employment_Status'] == 'Employed'
df_processed = df.loc[is_employed].copy()
print(f"Dataset after removing unemployed: {df_processed.shape}")

# Columns excluded from the feature set before modelling
columns_to_drop = ['Graduate_ID', 'Employment_Status', 'Salary_Level', 'State_of_Origin']
df_processed = df_processed.drop(columns=columns_to_drop)

print(f"Columns after dropping: {df_processed.columns.tolist()}")

In [None]:
# Integer-encode every categorical column, keeping one fitted encoder per
# column so the same mapping can be reapplied at prediction time.
categorical_columns = [
    'Gender', 'Region', 'Urban_or_Rural', 'Household_Income_Bracket',
    'Field_of_Study', 'University_Type', 'GPA_or_Class_of_Degree', 'Has_Postgrad_Degree',
]
label_encoders = {column: LabelEncoder() for column in categorical_columns}

for col, encoder in label_encoders.items():
    df_processed[col] = encoder.fit_transform(df_processed[col])
    # Show which integer code each original label received
    code_of_label = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    print(f"{col}: {code_of_label}")

print("\nProcessed dataset shape:", df_processed.shape)
df_processed.head()

In [None]:
# Separate the regression target from the predictor columns
target_column = 'Net_Salary'
y = df_processed[target_column]
X = df_processed.drop(columns=[target_column])

print("Features:", X.columns.tolist())
print("Target:", y.name)
print("Features shape:", X.shape)
print("Target shape:", y.shape)

In [None]:
# Split the data
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

In [None]:
# Standardize the features. The scaler learns its statistics from the training
# split only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features standardized successfully")
# Sanity check: per-feature mean and standard deviation of the scaled training data
print("Mean of scaled training features:", X_train_scaled.mean(axis=0))
print("Std of scaled training features:", X_train_scaled.std(axis=0))

## 5. Model Training and Comparison

In [None]:
# Candidate regressors, keyed by the display name used in every later report
models = {
    'Linear Regression': LinearRegression(),
    'SGD Regressor (Gradient Descent)': SGDRegressor(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42, n_estimators=100)
}

# Fit each model and record its train/test metrics and predictions
model_results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Linear-family models get the standardized features; tree-based models
    # get the original (unscaled) ones.
    needs_scaling = 'Linear' in name or 'SGD' in name
    train_inputs, test_inputs = (
        (X_train_scaled, X_test_scaled) if needs_scaling else (X_train, X_test)
    )

    model.fit(train_inputs, y_train)
    y_train_pred = model.predict(train_inputs)
    y_test_pred = model.predict(test_inputs)

    metrics = {
        'train_mse': mean_squared_error(y_train, y_train_pred),
        'test_mse': mean_squared_error(y_test, y_test_pred),
        'train_r2': r2_score(y_train, y_train_pred),
        'test_r2': r2_score(y_test, y_test_pred),
    }

    model_results[name] = {
        'model': model,
        **metrics,
        'y_train_pred': y_train_pred,
        'y_test_pred': y_test_pred,
    }

    print(f"Train MSE: {metrics['train_mse']:.2f}")
    print(f"Test MSE: {metrics['test_mse']:.2f}")
    print(f"Train R²: {metrics['train_r2']:.4f}")
    print(f"Test R²: {metrics['test_r2']:.4f}")

In [None]:
# Tabulate the metrics of every trained model, one row per model.
# Maps each report column to the key it is stored under in model_results.
metric_columns = {
    'Train MSE': 'train_mse',
    'Test MSE': 'test_mse',
    'Train R²': 'train_r2',
    'Test R²': 'test_r2',
}
comparison_df = pd.DataFrame([
    {'Model': model_name, **{label: scores[key] for label, key in metric_columns.items()}}
    for model_name, scores in model_results.items()
])

print("Model Comparison:")
print(comparison_df)

# The best model is the one with the highest R² on the held-out test set
best_row = comparison_df['Test R²'].idxmax()
best_model_name = comparison_df.loc[best_row, 'Model']
print(f"\nBest performing model: {best_model_name}")

In [None]:
# Plot model comparison: a 2x2 figure with MSE bars, R² bars, an SGD loss
# sweep, and actual-vs-predicted for the best model.
plt.figure(figsize=(15, 10))

x_pos = np.arange(len(comparison_df))

# Panels 1-2: grouped train/test bars per model, described as
# (metric column suffix, y-axis label, title).
bar_panels = [
    ('MSE', 'MSE', 'Mean Squared Error Comparison'),
    ('R²', 'R² Score', 'R² Score Comparison'),
]
for panel_no, (metric, y_label, panel_title) in enumerate(bar_panels, start=1):
    plt.subplot(2, 2, panel_no)
    plt.bar(x_pos - 0.2, comparison_df[f'Train {metric}'], 0.4, label=f'Train {metric}', alpha=0.7)
    plt.bar(x_pos + 0.2, comparison_df[f'Test {metric}'], 0.4, label=f'Test {metric}', alpha=0.7)
    plt.xlabel('Models')
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.xticks(x_pos, comparison_df['Model'], rotation=45)
    plt.legend()

# Panel 3: loss curves for SGD (Gradient Descent).
# Each point is an independent refit whose max_iter is capped at that value,
# so the sweep range is defined once and shared by the loop and the x-axis.
# NOTE(review): with SGDRegressor's default stopping tolerance a fit may end
# before reaching max_iter, which would flatten the curve — confirm if a true
# per-epoch curve is wanted.
plt.subplot(2, 2, 3)
iterations = list(range(1, 101, 5))
train_losses = []
test_losses = []

for max_epochs in iterations:
    sgd_temp = SGDRegressor(random_state=42, max_iter=max_epochs)
    sgd_temp.fit(X_train_scaled, y_train)
    train_losses.append(mean_squared_error(y_train, sgd_temp.predict(X_train_scaled)))
    test_losses.append(mean_squared_error(y_test, sgd_temp.predict(X_test_scaled)))

plt.plot(iterations, train_losses, label='Training Loss', marker='o')
plt.plot(iterations, test_losses, label='Test Loss', marker='s')
plt.xlabel('Iterations')
plt.ylabel('MSE')
plt.title('SGD Training vs Test Loss Curve')
plt.legend()

# Panel 4: prediction scatter plot for the best model; the dashed diagonal
# marks where predicted equals actual.
plt.subplot(2, 2, 4)
best_results = model_results[best_model_name]
plt.scatter(y_test, best_results['y_test_pred'], alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Salary')
plt.ylabel('Predicted Salary')
plt.title(f'{best_model_name} - Actual vs Predicted')

plt.tight_layout()
plt.show()

## 6. Linear Regression Analysis and Visualization

In [None]:
# Pull the fitted Linear Regression model and its stored predictions
lr_results = model_results['Linear Regression']
lr_model = lr_results['model']
lr_train_pred = lr_results['y_train_pred']
lr_test_pred = lr_results['y_test_pred']

# Rank features by coefficient magnitude. The model was fitted on standardized
# features, so coefficient sizes are on a common scale.
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': lr_model.coef_})
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values('Abs_Coefficient', ascending=False)
)

# Horizontal bars keep the sign of each coefficient visible
plt.figure(figsize=(12, 8))
plt.barh(feature_importance['Feature'], feature_importance['Coefficient'])
plt.xlabel('Coefficient Value')
plt.title('Linear Regression Feature Importance (Coefficients)')
plt.tight_layout()
plt.show()

print("Feature Importance (Linear Regression):")
print(feature_importance)

In [None]:
# Three diagnostic panels for Linear Regression: raw targets, predicted vs
# actual, and residuals.
plt.figure(figsize=(15, 5))

# Before: targets plotted against a running sample index, training rows first
n_train, n_test = len(y_train), len(y_test)
plt.subplot(1, 3, 1)
plt.scatter(range(n_train), y_train, alpha=0.6, label='Training Data')
plt.scatter(range(n_train, n_train + n_test), y_test, alpha=0.6, label='Test Data')
plt.xlabel('Sample Index')
plt.ylabel('Actual Salary')
plt.title('Before: Raw Data Distribution')
plt.legend()

# After: predictions against actual values; the dashed diagonal spans the
# full salary range of both splits and marks a perfect prediction.
salary_lo = min(y_train.min(), y_test.min())
salary_hi = max(y_train.max(), y_test.max())
plt.subplot(1, 3, 2)
plt.scatter(y_train, lr_train_pred, alpha=0.6, label='Training Predictions')
plt.scatter(y_test, lr_test_pred, alpha=0.6, label='Test Predictions')
plt.plot([salary_lo, salary_hi], [salary_lo, salary_hi], 'r--', lw=2)
plt.xlabel('Actual Salary')
plt.ylabel('Predicted Salary')
plt.title('After: Linear Regression Line')
plt.legend()

# Residuals (actual minus predicted) against the predicted value
residuals_train = y_train - lr_train_pred
residuals_test = y_test - lr_test_pred
plt.subplot(1, 3, 3)
plt.scatter(lr_train_pred, residuals_train, alpha=0.6, label='Training Residuals')
plt.scatter(lr_test_pred, residuals_test, alpha=0.6, label='Test Residuals')
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Salary')
plt.ylabel('Residuals')
plt.title('Residuals Plot')
plt.legend()

plt.tight_layout()
plt.show()

## 7. Save Models and Preprocessing Objects

In [None]:
# Persist the best model, the individual models and the preprocessing objects
best_model = model_results[best_model_name]['model']

# File name -> object to serialize, in the order the files are written
artifacts = {
    # Models
    'best_model.pkl': best_model,
    'linear_regression_model.pkl': model_results['Linear Regression']['model'],
    'decision_tree_model.pkl': model_results['Decision Tree']['model'],
    'random_forest_model.pkl': model_results['Random Forest']['model'],
    # Preprocessing objects
    'scaler.pkl': scaler,
    'label_encoders.pkl': label_encoders,
    # Feature names, in training column order
    'feature_names.pkl': X.columns.tolist(),
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print(f"Best model ({best_model_name}) and preprocessing objects saved successfully!")
print(f"Best model Test R²: {model_results[best_model_name]['test_r2']:.4f}")

## 8. Prediction Function for API

In [None]:
def predict_salary(age, gender, region, urban_or_rural, household_income_bracket,
                   field_of_study, university_type, gpa_or_class_of_degree,
                   has_postgrad_degree, years_since_graduation):
    """
    Predict salary based on input features

    Loads 'best_model.pkl', 'scaler.pkl', 'label_encoders.pkl' and
    'feature_names.pkl' from the current working directory on every call,
    encodes the categorical inputs with the saved encoders, aligns the columns
    to the saved training order and returns the model's prediction.

    Args:
        age: Graduate's age (numeric).
        gender, region, urban_or_rural, household_income_bracket,
        field_of_study, university_type, gpa_or_class_of_degree,
        has_postgrad_degree: Categorical values; each must be a label the
            corresponding saved encoder was fitted on.
        years_since_graduation: Years since graduation (numeric).

    Returns:
        The predicted salary as a float, floored at 0.

    Raises:
        ValueError: If a categorical value is unknown to its saved encoder.
    """
    # Load models and preprocessing objects
    # NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
    model = joblib.load('best_model.pkl')
    scaler = joblib.load('scaler.pkl')
    label_encoders = joblib.load('label_encoders.pkl')
    feature_names = joblib.load('feature_names.pkl')

    # Create a single-row input dataframe
    input_data = pd.DataFrame([{
        'Age': age,
        'Gender': gender,
        'Region': region,
        'Urban_or_Rural': urban_or_rural,
        'Household_Income_Bracket': household_income_bracket,
        'Field_of_Study': field_of_study,
        'University_Type': university_type,
        'GPA_or_Class_of_Degree': gpa_or_class_of_degree,
        'Has_Postgrad_Degree': has_postgrad_degree,
        'Years_Since_Graduation': years_since_graduation,
    }])

    # Encode categorical variables with the encoders fitted during training
    for col, encoder in label_encoders.items():
        try:
            input_data[col] = encoder.transform(input_data[col])
        except ValueError as err:
            # Re-raise with the offending column and the accepted labels
            raise ValueError(
                f"Unknown value {input_data[col].iloc[0]!r} for '{col}'; "
                f"expected one of {list(encoder.classes_)}"
            ) from err

    # Put the columns in the exact order used at training time; the scaler and
    # the tree-based models were fitted on that column layout.
    input_data = input_data[feature_names]

    # Scale features if the loaded model requires it. This is decided from the
    # model's own type rather than a notebook global, so the function also
    # works in a fresh process that only has the saved files.
    if isinstance(model, (LinearRegression, SGDRegressor)):
        prediction = model.predict(scaler.transform(input_data))[0]
    else:
        prediction = model.predict(input_data)[0]

    return max(0.0, float(prediction))  # Ensure non-negative salary

# Test the prediction function
# NOTE(review): every categorical argument must be a label the saved encoders
# were fitted on, otherwise the encoder's transform() raises ValueError —
# confirm that values such as 'South', 'Middle' and 'Federal' occur in the survey CSV.
test_prediction = predict_salary(
    age=25,
    gender='Male',
    region='South',
    urban_or_rural='Urban',
    household_income_bracket='Middle',
    field_of_study='Engineering',
    university_type='Federal',
    gpa_or_class_of_degree='Second Class Upper',
    has_postgrad_degree='Yes',
    years_since_graduation=2
)

# Printed with the naira sign, thousands separators and two decimals
print(f"Test prediction: ₦{test_prediction:,.2f}")

## 9. Model Summary and Insights

In [None]:
# Final report: dataset size, per-model test metrics, best model, strongest
# linear coefficients and the list of saved artifacts.
banner = "=" * 60
print(banner)
print("NIGERIAN GRADUATE EMPLOYMENT SALARY PREDICTION - SUMMARY")
print(banner)

print(f"\nDataset: {df.shape[0]} graduates surveyed")
print(f"Employed graduates used for modeling: {df_processed.shape[0]}")
print(f"Features used: {len(X.columns)}")

print("\nModel Performance Comparison:")
for model_name, results in model_results.items():
    print(f"{model_name}:")
    print(f"  - Test R²: {results['test_r2']:.4f}")
    print(f"  - Test MSE: {results['test_mse']:.2f}")

best_test_r2 = model_results[best_model_name]['test_r2']
print(f"\nBest Model: {best_model_name}")
print(f"Best Model Test R²: {best_test_r2:.4f}")

# feature_importance is already sorted by absolute coefficient, largest first
print("\nTop 3 Most Important Features (Linear Regression):")
for _, row in feature_importance.head(3).iterrows():
    print(f"{row['Feature']}: {row['Coefficient']:.2f}")

print("\nKey Insights:")
key_insights = [
    "- The model can help predict graduate salaries based on educational and demographic factors",
    "- This enables better career planning and policy decisions for Nigerian graduates",
    "- The system addresses the mission of helping young Africans make informed career choices",
]
for insight in key_insights:
    print(insight)

print("\nFiles saved for API deployment:")
saved_files = ["best_model.pkl", "scaler.pkl", "label_encoders.pkl", "feature_names.pkl"]
for saved_file in saved_files:
    print(f"- {saved_file}")