In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Path to the raw developer CSV (replace with the actual location of your file)
file_path = 'DeveloperData.csv'
df = pd.read_csv(file_path)

# Preview the first rows to sanity-check the load
preview = df.head()
print(preview)

   Database Fundamentals  Computer Architecture  \
0                      6                      4   
1                      6                      5   
2                      6                      1   
3                      6                      0   
4                      6                      3   

   Distributed Computing Systems  Cyber Security  Networking  \
0                              4               4           4   
1                              4               4           4   
2                              4               4           4   
3                              4               4           4   
4                              4               4           4   

   Software Development  Programming Skills  Project Management  \
0                     4                   4                   4   
1                     4                   4                   4   
2                     4                   4                   4   
3                     4                 

In [3]:
# Give every record a sequential identifier starting at 1
df['ID'] = range(1, len(df) + 1)

# Move 'ID' to the front; the remaining columns keep their existing order
other_columns = [name for name in df.columns if name != 'ID']
df = df[['ID', *other_columns]]
print(df.head())

   ID  Database Fundamentals  Computer Architecture  \
0   1                      6                      4   
1   2                      6                      5   
2   3                      6                      1   
3   4                      6                      0   
4   5                      6                      3   

   Distributed Computing Systems  Cyber Security  Networking  \
0                              4               4           4   
1                              4               4           4   
2                              4               4           4   
3                              4               4           4   
4                              4               4           4   

   Software Development  Programming Skills  Project Management  \
0                     4                   4                   4   
1                     4                   4                   4   
2                     4                   4                   4   
3               

In [4]:
# Trait columns, in the order their H/L flags appear in the personality code.
# NOTE(review): the training cell further down reads 'Openness to Change'
# rather than 'Openness' — confirm which column name the CSV actually contains.
PERSONALITY_TRAIT_COLUMNS = (
    'Openness',
    'Conscientousness',  # spelling follows the dataset column
    'Extraversion',
    'Agreeableness',
    'Emotional_Range',
)


def personality_type(row, medians=None):
    """Encode one row's five traits as a string of 'H'/'L' flags (e.g. 'HHLLH').

    A trait is flagged 'H' when the row's value is strictly greater than that
    trait's median, and 'L' otherwise (ties count as 'L').

    Parameters:
    row: Mapping/Series indexable by the names in PERSONALITY_TRAIT_COLUMNS.
    medians: Optional mapping from trait column name to its median. When
        omitted, the medians are computed from the notebook-level ``df`` on
        every call, which is slow when applied row by row; pass precomputed
        medians to avoid that.

    Returns: Five-character string, one flag per trait in column order.
    """
    if medians is None:
        # Backward-compatible fallback: same per-column medians the original used
        medians = {column: df[column].median() for column in PERSONALITY_TRAIT_COLUMNS}
    return ''.join(
        'H' if row[column] > medians[column] else 'L'
        for column in PERSONALITY_TRAIT_COLUMNS
    )


# Compute each median once up front instead of once per row per trait
trait_medians = {column: df[column].median() for column in PERSONALITY_TRAIT_COLUMNS}
df['PersonalityType'] = df.apply(personality_type, axis=1, args=(trait_medians,))

In [5]:
# Write the frame (now carrying the ID and PersonalityType columns) to disk;
# index=False keeps the DataFrame index out of the file
df.to_csv(path_or_buf="DeveloperDataset.csv", index=False)
print("Dataset Saved")

Dataset Saved


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import joblib
import pickle

# Read back the dataset written by the preparation cells
df = pd.read_csv('DeveloperDataset.csv')

# Model inputs: the skill columns followed by the selected personality traits
skill_features = [
    'Database Fundamentals', 'Computer Architecture',
    'Distributed Computing Systems', 'Cyber Security',
    'Networking', 'Software Development',
    'Programming Skills', 'Project Management',
    'Computer Forensics Fundamentals',
]
trait_features = [
    'Openness to Change', 'Conscientousness', 'Extraversion',
    'Agreeableness', 'Emotional_Range',
]
selected_features = skill_features + trait_features

X = df[selected_features]
y = df['Role']

# Echo the feature set so the output records exactly what the model was given
print("Selected features used for training:")
print(X.columns.tolist())
print(f"\nTotal features: {len(X.columns)}")

# Map the role names to integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit(y).transform(y)

# Hold out 20% of the rows for testing, stratified on the encoded role
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Evaluation model: SVM with probability estimates enabled
svm_model = SVC(probability=True, random_state=42).fit(X_train_scaled, y_train)

# Score both splits to compare training and held-out accuracy
y_train_pred = svm_model.predict(X_train_scaled)
y_test_pred = svm_model.predict(X_test_scaled)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("\nSVM Model Evaluation (Selected Features)")
print("=" * 60)
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_))

# Deployment model: a fresh scaler and SVM fitted on every row
final_scaler = StandardScaler().fit(X)
X_final_scaled = final_scaler.transform(X)
final_svm_model = SVC(probability=True, random_state=42).fit(X_final_scaled, y_encoded)

# Column order the prediction helpers expect their inputs in
selected_feature_names = list(X.columns)

# Prediction function for SVM with selected features
def predict_role_svm_selected_features(feature_data):
    """
    Predict role using the in-memory SVM model with selected features only

    Parameters:
    feature_data: List or array with feature values in this exact order:
        [Database Fundamentals, Computer Architecture, Distributed Computing Systems,
         Cyber Security, Networking, Software Development, Programming Skills,
         Project Management, Computer Forensics Fundamentals,
         Openness to Change, Conscientousness, Extraversion, Agreeableness, Emotional_Range]
        A 2-D input (several rows) is accepted, but only its first row is reported.

    Returns: Dictionary with keys 'predicted_role', 'confidence',
        'probabilities' (class name -> probability) and 'features_used'

    Raises:
        ValueError: if the number of values per row differs from the number
            of selected features.
    """
    # Promote scalars and flat sequences to a single-row 2-D array so the
    # feature-count check below always has a second axis to inspect
    input_data = np.atleast_2d(np.asarray(feature_data))

    # Check if input has correct number of features
    if input_data.shape[1] != len(selected_feature_names):
        raise ValueError(f"Expected {len(selected_feature_names)} features, got {input_data.shape[1]}")

    # Scale with the scaler that was fitted alongside the final model
    input_scaled = final_scaler.transform(input_data)

    # NOTE: with SVC(probability=True), scikit-learn documents that
    # predict_proba may be inconsistent with predict, so the predicted role is
    # not guaranteed to be the class with the highest probability.
    prediction = final_svm_model.predict(input_scaled)
    prediction_proba = final_svm_model.predict_proba(input_scaled)

    # Report the first row only. The confidence is taken from that same row;
    # a maximum over the whole matrix would mix in other rows of a batch.
    predicted_role = label_encoder.inverse_transform(prediction)[0]
    row_probabilities = prediction_proba[0]
    confidence = np.max(row_probabilities)

    return {
        'predicted_role': predicted_role,
        'confidence': float(confidence),
        'probabilities': dict(zip(label_encoder.classes_, row_probabilities)),
        'features_used': selected_feature_names
    }

# Display the exact feature order for input. The count in the header is
# derived from the list so the message cannot drift from the real feature set.
print(f"\nFeature order for input ({len(selected_feature_names)} features total):")
for position, feature in enumerate(selected_feature_names, start=1):
    print(f"{position}. {feature}")

# Example: run the first dataset row through the prediction helper.
# The final model was fitted on all rows, including this one, so this is a
# smoke test of the helper rather than an accuracy estimate.
sample_row = X.iloc[0].values  # Get first row's selected features
prediction = predict_role_svm_selected_features(sample_row)

print("\nExample Prediction with Selected Features:")
print(f"Actual Role: {df['Role'].iloc[0]}")
print(f"Predicted Role: {prediction['predicted_role']}")
print(f"Confidence: {prediction['confidence']:.3f}")

# Bundle everything needed to reproduce a prediction outside this notebook:
# the fitted model, its scaler, the label encoder and the feature/class names
model_data = {
    'model': final_svm_model,
    'scaler': final_scaler,
    'label_encoder': label_encoder,
    'feature_names': selected_feature_names,
    'target_names': label_encoder.classes_.tolist()
}

# Save using pickle to create .pkl file
with open('svm_role_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("\nSVM model with selected features saved as 'svm_role_model.pkl'")

# Function to load and use the saved model from .pkl file
def load_svm_model_selected_features(model_path='svm_role_model.pkl'):
    """Load the saved SVM model bundle from a .pkl file.

    Parameters:
    model_path: Path of the pickle file to read. Defaults to
        'svm_role_model.pkl', the file this notebook writes when it saves
        the model.

    Returns: The unpickled object (the notebook saves a dict holding the
        model, scaler, label encoder and feature/target names), or None if
        the file could not be opened or unpickled. The error is printed
        rather than raised.
    """
    try:
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file; only load model files from a trusted source.
        with open(model_path, 'rb') as f:
            return pickle.load(f)
    except Exception as e:
        # Best-effort contract: report the problem and let the caller check for None
        print(f"Error loading model: {e}")
        return None

def predict_with_saved_svm_selected_features(new_features, model_path='svm_role_model.pkl'):
    """Make predictions using the saved SVM model with selected features.

    Parameters:
    new_features: List or array of feature values, in the order stored under
        'feature_names' in the saved bundle. A 2-D input is accepted, but only
        its first row is reported.
    model_path: Path of the pickled model bundle. Defaults to
        'svm_role_model.pkl', the file this notebook writes when it saves
        the model.

    Returns: Dictionary with keys 'role', 'confidence', 'all_probabilities'
        (class name -> probability) and 'features_used', or None if the
        model bundle could not be loaded.

    Raises:
        ValueError: if the number of values per row differs from the number
            of feature names stored in the bundle.
    """
    model_data = load_svm_model_selected_features(model_path)
    if model_data is None:
        return None

    # Promote scalars and flat sequences to a single-row 2-D array so the
    # feature-count check below always has a second axis to inspect
    input_data = np.atleast_2d(np.asarray(new_features))

    expected_count = len(model_data['feature_names'])
    if input_data.shape[1] != expected_count:
        raise ValueError(f"Expected {expected_count} features, got {input_data.shape[1]}")

    # Scale and predict with the objects stored in the bundle
    input_scaled = model_data['scaler'].transform(input_data)
    prediction = model_data['model'].predict(input_scaled)
    probability = model_data['model'].predict_proba(input_scaled)

    # Report the first row only. The confidence is taken from that same row;
    # a maximum over the whole matrix would mix in other rows of a batch.
    first_row_probability = probability[0]

    return {
        'role': model_data['label_encoder'].inverse_transform(prediction)[0],
        'confidence': float(np.max(first_row_probability)),
        'all_probabilities': dict(zip(model_data['target_names'], first_row_probability)),
        'features_used': model_data['feature_names']
    }

# Round-trip check: send a known row through the pickle-backed prediction helper
print("\n" + "=" * 60)
print("TESTING MODEL LOADING FUNCTION")
print("=" * 60)

test_features = X.iloc[1].to_numpy().tolist()  # second row's selected features as a plain list
loaded_prediction = predict_with_saved_svm_selected_features(test_features)
if loaded_prediction:
    # Reached only when the helper returned a result (it returns None if loading failed)
    print(
        f"Actual Role: {df['Role'].iloc[1]}",
        f"Predicted role: {loaded_prediction['role']}",
        f"Confidence: {loaded_prediction['confidence']:.3f}",
        sep="\n",
    )

# Usage notes: how to assemble an input vector for a new prediction
print("\n" + "=" * 60)
print("HOW TO CREATE INPUT FOR NEW PREDICTIONS")
print("=" * 60)

print(
    "Create a list with 14 values in this exact order:",
    "1-9. Skills (0-10 scale): Database Fundamentals, Computer Architecture, ...",
    "10. Openness to Change (0-1 scale)",
    "11. Conscientousness (0-1 scale)",
    "12. Extraversion (0-1 scale)",
    "13. Agreeableness (0-1 scale)",
    "14. Emotional_Range (0-1 scale)",
    sep="\n",
)

# Illustrative input only; it is printed, not fed to the model
example_input = [
    # Skills (9 values, 0-10 scale)
    7, 6, 5, 8, 6, 9, 8, 7, 4,

    # Personality traits (5 values, 0-1 scale)
    0.45, 0.52, 0.38, 0.12, 0.67,
]

print(f"\nExample input: {example_input}")

# Write a second copy of the same bundle with joblib as a backup
joblib.dump(model_data, 'svm_role_model.joblib')
print("\nModel also saved as 'svm_role_model.joblib' for backup")

print("\nSVM model with selected features training and deployment complete!")

Selected features used for training:
['Database Fundamentals', 'Computer Architecture', 'Distributed Computing Systems', 'Cyber Security', 'Networking', 'Software Development', 'Programming Skills', 'Project Management', 'Computer Forensics Fundamentals', 'Openness to Change', 'Conscientousness', 'Extraversion', 'Agreeableness', 'Emotional_Range']

Total features: 14

SVM Model Evaluation (Selected Features)
Training Accuracy: 0.8972
Testing Accuracy: 0.8807

Classification Report:
                                 precision    recall  f1-score   support

               AI ML Specialist       0.69      0.88      0.77       216
                 API Specialist       0.99      1.00      1.00       108
   Application Support Engineer       0.97      1.00      0.99       108
               Business Analyst       0.71      0.44      0.55       108
     Customer Service Executive       0.98      0.99      0.99       108
      Cyber Security Specialist       0.96      1.00      0.98       108
 

