In [6]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load your dataset
data = pd.read_csv('user_data.csv')

# Drop irrelevant columns and missing values
data = data.drop(['Full Name', 'DOB', 'Email', 'Current City', 'Name of the Institute'], axis=1)
data = data.dropna()

# Encode categorical features
label_encoder = LabelEncoder()
data['Education Level'] = label_encoder.fit_transform(data['Education Level'])
data['Gender'] = label_encoder.fit_transform(data['Gender'])
data['Current Position of the Job'] = label_encoder.fit_transform(data['Current Position of the Job'])
data['Polling Station'] = label_encoder.fit_transform(data['Polling Station'])
data['Experience'] = data['Experience'].str.extract('(\d+)').astype(float)

# Preprocess 'Current Salary' column
data['Current Salary'] = data['Current Salary'].str.replace('[\$,]', '', regex=True).astype(float)

# Split the data into features (X) and target labels (y)
X = data.drop('Position Type', axis=1)
y = data['Position Type']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
clf = RandomForestClassifier()

# Train the model
clf.fit(X_train, y_train)

# Save the trained model to a pickle file
joblib.dump(clf, 'trained_model.pkl')


# Make predictions
y_pred_probs = clf.predict_proba(X_test)

# Define thresholds for different categories
senior_threshold = 0.7
junior_threshold = 0.3

# Classify predictions into different categories
senior_predictions = (y_pred_probs[:, 2] >= senior_threshold).astype(int)
junior_predictions = (y_pred_probs[:, 1] >= junior_threshold).astype(int)
clerk_predictions = (y_pred_probs[:, 0] < junior_threshold).astype(int)

# Evaluate the model
print("Senior Classification Report:")
print(classification_report(y_test.map(lambda x: int(x == 'Senior')), senior_predictions))

print("Junior Classification Report:")
print(classification_report(y_test.map(lambda x: int(x == 'Junior')), junior_predictions))

print("Clerk Classification Report:")
print(classification_report(y_test.map(lambda x: int(x == 'Clerk')), clerk_predictions))

# Prepare a new data instance for prediction (replace with your own values)
new_data = pd.DataFrame({
    'Age': [25],
    'Education Level': [3],
    'Gender': [1],
    'Polling Station': [2],
    'Current Position of the Job': [1],
    'Experience': [10],
    'Current Salary': [50000]
})

# Make a prediction
new_prediction_probs = clf.predict_proba(new_data)
if new_prediction_probs[0, 2] >= senior_threshold:
    new_prediction = "Senior"
elif new_prediction_probs[0, 1] >= junior_threshold:
    new_prediction = "Junior"
else:
    new_prediction = "Clerk"

print("Predicted Position:", new_prediction)

from sklearn.metrics import accuracy_score

# Make predictions
y_pred = clf.predict(X_test)

# Calculate overall accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print("Overall Accuracy:", accuracy)



Senior Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.64      0.78       100
           1       0.00      0.00      0.00         0

    accuracy                           0.64       100
   macro avg       0.50      0.32      0.39       100
weighted avg       1.00      0.64      0.78       100

Junior Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.75      0.86       100
           1       0.00      0.00      0.00         0

    accuracy                           0.75       100
   macro avg       0.50      0.38      0.43       100
weighted avg       1.00      0.75      0.86       100

Clerk Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      60.0
           1       0.00      0.00      0.00      40.0

    accuracy                           0.00     100.0
   macro avg       0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
