In [8]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load your dataset
data = pd.read_csv('user_data.csv')

# Drop irrelevant columns and missing values
data = data.drop(['Full Name', 'DOB', 'Email', 'Current City', 'Name of the Institute'], axis=1)
data = data.dropna()

# Encode categorical features
label_encoder = LabelEncoder()
data['Education Level'] = label_encoder.fit_transform(data['Education Level'])
data['Gender'] = label_encoder.fit_transform(data['Gender'])
data['Current Position of the Job'] = label_encoder.fit_transform(data['Current Position of the Job'])
data['Polling Station'] = label_encoder.fit_transform(data['Polling Station'])
data['Experience'] = data['Experience'].str.extract('(\d+)').astype(float)

# Preprocess 'Current Salary' column
data['Current Salary'] = data['Current Salary'].str.replace('[\$,]', '', regex=True).astype(float)

# Encode 'Province', 'District', and 'Polling Division'
data = pd.get_dummies(data, columns=['Province', 'District', 'Polling Division'])

# Split the data into features (X) and target labels (y)
X = data.drop('Position Type', axis=1)
y = data['Position Type']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
clf = RandomForestClassifier()

# Train the model
clf.fit(X_train, y_train)

# Save the trained model to a pickle file
joblib.dump(clf, 'trained_model.pkl')

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Overall Accuracy:", accuracy)

# Prepare a new data instance for prediction (replace with your own values)
new_data = pd.DataFrame({
    'Age': [25],
    'Education Level': label_encoder.transform(['Bachelor\'s Degree'])[0],  # Use the correct label
    'Gender': label_encoder.transform(['Male'])[0],
    'Polling Station': label_encoder.transform(['Station 2'])[0],
    'Current Position of the Job': label_encoder.transform(['Doctor'])[0],
    'Experience': [10],
    'Current Salary': [50000],
    'Province_Central': [0],
    'Province_Eastern': [0],
    'Province_North Central': [0],
    'Province_Northern': [0],
    'Province_North Western': [0],
    'Province_Sabaragamuwa': [0],
    'Province_Southern': [0],
    'Province_Uva': [1],
    'Province_Western': [0],
    'District_Badulla': [0],
    'District_Colombo': [0],
    'District_Galle': [0],
    'District_Jaffna': [0],
    'District_Kandy': [0],
    'District_Matara': [0],
    'District_Mannar': [0],
    'District_Nuraniya': [0],
    'District_Trincomalee': [0],
    'Polling Division_Division 1': [0],
    'Polling Division_Division 2': [1],  # Set to 1 for the desired division
    'Polling Division_Division 3': [0],
    # ... add more divisions as needed ...
})

# Make a prediction
new_prediction = clf.predict(new_data)
print("Predicted Position:", new_prediction)


Classification Report:
                        precision    recall  f1-score   support

                 Clerk       0.92      0.94      0.93        36
Junior Polling Officer       1.00      0.84      0.91        19
Senior Polling Officer       0.96      1.00      0.98        45

              accuracy                           0.95       100
             macro avg       0.96      0.93      0.94       100
          weighted avg       0.95      0.95      0.95       100

Overall Accuracy: 0.95


ValueError: y contains previously unseen labels: 'Bachelors Degree'