# In [16]:  (Jupyter notebook cell prompt)
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder

# Load the raw candidate records.
data = pd.read_csv('user_data.csv')

# Drop the columns that are not used as features, then discard incomplete
# rows so the encoders and numeric conversions below only see real values.
data = data.drop(['Full Name', 'DOB', 'Email', 'Current City', 'Name of the Institute'], axis=1)
data = data.dropna()

# Encode categorical features as integers.
# Each column gets its OWN LabelEncoder, fitted exactly once on the original
# string values, so every mapping stays available afterwards.
label_encoders = {}
for column in ['Education Level', 'Gender', 'Current Position of the Job', 'Polling Station']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# predict_suitable_candidates() uses the global `label_encoder` to translate a
# station name into its integer code, so it must be the encoder that was
# fitted on the station *names*.  Fitting a second encoder on the already
# encoded integers leaves the column values unchanged but makes the encoder
# know only integers, and every station name is then rejected with
# "y contains previously unseen labels".
label_encoder = label_encoders['Polling Station']

# Keep only the leading number of the 'Experience' text.
# Raw string: '\d' is not a valid escape sequence in a normal string literal.
# expand=False returns a Series (one capture group) instead of a DataFrame.
data['Experience'] = data['Experience'].str.extract(r'(\d+)', expand=False).astype(float)

# Preprocess 'Current Salary' column: strip '$' and thousands separators.
data['Current Salary'] = data['Current Salary'].str.replace(r'[\$,]', '', regex=True).astype(float)

# One-hot encode 'Province', 'District', and 'Polling Division'; this creates
# columns named '<original column>_<value>'.
data = pd.get_dummies(data, columns=['Province', 'District', 'Polling Division'])

# Split the data into features (X) and target labels (y)
X = data.drop('Position Type', axis=1)
y = data['Position Type']

# Load the trained model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
clf = joblib.load('trained_model.pkl')

def predict_suitable_candidates(province, district, polling_division, polling_station, position_type):
    """Predict a position type for a location and list candidates from elsewhere.

    A single feature row is built for the given location, the trained model
    ``clf`` predicts a position type for it, and the rows of ``data`` that
    have this position type but do NOT match the location on every criterion
    are selected.

    Relies on the module-level objects ``X``, ``data``, ``clf`` and
    ``label_encoder`` (the encoder fitted on the 'Polling Station' names).

    Args:
        province: Province value; matched against the 'Province_<value>'
            dummy column.
        district: District value; matched against 'District_<value>'.
        polling_division: Polling division value; matched against
            'Polling Division_<value>'.
        polling_station: Station identifier; it is looked up in the encoder
            as ``f"Station {polling_station}"``.
        position_type: Not used by the computation; kept so existing callers
            keep working.
            NOTE(review): possibly meant to replace the model prediction as
            the filter value - confirm the intent.

    Returns:
        A list with one entry per selected candidate row, holding that row's
        'Position Type' value.
        NOTE(review): every entry therefore equals the predicted position;
        returning the row indices may be what was intended - confirm.

    Raises:
        ValueError: If the polling station is unknown to ``label_encoder``.
    """
    # Encode the Polling Station value using the same label encoder that
    # encoded the training data; re-raise with the actual cause spelled out.
    station_label = f"Station {polling_station}"
    try:
        polling_station_encoded = label_encoder.transform([station_label])[0]
    except ValueError as err:
        raise ValueError(
            f"Unknown polling station {station_label!r}: it is not among the "
            "labels the polling-station encoder was fitted on"
        ) from err

    location_dummies = (
        'Province_' + province,
        'District_' + district,
        'Polling Division_' + polling_division,
    )

    # Build exactly one feature row.  Assigning scalars to the columns of an
    # empty DataFrame would create columns but no rows, so start from a dict
    # holding 0 for every model feature and switch the matching dummies on.
    feature_row = dict.fromkeys(X.columns, 0)
    for dummy_column in location_dummies:
        # Values never seen in the data have no dummy column; the model has
        # no such feature, so they are simply left out.
        if dummy_column in feature_row:
            feature_row[dummy_column] = 1
    feature_row['Polling Station'] = polling_station_encoded

    # Column order must match the features the model was trained on.
    input_data = pd.DataFrame([feature_row], columns=X.columns)

    # predict() returns an array with one label per input row; take the
    # scalar so the comparison below broadcasts over all candidates.
    predicted_position = clf.predict(input_data)[0]

    has_predicted_position = data['Position Type'] == predicted_position

    # A candidate is "from the same place" only if ALL four criteria match.
    same_location = data['Polling Station'] == polling_station_encoded
    for dummy_column in location_dummies:
        if dummy_column in data.columns:
            same_location &= data[dummy_column] == 1
        else:
            # No candidate comes from a province/district/division that does
            # not occur in the data at all.
            same_location &= False

    # Keep candidates who differ from the requested location on at least one
    # criterion.
    selected = has_predicted_position & ~same_location
    return data.loc[selected, 'Position Type'].tolist()

# Example usage
province = 'SampleProvince'
district = 'SampleDistrict'
polling_division = 'SamplePollingDivision'
# Placeholder: the function looks up f"Station {polling_station}", so replace
# this with a station that actually occurs in user_data.csv.
polling_station = 'SampleStation'
position_type = 'senior'  # 'junior' or 'clerk'
suitable_candidates = predict_suitable_candidates(province, district, polling_division, polling_station, position_type)

print(f'Suitable {position_type.capitalize()} Candidates:', suitable_candidates)


# Error raised by the original notebook run of the cell above:
# ValueError: y contains previously unseen labels: 'Station SampleStation'