In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib


# Load and preprocess the dataset
data = pd.read_csv('user_data.csv')

# Choose the appropriate columns for features and target
selected_features = ['Education Level', 'Experience', 'Current Salary']
target_column = 'Position Type'

# Split data into features (X) and target (y)
X = data[selected_features]
y = data[target_column]

# Preprocess 'Experience' column to extract numeric part
X['Experience'] = X['Experience'].str.extract('(\d+)').astype(float)

# Preprocess 'Current Salary' column to remove non-numeric characters and convert to float
X['Current Salary'] = X['Current Salary'].replace('[\$,]', '', regex=True).astype(float)

# Perform one-hot encoding for categorical variables
categorical_features = ['Education Level']
numeric_features = ['Experience', 'Current Salary']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', StandardScaler(), numeric_features)
    ])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply preprocessing
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Train a Random Forest classifier
model = RandomForestClassifier()
model.fit(X_train_processed, y_train)

# Make predictions
y_pred = model.predict(X_test_processed)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save the trained model to a file
model_filename = 'trained_random_forest_model.joblib'
joblib.dump(model, model_filename)
print(f"Model saved as {model_filename}")


# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Save the trained model to a file
model_filename = 'trained_random_forest_model.joblib'
joblib.dump(model, model_filename)
print(f"Model saved as {model_filename}")


[[36  0  0]
 [ 0 19  0]
 [ 0  0 45]]
                        precision    recall  f1-score   support

                 Clerk       1.00      1.00      1.00        36
Junior Polling Officer       1.00      1.00      1.00        19
Senior Polling Officer       1.00      1.00      1.00        45

              accuracy                           1.00       100
             macro avg       1.00      1.00      1.00       100
          weighted avg       1.00      1.00      1.00       100

Model saved as trained_random_forest_model.joblib
[[36  0  0]
 [ 0 19  0]
 [ 0  0 45]]
                        precision    recall  f1-score   support

                 Clerk       1.00      1.00      1.00        36
Junior Polling Officer       1.00      1.00      1.00        19
Senior Polling Officer       1.00      1.00      1.00        45

              accuracy                           1.00       100
             macro avg       1.00      1.00      1.00       100
          weighted avg       1.00      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Experience'] = X['Experience'].str.extract('(\d+)').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Current Salary'] = X['Current Salary'].replace('[\$,]', '', regex=True).astype(float)
