In [2]:
import pandas as pd
import os
import numpy as np
import sys
# Make project-local packages importable from this notebook.
# With only '..' on the path, `from src.eda_first import ...` fails with
# ModuleNotFoundError. The data is read from '../../data', so the project root
# (and presumably `src`) sits two levels up -- TODO confirm the location of `src`.
# '..' is kept so anything that relied on the original entry still resolves.
for _import_root in ('..', os.path.join('..', '..')):
    if _import_root not in sys.path:
        sys.path.append(_import_root)

In [5]:
# Load the cleaned promotion training set (path is relative to the notebook's working directory).
train_csv_path = r'../../data/interim/promotion/train_clean.csv'
data = pd.read_csv(train_csv_path)

In [7]:
# List the column names of the loaded frame to see which fields are available.
data.columns

Index(['Division', 'Qualification', 'Gender', 'Channel_of_Recruitment',
       'Trainings_Attended', 'Year_of_birth', 'Last_performance_score',
       'Year_of_recruitment', 'Targets_met', 'Previous_Award',
       'Training_score_average', 'State_Of_Origin', 'Foreign_schooled',
       'Marital_Status', 'Previous_IntraDepartmental_Movement',
       'No_of_previous_employers', 'Promoted_or_Not'],
      dtype='object')

In [9]:
# Separate the label column from the feature columns.
target_col = 'Promoted_or_Not'
y = data[target_col]
x = data.drop(columns=[target_col])

In [11]:
# Split the full frame (label included) by dtype: object columns vs. everything else.
categorical_df = data.select_dtypes(include='object')
numerical_df = data.select_dtypes(exclude='object')

In [13]:
# Bucket the feature columns by dtype in a single pass:
#   - object columns with fewer than 800 distinct values are treated as categorical
#     (the cutoff is a convenience choice, not a tuned value);
#   - int64 / float64 columns are treated as numerical.
categorical_cols = []
numerical_cols = []
for column_name in x.columns:
    column = x[column_name]
    if column.dtype == "object" and column.nunique() < 800:
        categorical_cols.append(column_name)
    if column.dtype in ['int64', 'float64']:
        numerical_cols.append(column_name)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler, MinMaxScaler

# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
split_parts = train_test_split(x, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [17]:
from category_encoders import BaseNEncoder

In [19]:
# `summarize_dataframe` is a project-local helper from the `src.eda_first` module, so the
# directory that contains `src` must be on sys.path for this import to resolve.
# NOTE(review): presumably produces a per-column summary of the frame -- confirm in src.eda_first.
from src.eda_first import summarize_dataframe
summarize_dataframe(X_train)

ModuleNotFoundError: No module named 'src'

In [58]:
# Frequency of each distinct value in the No_of_previous_employers column.
data['No_of_previous_employers'].value_counts()

No_of_previous_employers
1              18064
0              12693
2               1816
3               1524
4               1266
5                893
More than 5      377
Name: count, dtype: int64

In [21]:
# Numeric features: fill missing values with the column mean, then standardise.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Columns that are encoded with a base-3 BaseNEncoder.
base_encoder_columns = [
    'Division',
    'Qualification',
    'Channel_of_Recruitment',
    'State_Of_Origin',
    'Foreign_schooled',
    'Marital_Status',
    'Previous_IntraDepartmental_Movement',
    'No_of_previous_employers',
    'Gender',
]
base_n_step = ('base_encoder', BaseNEncoder(cols=base_encoder_columns, base=3))
base_encoder = Pipeline(steps=[base_n_step])

In [23]:
# Route each column group through its own transformer:
#   - 'base_name': base-N encoding for the columns in base_encoder_columns
#   - 'num': impute + scale for the numerical columns
# Columns listed in neither group are dropped (ColumnTransformer's default remainder).
column_transformers = [
    ('base_name', base_encoder, base_encoder_columns),
    ('num', numerical_transformer, numerical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Shallow random forest (150 trees, depth 4) with a fixed seed for repeatable results.
rf = RandomForestClassifier(n_estimators=150, max_depth=4, random_state=42)

# Chain preprocessing and the classifier so both are fitted on the training split only.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('random_forest', rf),
]
rf_pipe = Pipeline(steps=pipeline_steps)

# Fit on the training split, then predict the held-out split.
rf_pipe.fit(X_train, y_train)
rf_preds = rf_pipe.predict(X_test)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, rf_preds)
print('Accuracy for Random Forest Model:', accuracy)

# Per-class precision / recall / F1.
# NOTE(review): the recorded run shows high accuracy but low recall for class 1,
# so read the per-class rows rather than accuracy alone.
report = classification_report(y_test, rf_preds)
print('Classification Report:\n', report)


Accuracy for Random Forest Model: 0.9224749772520473
Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96     10039
           1       0.98      0.11      0.19       951

    accuracy                           0.92     10990
   macro avg       0.95      0.55      0.58     10990
weighted avg       0.93      0.92      0.89     10990

