# Libraries

In [19]:
from pathlib import Path
import numpy as np

from src.data_processing import DataProcessor

# Data Prep

In [20]:
train_path = Path("./data/raw/aug_train.csv")

# Read the first line on its own so the column names are available as a plain list
with open(train_path, "r", encoding="utf-8") as f:
    train_header = f.readline().strip().split(",")

# Load the remaining rows as strings (empty CSV fields come through as "").
# - encoding is pinned to match the header read above instead of relying on
#   genfromtxt's version/platform-dependent default.
# - comments=None disables genfromtxt's default '#' comment marker, which would
#   otherwise silently truncate any row containing a '#'.
train_data = np.genfromtxt(
    train_path,
    delimiter=",",
    skip_header=1,
    dtype=str,
    encoding="utf-8",
    comments=None,
)
train_data[:5]

array([['8949', 'city_103', '0.92', 'Male', 'Has relevent experience',
        'no_enrollment', 'Graduate', 'STEM', '>20', '', '', '1', '36',
        '1.0'],
       ['29725', 'city_40', '0.7759999999999999', 'Male',
        'No relevent experience', 'no_enrollment', 'Graduate', 'STEM',
        '15', '50-99', 'Pvt Ltd', '>4', '47', '0.0'],
       ['11561', 'city_21', '0.624', '', 'No relevent experience',
        'Full time course', 'Graduate', 'STEM', '5', '', '', 'never',
        '83', '0.0'],
       ['33241', 'city_115', '0.789', '', 'No relevent experience', '',
        'Graduate', 'Business Degree', '<1', '', 'Pvt Ltd', 'never',
        '52', '1.0'],
       ['666', 'city_162', '0.767', 'Male', 'Has relevent experience',
        'no_enrollment', 'Masters', 'STEM', '>20', '50-99',
        'Funded Startup', '4', '8', '0.0']], dtype='<U23')

# Preprocessing

I will implement preprocessing steps that are suitable for Naive Bayes models.

## Drop columns

In [21]:
# Columns removed before modelling:
#   enrollee_id            - just an identifier
#   city_development_index - paired with city, so one of the two is enough
#   training_hours         - very low correlation with the target variable
drop_columns = ["enrollee_id", "city_development_index", "training_hours"]

# Work out the surviving column positions once, then slice the data and the header with them
keep_idx = [idx for idx, name in enumerate(train_header) if name not in drop_columns]
train_data = train_data[:, keep_idx]
train_header = [train_header[idx] for idx in keep_idx]

In [22]:
# Preview the first five rows to confirm the dropped columns are gone
train_data[:5]

array([['city_103', 'Male', 'Has relevent experience', 'no_enrollment',
        'Graduate', 'STEM', '>20', '', '', '1', '1.0'],
       ['city_40', 'Male', 'No relevent experience', 'no_enrollment',
        'Graduate', 'STEM', '15', '50-99', 'Pvt Ltd', '>4', '0.0'],
       ['city_21', '', 'No relevent experience', 'Full time course',
        'Graduate', 'STEM', '5', '', '', 'never', '0.0'],
       ['city_115', '', 'No relevent experience', '', 'Graduate',
        'Business Degree', '<1', '', 'Pvt Ltd', 'never', '1.0'],
       ['city_162', 'Male', 'Has relevent experience', 'no_enrollment',
        'Masters', 'STEM', '>20', '50-99', 'Funded Startup', '4', '0.0']],
      dtype='<U23')

## Missing Value Handling

In [23]:
# Impute by mode: these features have their gaps filled by DataProcessor.impute_by_mode
mode_features = ('enrolled_university', 'major_discipline', 'experience', 'education_level', 'last_new_job')
# The helper is given column positions, so translate the names via the current header
columns_to_impute = [idx for idx, name in enumerate(train_header) if name in mode_features]
# The second return value is not used in this cell
train_data, _ = DataProcessor.impute_by_mode(train_data, columns_to_impute)

In [24]:
# Impute as category: these features have their gaps filled by DataProcessor.impute_as_category
category_features = ('gender', 'company_size', 'company_type')
# The helper is given column positions, so translate the names via the current header
columns_to_impute = [idx for idx, name in enumerate(train_header) if name in category_features]
# The second return value is not used in this cell
train_data, _ = DataProcessor.impute_as_category(train_data, columns_to_impute)

In [25]:
# Check: report how many empty-string cells remain in every column after imputation
n_rows = len(train_data)
for col_idx, feature in enumerate(train_header):
    missing_count = (train_data[:, col_idx] == "").sum()
    missing_pct = missing_count / n_rows * 100
    print(f"Feature '{feature}' has {missing_count} missing values. Percentage: {missing_pct:.2f}%")

Feature 'city' has 0 missing values. Percentage: 0.00%
Feature 'gender' has 0 missing values. Percentage: 0.00%
Feature 'relevent_experience' has 0 missing values. Percentage: 0.00%
Feature 'enrolled_university' has 0 missing values. Percentage: 0.00%
Feature 'education_level' has 0 missing values. Percentage: 0.00%
Feature 'major_discipline' has 0 missing values. Percentage: 0.00%
Feature 'experience' has 0 missing values. Percentage: 0.00%
Feature 'company_size' has 0 missing values. Percentage: 0.00%
Feature 'company_type' has 0 missing values. Percentage: 0.00%
Feature 'last_new_job' has 0 missing values. Percentage: 0.00%
Feature 'target' has 0 missing values. Percentage: 0.00%


In [26]:
# Preview the first three rows to inspect the imputed values
train_data[:3]

array([['city_103', 'Male', 'Has relevent experience', 'no_enrollment',
        'Graduate', 'STEM', '>20', 'Missing', 'Missing', '1', '1.0'],
       ['city_40', 'Male', 'No relevent experience', 'no_enrollment',
        'Graduate', 'STEM', '15', '50-99', 'Pvt Ltd', '>4', '0.0'],
       ['city_21', 'Missing', 'No relevent experience',
        'Full time course', 'Graduate', 'STEM', '5', 'Missing',
        'Missing', 'never', '0.0']], dtype='<U23')

# SMOTEN

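SMOTEN (SMOTE for Nominal features) is an oversampling technique for imbalanced datasets whose features are all categorical. For each minority-class sample, it finds the nearest minority-class neighbors using a distance defined for categorical values (the Value Difference Metric) and creates a synthetic sample by taking, for every feature, the most frequent category among those neighbors.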