# Importing Libraries

In [1]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import resample
import pandas as pd
import numpy as np

# Load data

In [2]:
# Read the purchase-order table; the path is resolved relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='PO_data.csv')

# Drop columns with too many missing values (e.g., >50%)

In [3]:
# A column is kept only when at least half of its rows hold a value.
min_non_null = len(data) * 0.5
data = data.dropna(axis='columns', thresh=min_non_null)

# Use `VendorPaymentMethodName` as the example target (replace it with the appropriate target variable for your task)

In [4]:
# Fail fast when the expected label column did not survive loading/cleaning;
# target_column is only bound once the check has passed.
if 'VendorPaymentMethodName' not in data.columns:
    raise ValueError("Please specify a valid target column")
target_column = 'VendorPaymentMethodName'

# Separate features and target

In [5]:
# Label series first, then every remaining column as the feature frame.
target = data[target_column]
features = data.drop(columns=target_column)

# Balance the classes in the target variable

In [6]:
target_counts = target.value_counts()
max_count = target_counts.max()

# Oversample minority classes to balance

In [7]:
balanced_data = []
for class_label in target_counts.index:
    class_data = data[data[target_column] == class_label]
    balanced_data.append(resample(class_data, replace=True, n_samples=max_count, random_state=42))

# Combine balanced data into one DataFrame

In [8]:
balanced_data = pd.concat(balanced_data)

# Separate features and target again for balanced data

In [9]:
X_balanced = balanced_data.drop(columns=[target_column], axis=1)
y_balanced = balanced_data[target_column]

# `Handle missing values and encode categorical variables in X_balanced` For numerical columns, fill NaNs with the median

In [10]:
num_features = X_balanced.select_dtypes(include=[np.number])
num_imputer = SimpleImputer(strategy='median')
num_features_imputed = pd.DataFrame(num_imputer.fit_transform(num_features), columns=num_features.columns)

# For categorical columns, fill NaNs with a placeholder value

In [11]:
cat_features = X_balanced.select_dtypes(include=[object])
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
cat_features_imputed = pd.DataFrame(cat_imputer.fit_transform(cat_features), columns=cat_features.columns)

# Use OneHotEncoder for categorical variables

In [12]:
cat_features_encoded = pd.get_dummies(cat_features_imputed)

# Combine processed numerical and categorical features

In [13]:
processed_features = pd.concat([num_features_imputed, cat_features_encoded], axis=1)

# **Scale features**

In [14]:
scaler = StandardScaler()

# Split data into train and test sets using stratified splitting

In [15]:
X_train, X_test, y_train, y_test = train_test_split(processed_features, y_balanced, test_size=0.2, stratify=y_balanced, random_state=42)

# Fit scaler on training data only, then apply to both train and test sets

In [16]:
# Scaling statistics come from the training matrix only; the same fitted
# scaler is then applied to both matrices.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Cross-validation to check for overfitting: use StratifiedKFold to check that model performance is consistent across folds

In [17]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Build a RandomForest model for classification

In [18]:
model = RandomForestClassifier(random_state=42)

# Cross-validation

In [19]:
# Caveat: the training rows were oversampled with replacement earlier in the
# notebook, so copies of one row can fall into both the fitting and the
# validation part of a fold. Treat these scores as optimistic.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train_scaled,
    y=y_train,
    scoring='accuracy',
    cv=skf,
)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean CV Accuracy: 1.0


# Train model on full training set

In [20]:
# Final fit on the whole scaled training matrix (cross-validation above only
# fitted clones of the estimator).
model.fit(X=X_train_scaled, y=y_train)

# Make predictions and evaluate the model on the test set

In [21]:
# Predicted labels for the held-out rows, then the share predicted correctly.
y_pred = model.predict(X=X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Print results


In [22]:
# NOTE(review): a perfect score on every class is unusual. Confirm that test
# rows are not copies of training rows and that no feature column encodes the
# target -- neither can be verified from this cell alone.
print("Test Set Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Test Set Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

    BRIDGING       1.00      1.00      1.00       307
        Bank       1.00      1.00      1.00       308
       CHECK       1.00      1.00      1.00       308
    CNY_Wire       1.00      1.00      1.00       308
        Cash       1.00      1.00      1.00       308
       Check       1.00      1.00      1.00       307
     Cheques       1.00      1.00      1.00       308
    DINHEIRO       1.00      1.00      1.00       308
  ELECTRONIC       1.00      1.00      1.00       308
     SEPA CT       1.00      1.00      1.00       307
    Основной       1.00      1.00      1.00       307

    accuracy                           1.00      3384
   macro avg       1.00      1.00      1.00      3384
weighted avg       1.00      1.00      1.00      3384

