In [54]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Number of repetitions used by the `for cv in range(CV)` loops in this notebook.
CV = 5

# Read the semicolon-separated dataset and keep a handle on its target column.
data = pd.read_csv('sncb_final.csv', sep=';')
target = data['target']

# Distinct target values in ascending order; used as the class order for the
# confusion matrix and as its tick labels.
LABELS = sorted(target.unique().tolist())

def plot_confusion_matrix(cm, model_name, labels):
    """Render a confusion matrix as an annotated heatmap and save it as a PNG.

    Parameters
    ----------
    cm : array-like
        Matrix of counts; it is converted to an integer array before plotting
        because the cells are annotated with the integer format ``"d"``.
        Assumed to be square with one row/column per entry of ``labels`` and to
        follow the scikit-learn convention (rows = true class, columns =
        predicted class) -- the axis titles below rely on that.
    model_name : str
        Inserted into the output file name and the figure title.
    labels : sequence
        Tick labels for both axes, in the same order as the rows/columns of ``cm``.

    Returns
    -------
    None
        The figure is written to ``models/figures/confusion_matrix_<model_name>.png``
        and then closed.
    """
    # np.asarray lets plain nested lists through as well as ndarrays.
    cm = np.asarray(cm).astype('int')

    # Build the path from components so the separator is correct on every OS,
    # and make sure the target directory exists before saving.
    out_path = Path('models') / 'figures' / f'confusion_matrix_{model_name}.png'
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Use a dedicated figure instead of pyplot's implicit "current" one, so the
    # heatmap is never drawn on top of a figure left open by another cell.
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt="d", xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set(
        xlabel='Predicted label',
        ylabel='True label',
        title=f'Confusion matrix - {model_name}',
    )

    fig.savefig(out_path)
    plt.close(fig)

### Creating and storing the models

In [21]:
# Load the data
data = pd.read_csv('sncb_final.csv', sep=';')

# Split the data into features and target
X = data.drop(columns=['target'])
y = data['target']

# Base seed for the random splits and the forests, so that a full re-run of the
# notebook reproduces the same models and the same held-out sets.
SEED = 42

for cv in range(CV):
    # A different but repeatable seed per repetition: every repetition gets its
    # own split, yet the whole sequence is identical from run to run.
    fold_seed = SEED + cv

    # Hold out 30% of the rows for testing.
    # NOTE(review): the split is not stratified on the target; consider
    # stratify=y if the classes are imbalanced.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=fold_seed
    )

    # Create and train a random forest classifier
    clf = RandomForestClassifier(random_state=fold_seed)
    clf.fit(X_train, y_train)

    # Output locations. The path strings are kept exactly as the testing cell
    # spells them, so the files are found again when they are loaded.
    model_path = Path(f'models\\models\\RFC_{cv}.pkl')
    test_data_path = Path(f'models\\test_data\\RFC_test_data_{cv}.csv')
    test_target_path = Path(f'models\\test_data\\RFC_test_target_{cv}.csv')

    # Create the parent directories on first use instead of failing on a fresh checkout.
    for out_path in (model_path, test_data_path, test_target_path):
        out_path.parent.mkdir(parents=True, exist_ok=True)

    # Save the model
    joblib.dump(clf, model_path)

    # Save the test data (features and target in separate files, without the index)
    X_test.to_csv(test_data_path, index=False)
    y_test.to_csv(test_target_path, index=False)

### Testing

In [55]:
# Prefixes of the saved model / test-set files to evaluate.
models_names = ['RFC']

for model_name in models_names:
    accuracy = 0

    # Start every model from an explicit all-zero matrix so the running total
    # never depends on a `cm` left over from an earlier run of this cell.
    cm = np.zeros((len(LABELS), len(LABELS)), dtype=np.int64)

    for cv in range(CV):
        # NOTE: joblib.load unpickles the file; only load models produced by
        # this notebook (or another trusted source).
        clf = joblib.load(f'models\\models\\{model_name}_{cv}.pkl')
        X_test = pd.read_csv(f'models\\test_data\\{model_name}_test_data_{cv}.csv')

        # The target file has a single column; squeeze it into a Series so the
        # metrics receive a 1-D target rather than a one-column DataFrame.
        y_test = pd.read_csv(
            f'models\\test_data\\{model_name}_test_target_{cv}.csv'
        ).squeeze('columns')

        y_pred = clf.predict(X_test)

        # Per-repetition metrics, accumulated across the CV repetitions.
        accuracy_cv = accuracy_score(y_test, y_pred)
        cm_cv = confusion_matrix(y_test, y_pred, labels=LABELS)
        cm = cm + cm_cv
        accuracy += accuracy_cv

    # One summed confusion matrix per model, and the mean accuracy over the repetitions.
    plot_confusion_matrix(cm, model_name, LABELS)
    print(f'accuracy: {accuracy/CV}')

accuracy: 0.2418300653594771
