In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the recordings. The path literal is empty in this notebook and has to be
# pointed at the CSV file before the cell can run.
df = pd.read_csv("")

# "t_sec" is optional in the input: drop it when present, otherwise leave the columns untouched.
df = df.drop(columns=["t_sec"], errors="ignore")

# Second variant of the data without the six *Neck_* columns
# (presumably the neck-mounted sensor channels - TODO confirm).
df_back = df.drop(
    columns=[
        'ANeck_x', 'ANeck_y', 'ANeck_z',
        'GNeck_x', 'GNeck_y', 'GNeck_z',
    ]
)

In [None]:
def split_data(data, test_size=0.2, random_state=None, stratify=False):
    '''
    Splits a data frame into training and test data.

    The column "Behavior" is used as target, all remaining columns are used as features.

    Parameters:
        data (DataFrame): Any Data Frame which contains a "Behavior" column.
        test_size (float or int): Share (float) or absolute number (int) of rows reserved
            for testing. Defaults to 0.2.
        random_state (int or None): Seed forwarded to train_test_split. Pass an int to get
            the same split on every run; None (default) keeps the split random.
        stratify (bool): If True, the split is stratified on "Behavior" so both parts keep
            the class distribution of the input. Defaults to False.

    Returns:
        x_train (DataFrame): Features for training a model.
        y_train (Series): Targets for training a model.
        x_test (DataFrame): Features for testing a trained model.
        y_test (Series): Targets for testing a trained model.
    '''

    x = data.drop(columns=["Behavior"])
    y = data["Behavior"]
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    # Note the return order: it differs from the order train_test_split uses.
    return x_train, y_train, x_test, y_test


# Split the frame without the neck columns. The four resulting names are notebook-level
# variables that later cells read (correlation plot, model fitting, evaluation).
x_train, y_train, x_test, y_test = split_data(df_back)

In [None]:
def plot_correlations(values):
    '''
    Draws an annotated heatmap of the absolute pairwise correlations between the columns of a Data Frame.

    Note: the figure size is applied through sns.set and therefore changes the
    plotting defaults for the rest of the session, not only for this figure.

    Parameters:
        values (DataFrame): Any Data Frame with multiple columns.
    '''
    # Only the strength of a relationship is shown, so the sign is discarded.
    abs_corr = values.corr().abs()

    sns.set(rc={'figure.figsize': (12, 8)})
    sns.heatmap(abs_corr, annot=True)


# Correlations are inspected on the training features only; the test split is not used here.
plot_correlations(x_train)

In [None]:
def compare_algo(x, y):
    '''
    Tests different classifier algorithms on a classification task to choose the most appropriate one for optimization.
    Visualizes the test results as a boxplot.

    Every classifier is created with its default settings and scored with
    cross_val_score (default cross-validation and scoring).

    Parameters:
        x (DataFrame): Features for training a model.
        y (Series): Targets for training a model.
    '''

    algs = [("GBC", GradientBoostingClassifier()),
            ("GNB", GaussianNB()),
            ("KNN", KNeighborsClassifier()),
            ("LDA", LinearDiscriminantAnalysis()),
            ("RFC", RandomForestClassifier()),
            ("SVC", SVC()),
            ("TREE", DecisionTreeClassifier())]

    names = [name for name, _ in algs]
    results = [
        cross_val_score(model, x, y, n_jobs=3, verbose=1)
        for _, model in algs
    ]

    plt.boxplot(results)
    # Label the boxes via xticks instead of boxplot(labels=...): that keyword was
    # renamed to tick_labels in Matplotlib 3.9 and the old spelling is deprecated.
    # Boxes are drawn at positions 1..N by default.
    plt.xticks(range(1, len(names) + 1), names)
    plt.title("Classifier Comparison")
    plt.xlabel("Classifier")
    plt.ylabel("cross-validation-score")
    plt.show()


# Disabled by default; uncomment to run the classifier comparison.
# compare_algo(x_train, y_train)

In [None]:
def tune_model(x, y):
    '''
    Performs a grid search over random forest hyper-parameters to choose the best possible parameters.

    Parameters:
        x (DataFrame): Features for training a model.
        y (Series): Targets for training a model.

    Returns:
        best_model (estimator): Estimator which got the best results.
    '''

    # warm_start is intentionally not enabled: GridSearchCV fits a fresh clone for
    # every candidate, so it cannot speed up the search, and on the returned model
    # it would make a later fit() with the same n_estimators keep the old trees.
    model = RandomForestClassifier()
    params = {
        "n_estimators": [10, 50, 100, 500, 1000],
        "criterion": ["gini", "entropy"],
        # "auto" is left out: for classifiers it was an alias of "sqrt" (a duplicated
        # third of the grid) and it was removed in scikit-learn 1.3, where those
        # candidates fail to fit.
        "max_features": ["sqrt", "log2"],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "bootstrap": [True, False]
    }
    search_model = GridSearchCV(estimator=model, param_grid=params, n_jobs=3, verbose=2)
    search_model.fit(x, y)
    best_model = search_model.best_estimator_
    return best_model


# Disabled by default; uncomment to run the grid search.
# best_model = tune_model(x_train, y_train)

In [None]:
def feature_importance(model, feature_names=None):
    '''
    Reads the relevance of the various features and visualizes these values
    as a horizontal bar chart, sorted by importance.

    Parameters:
        model (estimator): Trained classifier estimator which exposes feature_importances_.
        feature_names (sequence of str, optional): Names to label the bars with. If omitted,
            the names stored on the model (feature_names_in_) are used; if the model does not
            provide them, the columns of the notebook-level x_train are used.

    Raises:
        ValueError: If the number of feature names does not match the number of importances.
    '''

    imp = np.asarray(model.feature_importances_)

    # Prefer names that belong to the model itself, so a model trained on another
    # frame than the global x_train is not labelled with the wrong columns.
    if feature_names is None:
        feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is None:
        feature_names = x_train.columns
    features = list(feature_names)

    if len(features) != len(imp):
        raise ValueError(
            f"Got {len(features)} feature names for {len(imp)} feature importances."
        )

    # Ascending order: barh draws the first entry at the bottom, so the most
    # important feature ends up at the top of the chart.
    indices = np.argsort(imp)
    positions = range(len(indices))

    plt.title('Feature Importance\n', fontsize = 15)
    plt.ylabel("Feature\n")
    plt.xlabel("\nImportance")
    plt.barh(positions, imp[indices], align='center')
    plt.yticks(positions, [features[i] for i in indices])
    plt.show()

In [None]:
def plot_conf_matrix(test, pred):
    '''
    Visualizes a confusion matrix based on the test data split and a prediction of a classifier model.

    Note: the figure size is applied through sns.set and therefore changes the
    plotting defaults for the rest of the session, not only for this figure.

    Parameters:
        test (Series): Targets for testing a trained model.
        pred (array): Predicted classes from a classifier model.
    '''

    # Sorted union of true and predicted classes. The same label list is passed to
    # confusion_matrix and to the tick labels so rows/columns and their captions
    # are guaranteed to line up (test.unique() is in order of appearance and also
    # misses classes that only occur in the predictions).
    labels = np.unique(np.concatenate([np.asarray(test), np.asarray(pred)]))
    conf_matrix = confusion_matrix(test, pred, labels=labels)

    # Must run before the heatmap creates the figure, otherwise the size only
    # applies to figures created afterwards.
    sns.set(rc={'figure.figsize':(20,16)})
    sns.heatmap(conf_matrix, xticklabels=labels, yticklabels=labels, annot=True,linewidths = 0.1, fmt="d", cmap = "YlGnBu")
    plt.title("Confusion matrix\n", fontsize = 20)
    plt.ylabel("True label\n")
    plt.xlabel("\nPred label")
    plt.show()

In [None]:
def test_model(model, x=None, y=None):
    '''
    Tests a ML-model and outputs different metrics: accuracy, classification report,
    confusion matrix plot and feature importance plot.

    Parameters:
        model (estimator): Trained classifier estimator.
        x (DataFrame, optional): Features to evaluate on. Defaults to the notebook-level x_test.
        y (Series, optional): Targets belonging to x. Defaults to the notebook-level y_test.

    Raises:
        ValueError: If only one of x and y is passed.
    '''

    # Mixing a caller-supplied half with the global half would silently evaluate
    # against mismatching rows, so both have to come from the same place.
    if (x is None) != (y is None):
        raise ValueError("x and y must be passed together or both omitted.")
    if x is None:
        x, y = x_test, y_test

    y_pred = model.predict(x)

    print("Accuracy:", accuracy_score(y, y_pred))

    print("\n -----------------Classification Report-----------------\n")
    print(classification_report(y, y_pred))

    print("\n -------------------Confusion Matrix--------------------\n")
    plot_conf_matrix(y, y_pred)

    print("\n ------------------Feature Importance-------------------\n")
    feature_importance(model)

In [None]:
# Random forest with hand-picked hyper-parameters, trained on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
rfclf = RandomForestClassifier(
    bootstrap=False,
    criterion="gini",
    max_features="sqrt",
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    n_jobs=3,
).fit(x_train, y_train)

# Prints the metrics and draws the confusion matrix and feature importance plots.
test_model(rfclf)