In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

In [15]:
# Add these lines at the beginning of your code to define the parameter grid for KNN and SVM classifiers
knn_params = {
    'n_neighbors': range(1, 21, 2),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

svm_params = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}


def load_data(video_name):
    data_files = [
        f"{video_name}_DataRGB.csv",
        f"{video_name}_DataHOG.csv",
        f"{video_name}_DataH10.csv",
        f"{video_name}_DataLBP.csv",
    ]
    labels_file = f"{video_name}_Labels.csv"

    data = [pd.read_csv(file) for file in data_files]
    labels = pd.read_csv(labels_file)

    return data, labels

def split_data(data, labels, training_size):
    X_train = data.iloc[:training_size, :]
    X_test = data.iloc[training_size:, :]
    y_train = labels.iloc[:training_size, :]
    y_test = labels.iloc[training_size:, :]

    return X_train, X_test, y_train, y_test

def preprocess_data(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled

def class_distribution(y_train):
    return y_train.value_counts(normalize=True)

def compare_results(video_name, feature_name, results_df):
    ground_truth_file = f"{video_name}_Results.csv"
    ground_truth_results = pd.read_csv(ground_truth_file, header=None, sep=",")

    mean_ground_truth = ground_truth_results.mean(axis=0)
    std_ground_truth = ground_truth_results.std(axis=0)

    print(f"\nGround truth results for {video_name} using {feature_name} features:")
    print(f"Mean: {mean_ground_truth.values[0]:.4f}")
    print(f"Standard Deviation: {std_ground_truth.values[0]:.4f}")

    mean_code_results = results_df['Accuracy'].mean()
    std_code_results = results_df['Accuracy'].std()

    print(f"\nCode results for {video_name} using {feature_name} features:")
    print(f"Mean: {mean_code_results:.4f}")
    print(f"Standard Deviation: {std_code_results:.4f}")

def largest_prior_classifier(y_train):
    return y_train.value_counts().idxmax()

def predict_largest_prior(y_train, X_test):
    largest_prior = largest_prior_classifier(y_train)
    return np.full(X_test.shape[0], largest_prior)
def train_and_evaluate_classifiers(X_train, X_test, y_train, y_test):
    classifiers = [
        ('Largest Prior', None),
        ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
        ('3-nn', KNeighborsClassifier(n_neighbors=3)),
        ('Decision Tree', DecisionTreeClassifier(random_state=42)),
        ('SVM', SVC(random_state=42)),
        ('Bagging', BaggingClassifier(random_state=42)),
        ('Random Forest', RandomForestClassifier(random_state=42)),
        ('KNN (tuned)', knn_tuned),
        ('SVM (tuned)', svm_tuned)
    ]

    results = []

    largest_prior_accuracy = y_test.value_counts(normalize=True).max()
    results.append({'Classifier': 'Largest Prior', 'Accuracy': largest_prior_accuracy})

    for name, classifier in classifiers[1:]:
        if name == 'Ensemble':
            y_pred = ensemble_voting_classifier(classifiers[1:], X_train, y_train, X_test)
        else:
            classifier.fit(X_train, y_train.values.ravel())
            y_pred = classifier.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        results.append({'Classifier': name, 'Accuracy': accuracy})

    results_df = pd.DataFrame(results)
    return results_df


def apply_pca(X_train, X_test, n_components=0.95):
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca


def hyperparameter_tuning(classifier, param_grid, X_train, y):
    cv = StratifiedKFold(n_splits=5)
    grid_search = GridSearchCV(classifier, param_grid, cv=cv, scoring='accuracy')
    grid_search.fit(X_train, y.values.ravel())
    best_params = grid_search.best_params_
    return best_params





def ensemble_voting_classifier(classifiers, X_train, y_train, X_test):
    voting_clf = VotingClassifier(estimators=classifiers, voting='hard', n_jobs=-1)
    voting_clf.fit(X_train, y_train.values.ravel())
    y_pred = voting_clf.predict(X_test)
    return y_pred

# Main loop
if __name__ == "__main__":
    video_info = {
        'Pigs_49651_960_540_500f': 2710,
        'Koi_5652_952_540': 916,
        'Pigeons_8234_1280_720': 2268,
        'Pigeons_4927_960_540_600f': 1574,
        'Pigeons_29033_960_540_300f': 2148
    }

    feature_names = ['DataRGB', 'DataHOG', 'DataH10', 'DataLBP']

    for video_name, training_size in video_info.items():
        print(f'Processing {video_name}')
        data, labels = load_data(video_name)
        for i, feature_data in enumerate(data):
            feature_name = feature_names[i]
            print(f'\nFeature: {feature_name}')
            X_train_temp, X_test_temp, y_train_temp, y_test_temp = split_data(feature_data, labels, training_size)
            X_train_scaled, X_test_scaled = preprocess_data(X_train_temp, X_test_temp)

            # Apply PCA
            X_train_pca, X_test_pca = apply_pca(X_train_scaled, X_test_scaled)

            # Check class distribution
            print(f'Class Distribution:\n{class_distribution(y_train_temp)}\n')

            knn_tuned = hyperparameter_tuning(KNeighborsClassifier(), knn_params, X_train_pca, y_train_temp)
            svm_tuned = hyperparameter_tuning(SVC(probability=True), svm_params, X_train_pca, y_train_temp)

            # Perform cross-validation
            cv_scores_knn = cross_val_score(knn_tuned, X_train_pca, y_train_temp.values.ravel(), cv=StratifiedKFold(n_splits=5))
            cv_scores_svm = cross_val_score(svm_tuned, X_train_pca, y_train_temp.values.ravel(), cv=StratifiedKFold(n_splits=5))
            print(f"Cross-validation scores for KNN: {cv_scores_knn}")
            print(f"Cross-validation scores for SVM: {cv_scores_svm}")

            # Evaluate classifiers
            results_df = train_and_evaluate_classifiers(X_train_pca, X_test_pca, y_train_temp, y_test_temp)
            print(results_df)

            # Compare results with ground truth
            compare_results(video_name, feature_name, results_df)

        print('\n')



Processing Pigs_49651_960_540_500f

Feature: DataRGB
Class Distribution:
16
17    0.089299
16    0.088561
10    0.085609
1     0.077860
11    0.073432
2     0.066790
12    0.063469
9     0.060148
18    0.053875
7     0.050923
6     0.049077
13    0.047601
5     0.047232
4     0.030627
14    0.028782
8     0.018819
20    0.014760
15    0.014022
0     0.013653
19    0.013284
3     0.007380
21    0.004797
dtype: float64

