In [1]:
import math
import functions
import numpy as np
from scipy import linalg
import pandas as pd
from tabulate import tabulate
import time
from collections import Counter


## Classes handler

In [3]:
# Every TP dataset comes as two files. Later cells pass the "*_app" set as the
# reference points (app_data) and the "*_dec" set as the points to classify
# (dec_data). `open_file` is provided outside this cell.
split_data_tp1_app, split_data_tp1_dec = (
    open_file('data/data_tp1_app.txt'),
    open_file('data/data_tp1_dec.txt'),
)

split_data_tp2_app, split_data_tp2_dec = (
    open_file('data/data_tp2_app.txt'),
    open_file('data/data_tp2_dec.txt'),
)

split_data_tp3_app, split_data_tp3_dec = (
    open_file('data/data_tp3_app.txt'),
    open_file('data/data_tp3_dec.txt'),
)

## Parzen

In [None]:
def get_uniform_val(app_point, dec_point, h):
    """Uniform (box) kernel between two points.

    Returns 1 when the distance given by ``compute_one_euclidian_dist`` lies
    strictly inside the window ``(-h, h)``, and 0 otherwise.
    """
    distance = compute_one_euclidian_dist(app_point, dec_point)
    inside_window = (distance > -h) & (distance < h)
    return 1 if inside_window else 0

def get_top_n_decision_parzen(n, theo_class, dists):
    """Tell whether ``theo_class`` is one of the ``n`` best-scoring classes.

    ``dists`` maps each class label to its score; the highest scores rank
    first, and equal scores keep the dictionary's iteration order.

    Returns ``True`` when ``theo_class`` appears among the first ``n`` ranked
    labels, ``False`` otherwise (including when ``dists`` is empty).
    """
    ranked = sorted(dists.items(), key=lambda item: item[1], reverse=True)
    return any(theo_class == label for label, _score in ranked[:n])

def get_dict_uniform_value(point, data, h, count_class=100):
    """Score every class for ``point`` with the uniform Parzen kernel.

    Parameters
    ----------
    point : sequence
        Coordinates of the point to score (no class label).
    data : iterable of sequences
        Reference rows; element 0 of each row is the class label and the
        remaining elements are the coordinates.
    h : float
        Window half-width forwarded to ``get_uniform_val``.
    count_class : int or float, optional
        Divisor applied to each class's kernel sum. It defaults to 100, the
        value that used to be hard-coded here, so existing calls behave as
        before. Pass the real number of reference rows per class when it is
        not 100.

    Returns
    -------
    dict
        ``{class_label: kernel_sum / count_class}`` for every label returned
        by ``get_unique_class_num(data)``.
    """
    result_dict = {}
    for one_class in get_unique_class_num(data):
        # Number of reference points of this class that fall inside the window.
        in_window = sum(
            get_uniform_val(app_point[1:], point, h)
            for app_point in data
            if app_point[0] == one_class
        )
        result_dict[one_class] = in_window / count_class
    return result_dict

def update_confusion_matrix(conf_matrix, line_class, scores_dict):
    """Add one observation to ``conf_matrix`` in place.

    The row is chosen by the given class (``line_class``) and the column by
    the best-scoring key of ``scores_dict``; on equal best scores the first
    key in dictionary order is used. Both labels are converted with ``int``
    and shifted down by one to obtain the indices.
    """
    actual_row = int(line_class) - 1
    best_score = max(scores_dict.values())
    winners = [label for label, score in scores_dict.items() if score == best_score]
    predicted_col = int(winners[0]) - 1
    conf_matrix[actual_row, predicted_col] += 1

def get_h_cross_validation(app_data, h_list, cv, method, random_state=None):
    """Choose the Parzen window width ``h`` by ``cv``-fold cross-validation.

    The rows of ``app_data`` are shuffled once and cut into ``cv`` folds.
    For each candidate ``h``, every fold is classified using the remaining
    folds as reference points, and the top-1 error rate is averaged over the
    folds. The candidate with the lowest mean error rate wins; on ties the
    earliest candidate in ``h_list`` is kept.

    Parameters
    ----------
    app_data : sequence of rows
        Labelled rows; element 0 is the class label, the rest the coordinates.
    h_list : iterable of float
        Candidate window widths.
    cv : int
        Number of folds, between 2 and the number of rows.
    method : {"uniform", "gaussian"}
        Selects ``get_dict_uniform_value`` or ``get_dict_gaussian_value``.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the folds are reproducible.
        The default (``None``) keeps the shuffle unseeded.

    Returns
    -------
    list
        ``[best_h, mean_error_rate]``, or ``[0, 0]`` when ``h_list`` is empty.

    Raises
    ------
    ValueError
        If ``method`` is unknown or ``cv`` is outside ``[2, len(app_data)]``.
    """
    # Look up only the requested estimator: the other one may live in a
    # notebook cell that has not been executed yet.
    if method == "uniform":
        get_scores = get_dict_uniform_value
    elif method == "gaussian":
        get_scores = get_dict_gaussian_value
    else:
        # Raise instead of calling exit(), which would shut the kernel down.
        raise ValueError(
            "The given method: {} is not implemented.\n You must choose between \"uniform\" and \"gaussian\"".format(method)
        )

    df = pd.DataFrame(app_data)
    if cv < 2 or cv > len(df):
        raise ValueError(
            "cv must be between 2 and the number of samples ({}), got {}".format(len(df), cv)
        )

    # Shuffle the row labels once and split them (not the DataFrame itself)
    # into folds. The folds do not depend on h, so they are built only once.
    shuffled_index = df.sample(frac=1, random_state=random_state).index.values
    folds = []
    for fold_index in np.array_split(shuffled_index, cv):
        df_cv = df.loc[fold_index].values.tolist()
        df_train = df.drop(fold_index).values.tolist()
        folds.append((df_train, df_cv))

    best_h = [0, 0]
    best_error = None
    for h in h_list:
        sum_error = 0
        for df_train, df_cv in folds:
            count_errors = 0
            for line in df_cv:
                results = get_scores(line[1:], df_train, h)
                # Same top-1 rule as the compute_*_parzen functions.
                if not get_top_n_decision_parzen(1, line[0], results):
                    count_errors += 1
            sum_error += count_errors / len(df_cv)
        mean_error = sum_error / len(folds)
        if best_error is None or mean_error < best_error:
            best_error = mean_error
            best_h = [h, mean_error]
    return best_h

def compute_uniform_parzen(app_data, dec_data, h):
    """Classify ``dec_data`` with the uniform Parzen scores and print a report.

    Every row of ``dec_data`` (label first, coordinates after) is scored
    against ``app_data`` with ``get_dict_uniform_value``. The function counts
    how often the row's label is in the top 1 and in the top 2 classes, fills
    a confusion matrix, and hands the rates and the matrix to
    ``print_decision_model_result``. Nothing is returned.
    """
    labels = get_unique_class_num(app_data)
    n_labels = len(labels)
    confusion = np.zeros((n_labels, n_labels))
    hits_top_1 = 0
    hits_top_2 = 0
    for sample in dec_data:
        true_label, features = sample[0], sample[1:]
        scores = get_dict_uniform_value(features, app_data, h)
        if get_top_n_decision_parzen(1, true_label, scores):
            hits_top_1 += 1
        if get_top_n_decision_parzen(2, true_label, scores):
            hits_top_2 += 1
        update_confusion_matrix(confusion, true_label, scores)
    confusion = transform_matrix_to_df(confusion, labels)
    n_dec = len(dec_data)
    print_decision_model_result(
        len(app_data),
        n_dec,
        hits_top_1 / n_dec,
        hits_top_2 / n_dec,
        conf_matrix=confusion,
    )



In [None]:
# Run the uniform-kernel classifier on the three datasets with one fixed
# window width, timing each run.
h = 0.5
for tp_label, tp_app, tp_dec in (
    ("First Data TP1", split_data_tp1_app, split_data_tp1_dec),
    ("First Data TP2", split_data_tp2_app, split_data_tp2_dec),
    ("First Data TP3", split_data_tp3_app, split_data_tp3_dec),
):
    print(tp_label)
    start_time = time.time()
    compute_uniform_parzen(tp_app, tp_dec, h)
    print("\tExecution time : %s seconds\n" % (time.time() - start_time))
    print("==========================\n")

In [None]:
# Select h by 5-fold cross-validation on each learning set, then evaluate the
# uniform-kernel classifier with the selected h. The timing covers both steps.
h_list = np.arange(0.1, 2, 0.1)
for tp_label, tp_app, tp_dec in (
    ("First Data TP1", split_data_tp1_app, split_data_tp1_dec),
    ("First Data TP2", split_data_tp2_app, split_data_tp2_dec),
    ("First Data TP3", split_data_tp3_app, split_data_tp3_dec),
):
    print(tp_label)
    start_time = time.time()
    # best_h is [selected h, score returned by the cross-validation].
    best_h = get_h_cross_validation(tp_app, h_list, 5, "uniform")
    compute_uniform_parzen(tp_app, tp_dec, best_h[0])
    print("\tThe best h used is {} with a error rate = {}".format(best_h[0], best_h[1]))
    print("\tExecution time : %s seconds\n" % (time.time() - start_time))
    print("==========================\n")

In [None]:
def get_gaussian_val(app_point, dec_point, h):
    """Gaussian kernel between two points.

    Evaluates the one-dimensional normal density with standard deviation
    ``h`` at the distance returned by ``compute_one_euclidian_dist``:
    ``exp(-dist**2 / (2 * h**2)) / (h * sqrt(2 * pi))``.
    """
    distance = compute_one_euclidian_dist(app_point, dec_point)
    normalisation = 1 / (h * math.sqrt(2 * math.pi))
    exponent = (-(distance ** 2)) / (2 * (h ** 2))
    return normalisation * math.exp(exponent)

def get_dict_gaussian_value(point, data, h, count_class=100):
    """Score every class for ``point`` with the Gaussian Parzen kernel.

    Parameters
    ----------
    point : sequence
        Coordinates of the point to score (no class label).
    data : iterable of sequences
        Reference rows; element 0 of each row is the class label and the
        remaining elements are the coordinates.
    h : float
        Kernel width forwarded to ``get_gaussian_val``.
    count_class : int or float, optional
        Divisor applied to each class's kernel sum. It defaults to 100, the
        value that used to be hard-coded here, so existing calls behave as
        before. Pass the real number of reference rows per class when it is
        not 100.

    Returns
    -------
    dict
        ``{class_label: kernel_sum / count_class}`` for every label returned
        by ``get_unique_class_num(data)``.
    """
    result_dict = {}
    for one_class in get_unique_class_num(data):
        # Sum of the kernel values over the reference points of this class.
        kernel_sum = sum(
            get_gaussian_val(app_point[1:], point, h)
            for app_point in data
            if app_point[0] == one_class
        )
        result_dict[one_class] = kernel_sum / count_class
    return result_dict

def compute_gaussian_parzen(app_data, dec_data, h):
    """Classify ``dec_data`` with the Gaussian Parzen scores and print a report.

    Every row of ``dec_data`` (label first, coordinates after) is scored
    against ``app_data`` with ``get_dict_gaussian_value``. The function counts
    how often the row's label is in the top 1 and in the top 2 classes, fills
    a confusion matrix, and hands the rates and the matrix to
    ``print_decision_model_result``. Nothing is returned.
    """
    labels = get_unique_class_num(app_data)
    n_labels = len(labels)
    confusion = np.zeros((n_labels, n_labels))
    hits_top_1 = 0
    hits_top_2 = 0
    for sample in dec_data:
        true_label, features = sample[0], sample[1:]
        scores = get_dict_gaussian_value(features, app_data, h)
        if get_top_n_decision_parzen(1, true_label, scores):
            hits_top_1 += 1
        if get_top_n_decision_parzen(2, true_label, scores):
            hits_top_2 += 1
        update_confusion_matrix(confusion, true_label, scores)
    confusion = transform_matrix_to_df(confusion, labels)
    n_dec = len(dec_data)
    print_decision_model_result(
        len(app_data),
        n_dec,
        hits_top_1 / n_dec,
        hits_top_2 / n_dec,
        conf_matrix=confusion,
    )


In [None]:
# Run the Gaussian-kernel classifier on the three datasets with one fixed
# kernel width, timing each run.
h = 0.5
for tp_label, tp_app, tp_dec in (
    ("First Data TP1", split_data_tp1_app, split_data_tp1_dec),
    ("First Data TP2", split_data_tp2_app, split_data_tp2_dec),
    ("First Data TP3", split_data_tp3_app, split_data_tp3_dec),
):
    print(tp_label)
    start_time = time.time()
    compute_gaussian_parzen(tp_app, tp_dec, h)
    print("\tExecution time : %s seconds\n" % (time.time() - start_time))
    print("==========================\n")

In [None]:
# Select h by 5-fold cross-validation on each learning set, then evaluate the
# Gaussian-kernel classifier with the selected h. The timing covers both steps.
h_list = np.arange(0.1, 2, 0.1)
for tp_label, tp_app, tp_dec in (
    ("First Data TP1", split_data_tp1_app, split_data_tp1_dec),
    ("First Data TP2", split_data_tp2_app, split_data_tp2_dec),
    ("First Data TP3", split_data_tp3_app, split_data_tp3_dec),
):
    print(tp_label)
    start_time = time.time()
    # best_h is [selected h, score returned by the cross-validation].
    best_h = get_h_cross_validation(tp_app, h_list, 5, "gaussian")
    compute_gaussian_parzen(tp_app, tp_dec, best_h[0])
    print("\tThe best h used is {} with a error rate = {}".format(best_h[0], best_h[1]))
    print("\tExecution time : %s seconds\n" % (time.time() - start_time))
    print("==========================\n")

## Séparation linéaire
### Question 1