In [1]:
import math
import numpy as np
from scipy import linalg
import pandas as pd
from tabulate import tabulate
import time
from collections import Counter


## Classes handler

In [2]:
def open_file(filename):
    with open(filename) as f:
        lines = f.readlines()
    split_lines = []
    for line in lines:
        split_lines.append(line.split())
    return split_lines

def get_splited_class(data, class_num):
    data_class = []
    for line in data:
        if int(line[0]) == class_num:
            data_class.append(line)
    return data_class

def get_unique_class_num(data):
    class_num = []
    for line in data:
        class_num.append(line[0])
    return np.unique(class_num)

In [3]:
split_data_tp1_app = open_file('data/data_tp1_app.txt')
split_data_tp1_dec = open_file('data/data_tp1_dec.txt')

split_data_tp2_app = open_file('data/data_tp2_app.txt')
split_data_tp2_dec = open_file('data/data_tp2_dec.txt')

split_data_tp3_app = open_file('data/data_tp3_app.txt')
split_data_tp3_dec = open_file('data/data_tp3_dec.txt')

## Model results 

In [4]:
def get_top_n_decision(n, theo_class, dists):
    dists_sorted = sorted(dists.items(), key = lambda kv: kv[1])
    cut_dists =dists_sorted[0:n]
    top_n_result = False
    for dist in cut_dists:
        if theo_class == dist[0]:
           top_n_result =True
    return top_n_result

def update_confusion_matrix(conf_matrix, line_class, scores_dict):
    row_num = int(line_class) - 1
    temp = min(scores_dict.values()) 
    res = [key for key in scores_dict if scores_dict[key] == temp] 
    col_num = int(res[0]) -1
    conf_matrix[row_num, col_num] = conf_matrix[row_num, col_num] + 1

def transform_matrix_to_df(conf_matrix, class_names):
    df = pd.DataFrame(conf_matrix, index=class_names, columns=class_names, dtype=int)
    return df

def print_decision_model_result(nb_app, nb_dec, top1_rate, top2_rate, conf_matrix):
    print("\tResults :")
    print("\t----------------")
    print("\tMethod used : Euclidian distance")
    print("\tNumber of elements for the learning step : ", nb_app)
    print("\tNumber of elements for the decision step : ", nb_dec)
    print("\t----------------")
    print("\n\tTop results :")
    print("\t----------------")
    print("\tTop 1 rate : ", top1_rate)
    print("\tTop 2 rate : ", top2_rate)
    print("\t----------------")
    print("\n\tConfusion matrix :")
    print("\t----------------")
    print(tabulate(conf_matrix, headers = 'keys', tablefmt = 'fancy_grid')) 
    print("\t----------------")


## Estimation de gaussiennes

In [5]:
def gaussian_fit_model(data):
    class_centers = {}
    classes = get_unique_class_num(data)
    for class_num in classes:
        data_class = get_splited_class(data, int(class_num))
        sum_x = 0
        sum_y = 0
        for line in data_class:
            sum_x += float(line[1])
            sum_y += float(line[2])
        x_center = 1/len(data_class) * sum_x
        y_center = 1/len(data_class) * sum_y
        class_centers[class_num] = [x_center, y_center]
    return class_centers

def compute_one_euclidian_dist(first_point, second_point):
    sum_square = 0
    for i in range(0, len(first_point)):
        sum_square = (float(first_point[i]) - float(second_point[i]))**2      
    return math.sqrt(sum_square)

def compute_euclidian_dists(class_centers, line):
    dists_dict = {}
    for class_center in class_centers:
        dist = compute_one_euclidian_dist(class_centers[class_center], line[1:3])
        dists_dict[class_center] = dist   
    return dists_dict

def euclidian_test_model(class_centers, app_data, dec_data):
    classes_num =get_unique_class_num(app_data)
    conf_matrix = np.zeros((len(classes_num), len(classes_num)))
    count_top1 = 0
    count_top2 = 0
    for line in dec_data:
        dists = compute_euclidian_dists(class_centers, line)
        if get_top_n_decision(1, line[0], dists):
            count_top1 = count_top1 + 1
        if get_top_n_decision(2, line[0], dists):
            count_top2 = count_top2 + 1
        update_confusion_matrix(conf_matrix, line[0], dists)
    df = transform_matrix_to_df(conf_matrix, classes_num)
    print_decision_model_result(len(app_data), len(dec_data), count_top1/len(dec_data), count_top2/len(dec_data), df)


In [6]:
def compute_mahalanobis_dists(class_centers, line, inv_covmat):
    dists_dict = {}
    for class_center in class_centers:
        dist = compute_one_mahalanobis_dist(line[1:3], class_centers[class_center], inv_covmat)
        dists_dict[class_center] = dist   
    return dists_dict

def compute_inv_covmat(dec_data):
    data  = pd.DataFrame(dec_data, dtype=float)[[1,2]]
    cov = np.cov(data.values.T)
    inv_covmat = linalg.inv(cov)
    return inv_covmat

def compute_one_mahalanobis_dist(first_point, second_point, inv_covmat):
    x_minus_mu = []
    for i in range(0, len(first_point)):
        x_minus_mu.append(float(first_point[i]) - second_point[i])
    x_minus_mu = np.array(x_minus_mu)
    left_term = np.dot(x_minus_mu, inv_covmat)
    mahal = np.dot(left_term, x_minus_mu.T)
    return math.sqrt(mahal)

def mahalanobis_test_model(class_centers, app_data, dec_data):
    classes_num =get_unique_class_num(app_data)
    conf_matrix = np.zeros((len(classes_num), len(classes_num)))
    count_top1 = 0
    count_top2 = 0
    inv_covmat = compute_inv_covmat(dec_data)
    for line in dec_data:
        dists = compute_mahalanobis_dists(class_centers, line, inv_covmat)
        if get_top_n_decision(1, line[0], dists):
            count_top1 = count_top1 + 1
        if get_top_n_decision(2, line[0], dists):
            count_top2 = count_top2 + 1
        update_confusion_matrix(conf_matrix, line[0], dists)
    df = transform_matrix_to_df(conf_matrix, classes_num)
    print_decision_model_result(len(app_data), len(dec_data), count_top1/len(dec_data), count_top2/len(dec_data), df)



## Question 1
### Distance euclidienne

In [8]:
print("First Data TP1")
start_time = time.time()
model = gaussian_fit_model(split_data_tp1_app)
euclidian_test_model(model, split_data_tp1_app, split_data_tp1_dec)
print("\tExecution time : %s seconds\n" % (time.time() - start_time))
print("==========================\n")

print("First Data TP2")
start_time = time.time()
model = gaussian_fit_model(split_data_tp2_app)
euclidian_test_model(model, split_data_tp2_app, split_data_tp2_dec)
print("\tExecution time : %s seconds\n" % (time.time() - start_time))
print("==========================\n")

print("First Data TP3")
start_time = time.time()
model = gaussian_fit_model(split_data_tp3_app)
euclidian_test_model(model, split_data_tp3_app, split_data_tp3_dec)
print("\tExecution time : %s seconds\n" % (time.time() - start_time))
print("==========================\n")

First Data TP1
	Results :
	----------------
	Method used : Euclidian distance
	Number of elements for the learning step :  500
	Number of elements for the decision step :  500
	----------------

	Top results :
	----------------
	Top 1 rate :  0.576
	Top 2 rate :  0.962
	----------------

	Confusion matrix :
	----------------
╒════╤═════╤═════╤═════╤═════╤═════╕
│    │   1 │   2 │   3 │   4 │   5 │
╞════╪═════╪═════╪═════╪═════╪═════╡
│  1 │  40 │  56 │   0 │   0 │   4 │
├────┼─────┼─────┼─────┼─────┼─────┤
│  2 │  41 │  51 │   0 │   0 │   8 │
├────┼─────┼─────┼─────┼─────┼─────┤
│  3 │   0 │   0 │  42 │  58 │   0 │
├────┼─────┼─────┼─────┼─────┼─────┤
│  4 │   0 │   0 │  34 │  60 │   6 │
├────┼─────┼─────┼─────┼─────┼─────┤
│  5 │   1 │   0 │   4 │   0 │  95 │
╘════╧═════╧═════╧═════╧═════╧═════╛
	----------------
	Execution time : 0.0069086551666259766 seconds


First Data TP2
	Results :
	----------------
	Method used : Euclidian distance
	Number of elements for the learning step :  5

### Disitance de Malahonibis

In [9]:
print("First Data TP1")
start_time = time.time()
model = gaussian_fit_model(split_data_tp1_app)
mahalanobis_test_model(model, split_data_tp1_app, split_data_tp1_dec)
print("\tExecution time : %s seconds\n" % (time.time() - start_time))
print("==========================\n")

print("First Data TP2")
start_time = time.time()
model = gaussian_fit_model(split_data_tp2_app)
mahalanobis_test_model(model, split_data_tp2_app, split_data_tp2_dec)
print("\tExecution time : %s seconds\n" % (time.time() - start_time))
print("==========================\n")

print("First Data TP3")
start_time = time.time()
model = gaussian_fit_model(split_data_tp3_app)
mahalanobis_test_model(model, split_data_tp3_app, split_data_tp3_dec)
print("\tExecution time : %s seconds\n" % (time.time() - start_time))
print("==========================\n")

First Data TP1
	Results :
	----------------
	Method used : Euclidian distance
	Number of elements for the learning step :  500
	Number of elements for the decision step :  500
	----------------

	Top results :
	----------------
	Top 1 rate :  0.992
	Top 2 rate :  1.0
	----------------

	Confusion matrix :
	----------------
╒════╤═════╤═════╤═════╤═════╤═════╕
│    │   1 │   2 │   3 │   4 │   5 │
╞════╪═════╪═════╪═════╪═════╪═════╡
│  1 │  98 │   0 │   0 │   0 │   2 │
├────┼─────┼─────┼─────┼─────┼─────┤
│  2 │   0 │ 100 │   0 │   0 │   0 │
├────┼─────┼─────┼─────┼─────┼─────┤
│  3 │   0 │   0 │  99 │   1 │   0 │
├────┼─────┼─────┼─────┼─────┼─────┤
│  4 │   0 │   0 │   0 │ 100 │   0 │
├────┼─────┼─────┼─────┼─────┼─────┤
│  5 │   1 │   0 │   0 │   0 │  99 │
╘════╧═════╧═════╧═════╧═════╧═════╛
	----------------
	Execution time : 0.05328083038330078 seconds


First Data TP2
	Results :
	----------------
	Method used : Euclidian distance
	Number of elements for the learning step :  500
	

## Question 2
### 1ppv

In [7]:
def get_kppv_list(app_data, dec_point, k):
    dist_list = []
    for app_line in app_data:
        dist_list.append([app_line[0], compute_one_euclidian_dist(app_line[1:3], dec_point)])
    app_df = pd.DataFrame(dist_list)
    sorted_df = app_df.sort_values(by=1)
    kppv_list = sorted_df.head(k)
    return kppv_list

def compute_1ppv(app_data, dec_data):
    count_top_1 = 0
    classes_num =get_unique_class_num(app_data)
    conf_matrix = np.zeros((len(classes_num), len(classes_num)))
    for line in dec_data:
        kppv_list = get_kppv_list(app_data, line[1:], 1)
        if kppv_list.iloc[0][0] == line[0]:
            count_top_1 = count_top_1 + 1
        # Conf matrix
        row_num = int(line[0]) - 1
        col_num = int(kppv_list.iloc[0][0]) -1
        conf_matrix[row_num, col_num] = conf_matrix[row_num, col_num] + 1
    conf_matrix = transform_matrix_to_df(conf_matrix, classes_num)
    print_decision_model_result(len(app_data), len(dec_data), count_top_1/len(dec_data), top2_rate=None, conf_matrix=conf_matrix)

def get_majority_result(kppv_list):
    count_dict = Counter(kppv_list[0])
    temp = max(count_dict.values()) 
    res = [key for key in count_dict if count_dict[key] == temp] 
    return res

def get_k_cross_validation(app_data, k_max, cv):
    best_k = [0, 0]
    df = pd.DataFrame(app_data)
    shuffled = df.sample(frac=1)
    cut_dfs = np.array_split(shuffled, cv)  
    for i in range(k_max):
        k = i + 1
        sum_error = 0
        for i in range(cv):
            df_cv = cut_dfs[i]
            df_train = df.drop(df_cv.index)
            df_cv = df_cv.values.tolist()
            df_train = df_train.values.tolist()
            count_top_1 = 0
            for line in df_cv:
                kppv_list = get_kppv_list(df_train, line[1:], k)
                res = get_majority_result(kppv_list)
                if res[0] == line[0]:
                    count_top_1 = count_top_1 + 1
            error_rate = count_top_1/len(df_cv)
            sum_error = sum_error + error_rate
        if best_k[1] < sum_error/cv:
            best_k[0] = k
            best_k[1] = sum_error/cv
    return best_k

def compute_kppv(app_data, dec_data, k):
    count_top_1 = 0
    count_top_2 = 0
    classes_num =get_unique_class_num(app_data)
    conf_matrix = np.zeros((len(classes_num), len(classes_num)))
    for line in dec_data:
        kppv_list = get_kppv_list(app_data, line[1:], k)
        k_list = list(Counter(kppv_list[0]).items())
        k_list.sort(key=lambda a: a[1], reverse=True)
        if k_list[0][0] == line[0]:
            count_top_1 = count_top_1 + 1
            count_top_2 = count_top_2 + 1
        else:
            if len(k_list) >= 2:
                if k_list[1][0] == line[0]:
                    count_top_2 = count_top_2 + 1
        
    #     # Conf matrix
        row_num = int(line[0]) - 1
        col_num = int(k_list[0][0]) -1
        conf_matrix[row_num, col_num] = conf_matrix[row_num, col_num] + 1
    conf_matrix = transform_matrix_to_df(conf_matrix, classes_num)
    print_decision_model_result(len(app_data), len(dec_data), count_top_1/len(dec_data), count_top_2/len(dec_data), conf_matrix=conf_matrix)


In [28]:
print("First Data TP1")
start_time = time.time()
best_k = get_k_cross_validation(split_data_tp1_app, 15, 5)
compute_kppv(split_data_tp1_app, split_data_tp1_dec, best_k[0])
print("\tThe best k used is {} with a error rate = {}".format(best_k[0], best_k[1]))
print("\tExecution time : %s seconds\n" % (time.time() - start_time))
print("==========================\n")

print("First Data TP2")
start_time = time.time()
best_k = get_k_cross_validation(split_data_tp2_app, 15, 5)
compute_kppv(split_data_tp2_app, split_data_tp2_dec, best_k[0])
print("\tThe best k used is {} with a error rate = {}".format(best_k[0], best_k[1]))
print("\tExecution time : %s seconds\n" % (time.time() - start_time))
print("==========================\n")

print("First Data TP3")
start_time = time.time()
best_k = get_k_cross_validation(split_data_tp3_app, 15, 5)
compute_kppv(split_data_tp3_app, split_data_tp3_dec, best_k[0])
print("\tThe best k used is {} with a error rate = {}".format(best_k[0], best_k[1]))
print("\tExecution time : %s seconds\n" % (time.time() - start_time))
print("==========================\n")

First Data TP1
	Results :
	----------------
	Method used : Euclidian distance
	Number of elements for the learning step :  500
	Number of elements for the decision step :  500
	----------------

	Top results :
	----------------
	Top 1 rate :  0.65
	Top 2 rate :  0.978
	----------------

	Confusion matrix :
	----------------
╒════╤═════╤═════╤═════╤═════╤═════╕
│    │   1 │   2 │   3 │   4 │   5 │
╞════╪═════╪═════╪═════╪═════╪═════╡
│  1 │  49 │  50 │   0 │   0 │   1 │
├────┼─────┼─────┼─────┼─────┼─────┤
│  2 │  44 │  52 │   0 │   0 │   4 │
├────┼─────┼─────┼─────┼─────┼─────┤
│  3 │   0 │   0 │  76 │  24 │   0 │
├────┼─────┼─────┼─────┼─────┼─────┤
│  4 │   0 │   0 │  43 │  53 │   4 │
├────┼─────┼─────┼─────┼─────┼─────┤
│  5 │   1 │   0 │   0 │   4 │  95 │
╘════╧═════╧═════╧═════╧═════╧═════╛
	----------------
	The best k used is 15 with a error rate = 0.63
	Execution time : 8.872598648071289 seconds


First Data TP2
	Results :
	----------------
	Method used : Euclidian distance
	Nu

## Parzen

In [8]:
def get_dict_uniform_value(point, data, h):
    value = 0;
    return value

def compute_uniform_parzen(app_data, dec_data, h):
    for line in dec_data:
        get_dict_uniform_value(line[1:], app_data, h)
    print("OK")

compute_uniform_parzen(split_data_tp1_app, split_data_tp1_dec, 5)

OK
