In [1]:
from numpy.linalg import norm
import numpy as np
from scipy import sparse
from matplotlib import pyplot as plt
from scipy import optimize
from scipy.sparse import csr_matrix
from scipy.optimize import minimize
from sklearn.datasets import load_iris
from sklearn.datasets import load_svmlight_file
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.validation import check_is_fitted, check_array, check_X_y, check_random_state
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_mldata

from LMNN import LargeMarginNearestNeighbor
from LMNN_SS import SemiSupervisedLargeMarginNearestNeighbor

import time

# Metric Learning

**Group Members: Oskar Hint , Xiaoshen Hou,  Kasper Schønberg,  Valentine Van Der Nya**

***
## 0.  Experiment 0 - Iris data


In [None]:
# data
dataset = load_iris()
X, y = dataset.data, dataset.target

# Hold out about two thirds of the samples for testing, then carve the
# remaining training rows into a labeled and an unlabeled pool.
# For the 150-sample Iris set, test_size=0.667 leaves 49 training rows
# (101 test rows), so the slices below give:
#   labeled   -> training rows 30..48 (19 points, labels kept)
#   unlabeled -> training rows 0..19  (20 points, labels not used)
# Training rows 20..29 end up in neither pool.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.667, random_state=42)
# .copy() keeps the pools independent of X_train / y_train rather than views into them.
X_labeled = X_train[30:].copy()
y_labeled = y_train[30:].copy()
X_unlabeled = X_train[:20].copy()

In [None]:
# Compare supervised LMNN, semi-supervised LMNN and plain k-NN on the Iris
# split for every neighbourhood size k = 1..10.
K = list(range(1, 11))
for k in K:
    print("########  K = ",k, " ###########")

    # Supervised LMNN: the timer covers construction and fit only; scoring
    # happens after the clock has been stopped.
    fit_start = time.time()
    lmnn = LargeMarginNearestNeighbor(n_neighbors=k)
    lmnn.fit(X_labeled, y_labeled)
    lmnn_fit_seconds = time.time() - fit_start
    print("Iris LMNN : ", lmnn.score(X_test, y_test), " Time(s) :", lmnn_fit_seconds)

    # Semi-supervised LMNN: same timing scheme, with the unlabeled pool
    # passed in as the second argument to fit.
    fit_start = time.time()
    ssc = SemiSupervisedLargeMarginNearestNeighbor(n_neighbors=k)
    ssc.fit(X_labeled, X_unlabeled, y_labeled)
    ssc_fit_seconds = time.time() - fit_start
    print("Iris SSC : ", ssc.score(X_test, y_test), " Time(s) :", ssc_fit_seconds)

    # Baseline: k-NN on the labeled points (not timed).
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_labeled, y_labeled)
    print("Iris KNN : ", neigh.score(X_test, y_test))

***
## 1.  Experiment 1 - USPS data

In [2]:
def get_USPS_training_data():
    """Load the USPS training split from its svmlight file.

    Reads 'ml_data/usps/usps' with `load_svmlight_file` and returns the
    first two items of the result as a tuple (used by the caller as the
    feature matrix and the label vector).
    """
    loaded = load_svmlight_file('ml_data/usps/usps')
    features, labels = loaded[0], loaded[1]
    return features, labels

def get_USPS_testing_data():
    """Load the USPS test split from its svmlight file.

    Reads 'ml_data/usps/usps.t' with `load_svmlight_file` and returns the
    first two items of the result as a tuple (used by the caller as the
    feature matrix and the label vector).
    """
    loaded = load_svmlight_file('ml_data/usps/usps.t')
    features, labels = loaded[0], loaded[1]
    return features, labels

# Read both USPS splits from disk: the training file first, then the test file.
(X_train, y_train), (X_test, y_test) = (
    get_USPS_training_data(),
    get_USPS_testing_data(),
)

In [4]:
# Compare supervised LMNN, semi-supervised LMNN and plain k-NN on USPS for
# k = 1..10, using the first 100 training rows as the labeled set.
K = list(range(1, 11))

# Densify every slice once, before the loop: none of the conversions depends
# on k, so repeating them per iteration (three times for the test set alone)
# is wasted work. .toarray() gives a plain ndarray, whereas .todense() gives
# an np.matrix, which recent scikit-learn estimators refuse as input.
usps_X_labeled = X_train[:100].toarray()
usps_y_labeled = y_train[:100]
# NOTE(review): the unlabeled pool starts at row 101, so row 100 lands in
# neither set. Left unchanged so a re-run stays comparable with the recorded
# results; confirm whether 100:600 was intended.
usps_X_unlabeled = X_train[101:600].toarray()
usps_X_test = X_test.toarray()

for k in K:
    print("########  K = ",k, " ###########")

    # Supervised LMNN on the labeled rows only.
    lmnn = LargeMarginNearestNeighbor(n_neighbors=k)
    lmnn.fit(usps_X_labeled, usps_y_labeled)
    print("USPS LMNN : ", lmnn.score(usps_X_test, y_test))

    # Semi-supervised LMNN: labeled rows plus the unlabeled pool.
    ssc = SemiSupervisedLargeMarginNearestNeighbor(n_neighbors=k)
    ssc.fit(usps_X_labeled, usps_X_unlabeled, usps_y_labeled)
    print("USPS SSC : ", ssc.score(usps_X_test, y_test))

    # Baseline: k-NN on the labeled rows.
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(usps_X_labeled, usps_y_labeled)
    print("USPS KNN : ",  neigh.score(usps_X_test, y_test))

########  K =  1  ###########
USPS LMNN :  0.762331838565
USPS SSC :  0.762331838565
USPS KNN :  0.762331838565
########  K =  2  ###########
USPS LMNN :  0.70004982561
USPS SSC :  0.726955655207
USPS KNN :  0.712506228201
########  K =  3  ###########
USPS LMNN :  0.71649227703
USPS SSC :  0.719481813652
USPS KNN :  0.69456900847
########  K =  4  ###########
USPS LMNN :  0.660189337319
USPS SSC :  0.689586447434
USPS KNN :  0.68360737419
########  K =  5  ###########
USPS LMNN :  0.629297458894
USPS SSC :  0.666168410563
USPS KNN :  0.686098654709
########  K =  6  ###########
USPS LMNN :  0.607872446437
USPS SSC :  0.66168410563
USPS KNN :  0.675635276532
########  K =  7  ###########
USPS LMNN :  0.588938714499
USPS SSC :  0.62879920279
USPS KNN :  0.668659691081
########  K =  8  ###########
USPS LMNN :  0.58495266567
USPS SSC :  0.596910812157
USPS KNN :  0.651718983558
########  K =  9  ###########
USPS LMNN :  0.581464872945
USPS SSC :  0.606377678127
USPS KNN :  0.65620328849


***
## 2.  Experiment 2 - MNIST data

In [None]:
def get_MNIST_training_data():
    """Load the MNIST training split from its svmlight file.

    Reads 'ml_data/mnist/mnist' with `load_svmlight_file` and returns the
    first two items of the result as a tuple (used by the caller as the
    feature matrix and the label vector).
    """
    loaded = load_svmlight_file('ml_data/mnist/mnist')
    features, labels = loaded[0], loaded[1]
    return features, labels

def get_MNIST_testing_data():
    """Load the MNIST test split from its svmlight file.

    Reads 'ml_data/mnist/mnist.t' with `load_svmlight_file` and returns the
    first two items of the result as a tuple (used by the caller as the
    feature matrix and the label vector).
    """
    loaded = load_svmlight_file('ml_data/mnist/mnist.t')
    features, labels = loaded[0], loaded[1]
    return features, labels

# Read both MNIST splits from disk: the training file first, then the test file.
(X_train, y_train), (X_test, y_test) = (
    get_MNIST_training_data(),
    get_MNIST_testing_data(),
)

In [None]:
# Download MNIST ('MNIST original'), using ml_data/mnist as the data home.
# NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and removed
# in 0.22, so this cell needs an older scikit-learn (or a port to
# fetch_openml) to run.
mnist = fetch_mldata(
    'MNIST original',
    data_home='ml_data/mnist',
)

In [None]:
# Split the downloaded MNIST data, keeping only 1% for training.
# Note that this rebinds X_train / X_test / y_train / y_test, replacing the
# values loaded from the svmlight files above.
X = mnist.data
y = mnist.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.99,
    random_state=42,
)

In [None]:
def _to_dense(data):
    """Return `data` as a plain ndarray, whether it is scipy-sparse or already dense."""
    if sparse.issparse(data):
        return data.toarray()
    return np.asarray(data)


# Compare supervised LMNN, semi-supervised LMNN and plain k-NN on MNIST for
# k = 1..10, using the first 200 training rows as the labeled set.
K = list(range(1, 11))

# X_train / X_test can come from either loader cell above: sparse from the
# svmlight files, or (presumably) dense ndarrays from the mnist.data split.
# An ndarray has no .todense(), so the SSC scoring call used to fail on the
# dense path while the LMNN and KNN calls passed X_test through untouched.
# Normalising once here makes all three estimators see the same dense inputs
# and keeps the k-independent conversions out of the loop.
mnist_X_labeled = _to_dense(X_train[:200])
mnist_y_labeled = y_train[:200]
# NOTE(review): the unlabeled pool starts at row 201, so row 200 lands in
# neither set; confirm whether 200:600 was intended.
mnist_X_unlabeled = _to_dense(X_train[201:600])
mnist_X_test = _to_dense(X_test)

for k in K:
    print("########  K = ",k, " ###########")

    # Supervised LMNN on the labeled rows only.
    lmnn = LargeMarginNearestNeighbor(n_neighbors=k)
    lmnn.fit(mnist_X_labeled, mnist_y_labeled)
    print("MNIST LMNN : ", lmnn.score(mnist_X_test, y_test))

    # Semi-supervised LMNN: labeled rows plus the unlabeled pool.
    ssc = SemiSupervisedLargeMarginNearestNeighbor(n_neighbors=k)
    ssc.fit(mnist_X_labeled, mnist_X_unlabeled, mnist_y_labeled)
    print("MNIST SSC : ", ssc.score(mnist_X_test, y_test))

    # Baseline: k-NN on the labeled rows.
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(mnist_X_labeled, mnist_y_labeled)
    print("MNIST KNN : ", neigh.score(mnist_X_test, y_test))

## 3. Experiment - Semi-supervised Iris

In [None]:
dataset = load_iris()
X = dataset.data
y = dataset.target

# Half of the data is held out for testing; the training half is divided
# into an unlabeled pool (its first 50 rows) and a labeled pool (the rest).
# For the 150-sample Iris set: 25 labeled, 50 unlabeled, 75 test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)
# Copies (not views), so the pools stay independent of X_train / y_train.
X_unlabeled = X_train[:50].copy()
X_labeled = X_train[50:].copy()
y_labeled = y_train[50:].copy()

In [None]:
# Compare semi-supervised LMNN (25 labeled + 50 unlabeled points) with a
# plain k-NN classifier on the Iris test half, for k = 1..10.
K = list(range(1, 11))
for k in K:
    print("########  K = ",k, " ###########")

    # Only construction and fit are timed; scoring happens after time2.
    time1 = time.time()
    ssc = SemiSupervisedLargeMarginNearestNeighbor(n_neighbors=k)
    ssc.fit(X_labeled, X_unlabeled, y_labeled)
    time2 = time.time()
    # score() is called as score(X, y) everywhere else in this notebook; the
    # previous call passed (y_test, predictions), i.e. labels in the feature
    # slot and predictions in the label slot.
    print("Iris SSC : ", ssc.score(X_test, y_test), " Time(s) :", (time2-time1))

    # NOTE(review): this baseline is fitted on the whole training half with
    # all of its labels (X_train, y_train), not just on X_labeled / y_labeled,
    # so it sees more labels than the SSC model. Confirm this is the intended
    # comparison.
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train, y_train)
    print("Iris KNN : ", neigh.score(X_test, y_test))