# Devoir 1

## Initialisations
### Appel des modules

In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import sklearn.metrics as metrics


### Enregistrement des données

In [2]:
# Intensities strictly above this value become 1, everything else becomes 0.
BINARY_THRESHOLD = 250


def f(x):
    """Binarise a single value: 1 if it is above BINARY_THRESHOLD, else 0."""
    return 1 if x > BINARY_THRESHOLD else 0


def normalize(values):
    """Binarise an array element-wise with the same rule as ``f``.

    A single vectorised comparison replaces ``np.vectorize(f)``, which called
    the Python function once per element and refused size-0 inputs.

    values: array-like of numbers.
    Returns an integer array of 0/1 with the same shape as ``values``.
    """
    return (np.asarray(values) > BINARY_THRESHOLD).astype(int)
def _load_binarized(csv_path):
    """Read a comma-separated file (first row skipped) and binarise every value."""
    raw = np.loadtxt(csv_path, delimiter=",", skiprows=1)
    # NOTE(review): every column is thresholded, including column 0, which the
    # rest of the notebook slices away with [:, 1:] — presumably the label
    # column; confirm that losing its values here is intended.
    return normalize(raw)


mnist_train = _load_binarized("mnist_train.csv")
mnist_test = _load_binarized("mnist_test.csv")

In [3]:
def dct(x):
    """Unnormalised DCT-II applied to every row of ``x``.

    For a row ``v`` of length N the k-th output is
    ``2 * sum_n v[n] * cos(pi / N * (n + 1/2) * k)``.

    x: 2-D array of shape (rows, N).
    Returns an array of shape (rows, N) holding the coefficients of each row.
    """
    n_cols = x.shape[1]
    sample_positions = np.arange(n_cols) + 1.0/2.0   # n + 1/2
    frequencies = np.arange(n_cols)                  # k
    basis = np.cos(np.pi/n_cols * np.outer(sample_positions, frequencies))
    return 2 * np.dot(x, basis)

def dctI(x):
    """Invert ``dct`` row by row and return the absolute value of the result.

    For a row ``c`` of N coefficients the n-th output is
    ``abs((c[0] / 2 + sum_{k>=1} c[k] * cos(pi / N * k * (n + 1/2))) / N)``.
    The absolute value means the round trip only restores non-negative inputs
    exactly.

    x: 2-D array of shape (rows, N) holding DCT coefficients.
    Returns an array of shape (rows, N).
    """
    N = x.shape[1]
    cos_mat = np.cos(np.pi/N * np.outer(np.arange(1, N), np.arange(N) + 1.0/2.0))
    # Keep the DC term as a (rows, 1) column so it is added to every sample of
    # its own row. A 1-D x[:, 0] would broadcast along the sample axis, which
    # raises for most batch sizes and mixes rows up when rows == N.
    dc_term = x[:, :1]/2
    return np.abs((dc_term + np.dot(x[:, 1:], cos_mat))/N)

# Transform only the first 1000 rows, leaving out column 0 (not a pixel column).
train_pixels = mnist_train[:1000, 1:]
mnist_train_dct = dct(train_pixels)
print(mnist_train_dct)

[[ 1.14000000e+02 -6.61390948e+00 -1.62802840e+01 ... -7.09799957e+00
  -2.02923118e+00 -3.15076570e+00]
 [ 1.32000000e+02  1.51991062e+00 -4.49430158e+01 ... -6.02863826e+00
   1.65296472e+01  1.41162529e+00]
 [ 4.60000000e+01 -7.94559430e+00 -2.53291427e+01 ...  2.40202316e+00
  -7.91897467e+00  2.37525272e+01]
 ...
 [ 1.24000000e+02  9.51190366e-01 -5.68239454e+01 ...  9.63703024e+00
   9.82688116e-01  1.83143294e+00]
 [ 7.00000000e+01  3.19341540e+00 -1.73243318e+01 ...  4.71515550e+00
  -6.13635909e+00 -3.04796333e+00]
 [ 9.20000000e+01 -4.19542887e+00 -4.07596663e+01 ...  1.84023781e+00
  -9.77086555e-01 -7.74095237e-02]]


## Distances

In [4]:
def distance_Ln(x1, x2, n=2):
    """Row-wise L_n (Minkowski) distance between ``x1`` and ``x2``.

    x1, x2: arrays that broadcast against each other to a 2-D result, e.g. a
        (rows, dim) matrix and a single (dim,) vector.
    n: order of the norm (2 gives the Euclidean distance).
    Returns a 1-D array with one distance per row.
    """
    # Take the absolute difference before raising to the n-th power: for odd n
    # signed differences would cancel out (or give a negative sum whose
    # fractional power is NaN).
    return np.sum(np.abs(x2 - x1)**n, axis=1)**(1.0/n)



## Algorithmes

### K-moyennes

In [5]:
class K_mean:
    """Minimal k-means clustering.

    k: number of clusters.
    random_seed: value passed to ``np.random.seed`` at the start of ``train``.
    distance: callable ``distance(datas, centroid)`` returning one distance per
        row of ``datas``; the nearest centroid is the one with the smallest value.
    """

    def __init__(self, k, random_seed=0, distance=distance_Ln):
        self._k = k
        self._similarity = distance
        self._random_seed = random_seed

    def _get_closest_centroid(self, datas):
        """Return, for every row of ``datas``, the index of its nearest centroid."""
        distances = np.zeros((len(datas), self._k))
        for i, centroid in enumerate(self._centroids):
            distances[:, i] = self._similarity(datas, centroid)

        return np.argmin(distances, axis=1)

    def _initialize(self, datas):
        """Set the centroids to ``k`` rows of ``datas`` drawn at random.

        Rows are drawn independently (with replacement) from the global numpy
        generator, one ``randint`` call per centroid.
        """
        n = len(datas)
        dim = datas.shape[1]
        self._centroids = np.zeros((self._k, dim))
        for i in range(self._k):
            self._centroids[i] = datas[np.random.randint(n)]

    def train(self, train_set):
        """Fit the centroids on ``train_set`` and return ``self``.

        train_set: 2-D array of shape (samples, dim).
        Iterates assignment and mean update until the centroids stop changing.
        """
        np.random.seed(self._random_seed)
        self._initialize(train_set)

        groups = self._get_closest_centroid(train_set)

        while True:
            # Allocate a fresh buffer on every pass. Reusing one array would
            # make it the same object as self._centroids after the first
            # assignment below, so the convergence test would always succeed
            # on the second pass.
            new_centroids = np.zeros_like(self._centroids)

            for i in range(self._k):
                cent_group = train_set[groups == i]
                if len(cent_group) > 0:
                    new_centroids[i] = np.average(cent_group, axis=0)
                else:
                    # An empty cluster keeps its previous centroid.
                    new_centroids[i] = self._centroids[i]

            if np.array_equal(new_centroids, self._centroids):
                return self

            self._centroids = new_centroids
            groups = self._get_closest_centroid(train_set)

    def predict(self, test_set):
        """Return the index of the nearest centroid for every row of ``test_set``."""
        return self._get_closest_centroid(test_set)
          

# Fit scikit-learn's KMeans on the DCT features with a fixed seed.
# The hand-written alternative is K_mean(k, random_seed=random_state).train(mnist_train_dct);
# it stores its centres in `_centroids`, whereas the cells below read `cluster_centers_`.
k = 10
random_state = 255
km = KMeans(n_clusters=k, random_state=random_state).fit(mnist_train_dct)

In [6]:
from PIL import Image

def printImage(datas, size=(28, 28)):
    """Show a flat sequence of intensities as a greyscale image.

    datas: iterable of numbers; each one is multiplied by 255 and truncated to
        an integer, so values in [0, 1] span the full grey range.
    size: (width, height) of the image; ``datas`` must hold width * height values.
    """
    # Clamp on both sides: bytes() rejects any value outside range(256), so a
    # slightly negative input would otherwise raise instead of showing black.
    pixels = [max(0, min(255, int(255 * d))) for d in datas]
    colors = bytes(pixels)

    im = Image.frombytes("L", size, colors)
    im.show()
    

# Map every cluster centre from DCT space back to pixel space.
# Display is switched off; to view a centre, call
# printImage(inv_cent.reshape(inv_cent.shape[1])) inside the loop.
for cent in km.cluster_centers_:
    inv_cent = dctI(cent.reshape(1, -1))

#### Évaluation

In [7]:
# Silhouette score of the k-means assignment on (at most) the first 1000 DCT rows.
kmeans_eval_set = mnist_train_dct[:1000]
predictions = km.predict(kmeans_eval_set)
metrics.silhouette_score(kmeans_eval_set, predictions)

0.03813499847033897

### Partition binaire

In [8]:
# Agglomerative clustering of the DCT features into 10 groups.
clustering = AgglomerativeClustering(n_clusters=10).fit(mnist_train_dct)

# Mean DCT vector of the samples assigned to cluster 0. It is not used further
# here; pass it through dctI and printImage to look at it.
args = np.flatnonzero(clustering.labels_ == 0)
average = mnist_train_dct[args].mean(axis=0)

# Score the assignment with the same metric as the k-means cell.
predictions = clustering.labels_
print(metrics.silhouette_score(mnist_train_dct[:1000], predictions))

-0.0009470541656144587


 # KNN

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Load the training split: column 0 is used as the target, the remaining
# columns as features.
mnist = pd.read_csv("mnist_train.csv").values
mnist_x, mnist_y = mnist[:,1:], mnist[:,0]
print(mnist_x.shape, mnist_y.shape)

# One-hot encode the targets.
# NOTE(review): KNeighborsClassifier also accepts the raw label column; with a
# one-hot target the problem is treated as multi-label — confirm this is wanted.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output`; adjust the
# keyword if the notebook is run on a recent release.
ohe = OneHotEncoder(sparse=False)
mnist_y = ohe.fit_transform(mnist_y.reshape(-1,1))

# Preview the first nfigs**2 samples as 28x28 images (visualisation only).
nfigs = 5
fig = plt.figure(figsize=(10,10))
for i in range(nfigs **2):
    ax = fig.add_subplot(nfigs, nfigs, i+1)
    ax.imshow(mnist_x[i].reshape(28,28))

# Cross-validated search over the KNN hyper-parameters.
# Note to self: Maybe I should remove this part, and evaluate with k manually
# NOTE(review): (4, 21) tries exactly two values, k=4 and k=21; use
# range(4, 21) if a sweep over every k in between was intended.
grid_params = {"n_neighbors": (4,21), "weights":["uniform","distance"]}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, grid_params, verbose=2, n_jobs=1)
grid_search.fit(mnist_x, mnist_y)

# Predict the test split. The path has to be a quoted string (the bare
# `mnist_train.csv` was an attribute lookup on the array loaded earlier), and
# column 0 is dropped so the model receives the same 784 features it was
# trained on.
testing = pd.read_csv("mnist_test.csv")
preds = grid_search.predict(testing.values[:, 1:])



((60000, 784), (60000,))
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] n_neighbors=4, weights=uniform ..................................
[CV] ................... n_neighbors=4, weights=uniform, total=16.7min
[CV] n_neighbors=4, weights=uniform ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 49.6min remaining:    0.0s


[CV] ................... n_neighbors=4, weights=uniform, total=16.6min
[CV] n_neighbors=4, weights=uniform ..................................
[CV] ................. n_neighbors=4, weights=uniform, total=1798.0min
[CV] n_neighbors=4, weights=distance .................................
[CV] ................ n_neighbors=4, weights=distance, total=1336.2min
[CV] n_neighbors=4, weights=distance .................................
