# Machine Learning Project

Pia CHANCEREL - Raphael LASRY - Maxime POLI

Based on the article :

**A Continuation Method for Semi-Supervised SVMs**

Olivier Chapelle $\hspace{3.9cm}$ olivier.chapelle@tuebingen.mpg.de

Mingmin Chi $\hspace{4.5cm}$ mingmin.chi@tuebingen.mpg.de

Alexander Zien $\hspace{4.1cm}$ alexander.zien@tuebingen.mpg.de


Max Planck Institute for Biological Cybernetics, Tübingen, Germany

https://dl.acm.org/doi/pdf/10.1145/1143844.1143868?download=true

In [1]:
import numpy as np
import scipy as sc

# Data

20newsgroup dataset: https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html

Dataset of an old forum. We will only focus on messages related to windows and mac. The goal is to predict the subject of the message (windows or mac) thanks to a $S^3VM$ method.

In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import PolynomialFeatures

In [4]:
cat = ['comp.sys.mac.hardware', 'comp.windows.x']
newsgroups_train = fetch_20newsgroups(subset = 'train', categories = cat) 
newsgroups_test = fetch_20newsgroups(subset = 'test', categories = cat) 

In [5]:
print(newsgroups_train.DESCR) #Documentation of how to use the dataset

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features       

In [6]:
print(newsgroups_train.filenames.shape) #Text, content of the messages
print(newsgroups_train.target.shape) #In which categories the message should be classified

print(newsgroups_test.filenames.shape)
print(newsgroups_test.target.shape)

(1171,)
(1171,)
(780,)
(780,)


In [7]:
vectorizer = TfidfVectorizer()
vectors_train = vectorizer.fit_transform(newsgroups_train.data) #From a text message to a sparse matrix
vectors_test = vectorizer.fit_transform(newsgroups_test.data)
print(vectors_train.shape)
print(vectors_test.shape)

(1171, 20400)
(780, 22944)


In [8]:
print(vectors_train) #Sparse matrix. The first index is the index of the message and the second ones are the indexes whithin this matrix where the value isn't a 0.

  (0, 9381)	0.0833319202434346
  (0, 9537)	0.051488753790363266
  (0, 7097)	0.0716017121921997
  (0, 19198)	0.03825757019634386
  (0, 13923)	0.0612567924476602
  (0, 19520)	0.0612567924476602
  (0, 13751)	0.06779317109369336
  (0, 11389)	0.06258699323079087
  (0, 6526)	0.06005984630463308
  (0, 4094)	0.040173990576588194
  (0, 13321)	0.04790996430819856
  (0, 13196)	0.04854513021356325
  (0, 18501)	0.06235389442108352
  (0, 13920)	0.05395063736099927
  (0, 10224)	0.036000680480091043
  (0, 19648)	0.15444542259960156
  (0, 15199)	0.14639735648225874
  (0, 5916)	0.06893303130389038
  (0, 4700)	0.027787419148156887
  (0, 7025)	0.08531073486179562
  (0, 8529)	0.06490899824521898
  (0, 12368)	0.03951762719939179
  (0, 178)	0.07319867824112937
  (0, 4997)	0.0716017121921997
  (0, 4864)	0.059503491097782954
  :	:
  (1170, 17575)	0.21148205930503944
  (1170, 17563)	0.015929014052134184
  (1170, 13194)	0.04351509358870771
  (1170, 10309)	0.1045700958833079
  (1170, 17572)	0.15091995023585136
  

# S3VM

In [26]:
class S3VMClassifier():
    """
        S3VM class as defined in the article.
    """
    def __init__(self, C=1.0, C_=1.0, lambada=0.001, eta=0.01, s=3):
        self.C = C
        self.C_ = C_
        self.lambada = lambada
        self.eta = eta
        self.s = s
        self.X = None
        self.alpha = None
        self.w = None
        self.b = None
        
    def compute_gamma(self, epochs):
        """
            Compute the first value of gamma defined at the §3.2.
        """
        n = self.X.shape[0]
        #BUG ICI JE NE FAIS PAS LA BONNE OPERATION
        mat = np.sum(np.outer(self.X[n + 1:], self.X[n + 1:].T) / sc.sparse.linalg.norm(self.X[n + 1:], axis = 1) ** 3, axis = 1)
    
        values, _ = np.linalg.eig(mat)
        lamb_max = np.max(values)
        
        gamma_0 = (self.C_ * lamb_max) ** (2 / 3) / (2 * self.s) ** (1 / 3)
        gamma_end = 1 / (epochs * 2 * self.s * sc.sparse.linalg.norm(self.X) ** 2)
        
        return gamma_0, gamma_end
    
    def gradient_loss(self, gamma, Y):
        """
            Compute the gradient of the loss defined at the end of the §3.1.
        """
        n = slef.X.shape[0]
        a = 1 + 2 * self.gamma * self.s * sc.sparse.linalg.norm(self.X, axis = 1) ** 2
        e = (Y * (self.w.T.dot(self.X) + self.b) - 1) / (np.sqrt(2 * gamma) * sc.sparse.linalg.norm(self.X, axis = 1))
        var = self.w.T * X + self.b
        
        labelled = self.C / 2 * np.sum(sc.special.erfc(e[:n]) * Y[:n] * self.X[:n].T, axis = 1)
        unlabelled = self.C_ * np.sum(((2 * self.s) * var[n + 1:]) / (a ** (3 / 2)) * np.exp(- self.s * var[n + 1:] ** 2 / a) * X[n + 1:].T)
        
        return w.T - labelled - unelabelled
    
    def loss(self, gamma, Y, d):
        """
            Compute the convolved loss defined at the end of the §3.1.
        """
        n = self.X.shape[0]
        a = 1 + 2 * self.gamma * self.s * sc.sparse.linalg.norm(self.X, axis = 1) ** 2
        e = (Y * (self.w.T.dot(self.X) + self.b) - 1) / (np.sqrt(2 * gamma) * sc.sparse.linalg.norm(self.X, axis = 1))
        var = self.w.T * X + self.b
        
        labelled = self.C * np.sum(gamma * sc.sparse.linalg.norm(self.X[:n], axis = 1) / np.sqrt(2) 
                                   * (np.exp(- e[:n] ** 2) / np.sqrt(np.pi) - e[:n] * sc.special.erfc(e[:n])))
        unlabelled = self.C_ * np.sum(1 / np.sqrt(a[n + 1:] * np.exp(- s * var[n + 1:] ** 2 / a)))
        
        return 1 / 2 * self.w.T.dot(self.w) + 1 / 2 * gamma * d + labelled + unlabelled
    
    def fit(self, X, Y, epochs = 10, w = None, b = None):
        """
            Train the model on X and Y datas.
        """
        mat = PolynomialFeatures(1)
        X = mat.fit_transform(X)
        f = X.shape[1]
        n = X.shape[0]
        
        self.X = X
        gamma_0, gamma_end = self.compute_gamma(epochs)
        
        alpha, d = np.zeros((f, n)), np.zeros(f)
        m = 0
        
        if w is None: #Warm start
            self.w = np.zeros(f)
        if b is None: #Warm start
            self.b = 0

        for k in range(epochs):
            r = np.random.permutation(n)
            self.X = self.X[r[:], :]    
            Y = Y[r[:]]
            gamma = (gamma_end / gamma_0) ** (k / epochs) * gamma_0
            
            for i in range(n):
                m = min(n, i + 1 + k * n)
                d -= alpha[:, r[i]]
                alpha[:, r[i]] = self.gardient_loss(gamma, Y)
                d += alpha[:, r[i]]
                self.w -= self.eta * self.lambada * self.w + self.eta / m * d

        self.b = self.w[0]
        self.w = self.w[1:]

    def predict(self, X):
        """
            Predict the value of Y (0 or 1).
        """
        return np.array(X.dot(self.w) + self.b >= 0).astype(int)
    
    def accuracy(self, X, Y):
        """
            Evaluate the accuracy of the prediction.
        """
        Y_pred = self.predict(X)
        return np.mean(abs((Y + 1) / 2 == Y_pred))

In [27]:
s3vm = S3VMClassifier()
s3vm.fit(vectors_train, newsgroups_train.target)
print(s3vm.accuracy(vectors_test, newsgroups_test))

LinAlgError: 1-dimensional array given. Array must be at least two-dimensional