#  Semi-Supervised Learning with Support Vector Machine (SVM)


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import datasets
import matplotlib.pyplot as plt
# pd.set_option('display.max_rows', None)

## Preparing the dataset

In [2]:
# Load the wine classification data; the result exposes the feature matrix (.data),
# the class targets (.target) and the column names (.feature_names) used below.
dataset = datasets.load_wine()

In [28]:
# Build a DataFrame with one column per feature. `feature_names` is passed as-is:
# wrapping it in an extra list would make pandas build MultiIndex columns.
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['label'] = dataset.target
# Shuffle the rows (sample() returns a new frame, it does not work in place) and
# renumber the index from 0. The fixed seed keeps the positional labeled/unlabeled
# split reproducible across kernel restarts.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)
df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,label
0,14.06,1.63,2.28,16.0,126.0,3.00,3.17,0.24,2.10,5.65,1.09,3.71,780.0,0
1,12.67,0.98,2.24,18.0,99.0,2.20,1.94,0.30,1.46,2.62,1.23,3.16,450.0,1
2,12.22,1.29,1.94,19.0,92.0,2.36,2.04,0.39,2.08,2.70,0.86,3.02,312.0,1
3,12.36,3.83,2.38,21.0,88.0,2.30,0.92,0.50,1.04,7.65,0.56,1.58,520.0,2
4,13.48,1.81,2.41,20.5,100.0,2.70,2.98,0.26,1.86,5.10,1.04,3.47,920.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,12.08,1.83,2.32,18.5,81.0,1.60,1.50,0.52,1.64,2.40,1.08,2.27,480.0,1
174,12.08,1.33,2.30,23.6,70.0,2.20,1.59,0.42,1.38,1.74,1.07,3.21,625.0,1
175,11.56,2.05,3.23,28.5,119.0,3.18,5.08,0.47,1.87,6.00,0.93,3.69,465.0,1
176,12.00,1.51,2.42,22.0,86.0,1.45,1.25,0.50,1.63,3.60,1.05,2.65,450.0,1


### Create labeled dataset

In [29]:
# Treat the first 89 shuffled rows as the labeled pool:
# the first 13 columns are the features, the last column is the class label.
labeled_features = df.iloc[:89, :13]
X = labeled_features.values
y = df.iloc[:89, -1].values
labeled_features

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.06,1.63,2.28,16.0,126.0,3.00,3.17,0.24,2.10,5.65,1.09,3.71,780.0
1,12.67,0.98,2.24,18.0,99.0,2.20,1.94,0.30,1.46,2.62,1.23,3.16,450.0
2,12.22,1.29,1.94,19.0,92.0,2.36,2.04,0.39,2.08,2.70,0.86,3.02,312.0
3,12.36,3.83,2.38,21.0,88.0,2.30,0.92,0.50,1.04,7.65,0.56,1.58,520.0
4,13.48,1.81,2.41,20.5,100.0,2.70,2.98,0.26,1.86,5.10,1.04,3.47,920.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,14.39,1.87,2.45,14.6,96.0,2.50,2.52,0.30,1.98,5.25,1.02,3.58,1290.0
85,13.94,1.73,2.27,17.4,108.0,2.88,3.54,0.32,2.08,8.90,1.12,3.10,1260.0
86,13.05,3.86,2.32,22.5,85.0,1.65,1.59,0.61,1.62,4.80,0.84,2.01,515.0
87,11.62,1.99,2.28,18.0,98.0,3.02,2.26,0.17,1.35,3.25,1.16,2.96,345.0


In [30]:
# Hold out 60% of the labeled rows for evaluation; the rest seeds the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.6,
    random_state=1,
)

In [31]:
# (rows, columns) of the labeled training split: one row per sample, one column per feature
X_train.shape

(35, 13)

In [32]:
# (rows, columns) of the held-out labeled test split used for scoring
X_test.shape

(54, 13)

### Create unlabeled dataset

In [33]:
# Everything from row 89 onward is the unlabeled pool: keep only the 13 feature
# columns (the label column is left out) and renumber the index from 0.
X_unl_df = df.iloc[89:, :13].reset_index(drop=True)
# Plain array form of the pool, as passed to the classifier
X_unl = X_unl_df.to_numpy()
X_unl_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,13.08,3.90,2.36,21.5,113.0,1.41,1.39,0.34,1.14,9.40,0.57,1.33,550.0
1,11.96,1.09,2.30,21.0,101.0,3.38,2.14,0.13,1.65,3.21,0.99,3.13,886.0
2,12.17,1.45,2.53,19.0,104.0,1.89,1.75,0.45,1.03,2.95,1.45,2.23,355.0
3,13.63,1.81,2.70,17.2,112.0,2.85,2.91,0.30,1.46,7.30,1.28,2.88,1310.0
4,13.73,4.36,2.26,22.5,88.0,1.28,0.47,0.52,1.15,6.62,0.78,1.75,520.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,12.08,1.83,2.32,18.5,81.0,1.60,1.50,0.52,1.64,2.40,1.08,2.27,480.0
85,12.08,1.33,2.30,23.6,70.0,2.20,1.59,0.42,1.38,1.74,1.07,3.21,625.0
86,11.56,2.05,3.23,28.5,119.0,3.18,5.08,0.47,1.87,6.00,0.93,3.69,465.0
87,12.00,1.51,2.42,22.0,86.0,1.45,1.25,0.50,1.63,3.60,1.05,2.65,450.0


## 1. Training on the labeled dataset

In [34]:
# Baseline: linear-kernel SVC fitted on the labeled training split only.
# probability=True enables predict_proba; random_state pins the internal shuffling
# behind those probability estimates, so the confidence values (and therefore the
# pseudo-labels selected from them) are the same on every run.
clf = svm.SVC(kernel='linear', probability=True, C=1.0, random_state=1).fit(X_train, y_train)
# Mean accuracy on the held-out labeled test split
clf.score(X_test, y_test)

0.8888888888888888

## 2. Make a prediction using the unlabeled dataset (X_unl)

In [35]:
# Class-membership probabilities for every unlabeled sample (one column per class)
clp = clf.predict_proba(X_unl)
clf_prob = pd.DataFrame(clp, columns=['class1', 'class2', 'class3'])
# Confidence of each sample = its highest class probability
clf_prob["max"] = clf_prob.max(axis=1)
# Pseudo-label = the class that attains that highest probability.
# It is derived from the probabilities rather than from clf.predict(), because for
# SVC the two can disagree; this keeps "lab" and "max" referring to the same class.
lab = clf.classes_[clp.argmax(axis=1)]
clf_prob["lab"] = lab
clf_prob

Unnamed: 0,class1,class2,class3,max,lab
0,0.072278,0.071348,0.856373,0.856373,2
1,0.227621,0.650834,0.121545,0.650834,1
2,0.066752,0.819318,0.113930,0.819318,1
3,0.740845,0.142378,0.116777,0.740845,0
4,0.053969,0.179610,0.766421,0.766421,2
...,...,...,...,...,...
84,0.092743,0.809754,0.097502,0.809754,1
85,0.070210,0.853388,0.076402,0.853388,1
86,0.046057,0.621623,0.332320,0.621623,1
87,0.056994,0.782603,0.160402,0.782603,1


## 3. Choose the samples in X_unl with high confidence and add them into the labeled dataset 

In [36]:
# Minimum top-class probability for a prediction to count as confident
th = 0.7
# Show only the predictions that clear the threshold
clf_prob.loc[clf_prob["max"] > th]

Unnamed: 0,class1,class2,class3,max,lab
0,0.072278,0.071348,0.856373,0.856373,2
1,0.227621,0.650834,0.121545,0.650834,1
2,0.066752,0.819318,0.113930,0.819318,1
3,0.740845,0.142378,0.116777,0.740845,0
4,0.053969,0.179610,0.766421,0.766421,2
...,...,...,...,...,...
83,0.127260,0.792069,0.080671,0.792069,1
84,0.092743,0.809754,0.097502,0.809754,1
85,0.070210,0.853388,0.076402,0.853388,1
86,0.046057,0.621623,0.332320,0.621623,1


In [37]:
# Promote the confidently predicted samples to training data, using the predicted
# class as their label. X_train / y_train are replaced by the extended arrays, so
# re-running this cell would append the same samples again.
confident = clf_prob["max"] > th
unl_size = len(X_unl[confident])  # number of samples promoted in this round
X_train_new = np.concatenate((X_train, X_unl[confident]), axis=0)
y_train_new = np.concatenate((y_train, clf_prob.loc[confident, 'lab'].values), axis=0)

X_train, y_train = X_train_new, y_train_new

In [38]:
# Keep only the samples that were NOT promoted to the training set and renumber
# the index from 0 so it lines up with the next round of predictions.
X_unl_df = X_unl_df.loc[~(clf_prob["max"] > th)].reset_index(drop=True)
# Refresh the array form of the unlabeled pool
X_unl = X_unl_df.to_numpy()

## 4. Repeat

In [39]:
# Test accuracy of the classifier fitted in each self-training round
score_ls = []
# Keep going until the unlabeled pool is exhausted or the previous round found
# no prediction confident enough to promote.
while len(X_unl) != 0 and unl_size != 0:
    # Step 1: refit on the labeled + pseudo-labeled data and record test accuracy.
    # random_state pins the internal shuffling behind predict_proba so that the
    # confidence-based selection below is reproducible.
    clf = svm.SVC(kernel='linear', probability=True, C=1, random_state=1).fit(X_train, y_train)
    score = clf.score(X_test, y_test)  # scored once, reused for history and log
    score_ls.append(score)
    print ('Accuracy: ', score)

    # Step 2: class probabilities for the remaining unlabeled samples
    clp = clf.predict_proba(X_unl)
    clf_prob = pd.DataFrame(clp, columns=['class1', 'class2', 'class3'])
    clf_prob["max"] = clf_prob.max(axis=1)
    # Pseudo-label = class with the highest probability, so the label and the
    # confidence always refer to the same class (clf.predict() may disagree
    # with predict_proba() for SVC).
    lab = clf.classes_[clp.argmax(axis=1)]
    clf_prob["lab"] = lab

    # Step 3: promote the confident samples to the training set ...
    confident = (clf_prob["max"] > th).to_numpy()
    unl_size = int(confident.sum())  # 0 here ends the loop at the next check
    X_train = np.append(X_train, X_unl[confident], axis=0)
    y_train = np.append(y_train, lab[confident], axis=0)

    # ... and remove them from the unlabeled pool, renumbering the index from 0
    X_unl_df = X_unl_df[~confident].reset_index(drop=True)
    X_unl = X_unl_df.values

Accuracy:  0.8888888888888888
Accuracy:  0.9074074074074074
Accuracy:  0.9074074074074074
