# Semi-Supervised Learning: 
## ---Label Propagation and Label Spreading (Experiments v1)---

### Step 1 - Import libraries

In [55]:
# Data manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation

# Visualization
#import plotly.express as px # for data visualization
#import matplotlib.pyplot as plt # for showing handwritten digits

# Skleran|
#from sklearn.datasets import load_digits # for MNIST data
#from sklearn.model_selection import train_test_split # for splitting data into train and test samples
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn.datasets import make_moons # for creating dummy data

# Semi-supervised
from sklearn.semi_supervised import LabelSpreading
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import SelfTrainingClassifier

# UMAP dimensionality reduction
#from umap import UMAP

# Other utilities
import sys
import os
import getpass
import chart_studio
import chart_studio.plotly as py

# Assign main directory to a variable
main_dir=os.path.dirname(sys.path[0])
#main_dir

In [5]:
import sklearn
sklearn.__version__

'1.0.1'

In [56]:
Xm, ym = make_moons(n_samples=100, shuffle=True, noise=None, random_state=42)

In [None]:
# Input Username and Password for Chart-Studio
print('Input Chart-Studio Username: ')
CS_user = input()
print('Input Chart-Studio API Key: ')
CS_api = getpass.getpass()

# Set Chart-Studio Credentials
chart_studio.tools.set_credentials_file(username=CS_user, api_key=CS_api)

### Step 2 - Get the weather data for our model (from Kaggle)
- Donwload from: https://www.kaggle.com/jsphyg/weather-dataset-rattle-package
- After downloading it, we read csv into a Pandas dataframe

In [34]:
# Set Pandas options to display more columns
pd.options.display.max_columns=50

# Read in the weather data csv
df=pd.read_csv('/Users/solclover/Documents/Python/SAS_to_Python/019_ML_CART_Decision_Trees/weatherAUS.csv', encoding='utf-8')

# Drop records where target RainTomorrow=NaN
df=df[pd.isnull(df['RainTomorrow'])==False]

# For other columns with missing values, fill them in with column mean
df=df.fillna(df.mean())

# Create a flag for RainToday and RainTomorrow, note RainTomorrowFlag will be our target variable
df['RainTodayFlag']=df['RainToday'].apply(lambda x: 1 if x=='Yes' else 0)
df['RainTomorrowFlag']=df['RainTomorrow'].apply(lambda x: 1 if x=='Yes' else 0)

# Create a smaller (10%) dataframe to use since the LS with rbf keeps dying)
df2=df.sample(frac=0.1, random_state=42)

# Randomely select 10% of observations to keep labels while others will be unlabeled
df2['Rand_selection'] = False
df2.loc[df2.sample(frac=0.1, random_state=42).index, 'Rand_selection'] = True
# Create a new target colum with labels, where 1 and 0 are original labels and -1 means unlabeled 
df2['New_target']=df2.apply(lambda x: x['RainTodayFlag'] if x['Rand_selection']==True else -1, axis=1)

# Show a snaphsot of data
df2

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,RainTodayFlag,RainTomorrowFlag,Rand_selection,New_target
57760,2016-06-09,Ballarat,7.1,13.0,8.8,5.469824,7.624853,N,41.000000,N,WNW,24.0,22.0,100.0,98.0,1001.700000,1005.400000,8.000000,8.000000,8.6,11.5,Yes,Yes,1,1,False,-1
127128,2009-10-24,Walpole,13.2,18.3,0.0,5.469824,7.624853,E,48.000000,ESE,ESE,24.0,20.0,73.0,73.0,1027.600000,1023.800000,4.437189,4.503167,14.2,17.0,No,No,0,0,False,-1
119994,2015-09-21,PerthAirport,9.2,22.7,0.0,5.000000,11.100000,ENE,52.000000,ENE,ESE,26.0,20.0,45.0,25.0,1030.100000,1025.900000,0.000000,0.000000,15.1,22.5,No,No,0,0,False,-1
7088,2011-12-06,Cobar,15.3,26.1,0.0,10.400000,7.624853,E,44.000000,E,NE,24.0,19.0,48.0,40.0,1013.200000,1009.800000,7.000000,7.000000,17.5,24.3,No,No,0,0,False,-1
62992,2014-03-15,Sale,11.9,31.8,0.0,5.000000,4.100000,NW,72.000000,E,N,6.0,19.0,89.0,25.0,1006.700000,1001.000000,7.000000,6.000000,16.2,27.4,No,Yes,0,1,False,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8459,2015-11-05,Cobar,17.3,29.0,19.4,6.200000,7.624853,NW,41.000000,N,WNW,17.0,26.0,85.0,45.0,1008.000000,1005.800000,7.000000,3.000000,21.1,26.9,Yes,No,1,0,False,-1
54648,2016-03-29,MountGinini,8.2,13.3,0.0,5.469824,7.624853,NE,26.000000,ENE,NNE,11.0,11.0,98.0,94.0,1017.653758,1015.258204,4.437189,4.503167,8.4,9.7,No,No,0,0,False,-1
108991,2010-01-25,Albany,18.2,19.5,12.6,5.400000,0.000000,,39.984292,S,SSE,31.0,26.0,87.0,88.0,1011.800000,1013.800000,8.000000,7.000000,18.8,19.0,Yes,No,1,0,False,-1
127258,2010-03-03,Walpole,15.7,23.0,0.0,5.469824,7.624853,S,31.000000,SE,SSE,15.0,17.0,63.0,60.0,1017.800000,1016.800000,4.437189,4.503167,19.4,20.6,No,No,0,0,False,-1


In [35]:
df2['New_target'].value_counts()

-1    12797
 0     1111
 1      311
Name: New_target, dtype: int64

In [4]:
#df['New_target'].value_counts()

-1    127974
 0     11110
 1      3109
Name: New_target, dtype: int64

In [36]:
df2[(df2['Rand_selection']==True) & (df2['New_target']==1)]

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,RainTodayFlag,RainTomorrowFlag,Rand_selection,New_target
75425,2010-03-22,Portland,6.7,17.9,1.4,3.600000,11.000000,W,37.000000,WSW,WNW,6.0,20.000000,68.0,52.000000,1024.100000,1023.500000,6.000000,3.000000,13.3,17.500000,Yes,No,1,0,True,1
27309,2009-05-23,Richmond,14.1,20.6,32.6,5.469824,7.624853,E,44.000000,ESE,ESE,9.0,30.000000,99.0,80.000000,1024.100000,1022.400000,4.437189,4.503167,15.8,19.000000,Yes,Yes,1,1,True,1
116196,2013-07-24,PearceRAAF,11.6,19.9,10.9,5.469824,8.800000,WNW,37.000000,N,W,9.0,22.000000,85.0,50.000000,1021.800000,1020.800000,2.000000,4.000000,14.0,19.000000,Yes,Yes,1,1,True,1
40108,2010-07-25,Williamtown,7.9,17.8,2.4,3.000000,7.624853,S,31.000000,WNW,SE,11.0,17.000000,87.0,74.000000,1029.700000,1027.500000,6.000000,6.000000,13.3,15.100000,Yes,Yes,1,1,True,1
28119,2011-09-10,Richmond,5.1,17.7,2.4,5.469824,7.624853,WNW,57.000000,WSW,W,15.0,19.000000,42.0,36.000000,1011.800000,1009.200000,4.437189,4.503167,12.2,16.000000,Yes,No,1,0,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23040,2014-07-04,NorfolkIsland,13.6,18.1,1.2,6.000000,8.300000,SSW,72.000000,SSW,SSW,35.0,28.000000,56.0,56.000000,1019.200000,1019.300000,4.000000,3.000000,16.7,17.400000,Yes,No,1,0,True,1
17493,2015-10-12,Newcastle,15.0,30.2,9.4,5.469824,7.624853,,39.984292,N,,9.0,18.637576,81.0,51.482606,1017.653758,1015.258204,1.000000,4.503167,20.0,21.687235,Yes,No,1,0,True,1
95206,2014-06-08,Townsville,18.8,20.3,1.6,2.600000,0.000000,SW,41.000000,SSW,ESE,11.0,20.000000,88.0,91.000000,1017.500000,1014.200000,8.000000,8.000000,19.3,19.600000,Yes,Yes,1,1,True,1
28714,2013-06-25,Richmond,8.7,14.5,3.8,1.000000,7.624853,SW,28.000000,,SW,0.0,15.000000,100.0,99.000000,1016.400000,1015.400000,4.437189,4.503167,10.3,12.800000,Yes,Yes,1,1,True,1


### Step 3 - Use semi-supervised training to make predictions

In [45]:
model_LS_rbf = LabelSpreading(kernel='rbf', # {'knn', 'rbf'} default='rbf'
                              gamma=20, # default=20, Parameter for rbf kernel.
                              #n_neighbors=7, # default=7, Parameter for knn kernel which is a strictly positive integer.
                              alpha=0.2, # default=0.2, Clamping factor. A value in (0, 1) that specifies the relative amount that an instance should adopt the information from its neighbors as opposed to its initial label.
                              max_iter=100, # default=30, Maximum number of iterations allowed.
                              tol=0.001, # default=1e-3, Convergence tolerance: threshold to consider the system at steady state.
                              n_jobs=-1, # default=None, The number of parallel jobs to run. -1 means using all processors. 
                             )

model_LS_knn = LabelSpreading(kernel='knn', # {'knn', 'rbf'} default='rbf'
                              #gamma=20, # default=20, Parameter for rbf kernel.
                              n_neighbors=7, # default=7, Parameter for knn kernel which is a strictly positive integer.
                              alpha=0.2, # default=0.2, Clamping factor. A value in (0, 1) that specifies the relative amount that an instance should adopt the information from its neighbors as opposed to its initial label.
                              max_iter=100, # default=30, Maximum number of iterations allowed.
                              tol=0.001, # default=1e-3, Convergence tolerance: threshold to consider the system at steady state.
                              n_jobs=-1, # default=None, The number of parallel jobs to run. -1 means using all processors. 
                             )


In [46]:
# Select data for modeling
#X=df[['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 
#      'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am',  
#      'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainTodayFlag']]
X=df2[['WindGustSpeed', 'Humidity3pm']]
#y=df['RainTomorrowFlag'].values
y=df2['New_target'].values

In [47]:
# Fit the models
LS_rbf=model_LS_rbf.fit(X, y)

# Print results
print("Input array X: ", LS_rbf.X_)
print("Classes: ", LS_rbf.classes_)
print("Label Distributions: ", LS_rbf.label_distributions_)
print("Transduction Label: ", LS_rbf.transduction_)
print("No. of features: ", LS_rbf.n_features_in_)
print("Feature names: ", LS_rbf.feature_names_in_)
print("No. of iterations: ", LS_rbf.n_iter_)

Input array X:  [[41.         98.        ]
 [48.         73.        ]
 [52.         25.        ]
 ...
 [39.98429166 88.        ]
 [31.         60.        ]
 [78.         72.        ]]
Classes:  [0 1]
Label Distributions:  [[9.99971724e-01 2.82755466e-05]
 [3.12536349e-19 1.00000000e+00]
 [1.00000000e+00 3.03327586e-48]
 ...
 [3.29438642e-02 9.67056136e-01]
 [6.66666667e-01 3.33333333e-01]
 [2.74665151e-05 9.99972533e-01]]
Transduction Label:  [0 1 0 ... 1 0 1]
No. of features:  2
Feature names:  ['WindGustSpeed' 'Humidity3pm']
No. of iterations:  8


In [48]:
# Fit Label Spreading KNN model
LS_knn=model_LS_knn.fit(X, y)

# Print results
print("Input array X: ", LS_knn.X_)
print("Classes: ", LS_knn.classes_)
print("Label Distributions: ", LS_knn.label_distributions_)
print("Transduction Label: ", LS_knn.transduction_)
print("No. of features: ", LS_knn.n_features_in_)
print("Feature names: ", LS_knn.feature_names_in_)
print("No. of iterations: ", LS_knn.n_iter_)

Input array X:  [[41.         98.        ]
 [48.         73.        ]
 [52.         25.        ]
 ...
 [39.98429166 88.        ]
 [31.         60.        ]
 [78.         72.        ]]
Classes:  [0 1]
Label Distributions:  [[1.         0.        ]
 [0.00493543 0.99506457]
 [1.         0.        ]
 ...
 [0.         1.        ]
 [1.         0.        ]
 [0.35019733 0.64980267]]
Transduction Label:  [0 1 0 ... 1 0 1]
No. of features:  2
Feature names:  ['WindGustSpeed' 'Humidity3pm']
No. of iterations:  8


In [41]:
pd.DataFrame(LS_rbf.transduction_).value_counts()

0    11466
1     2753
dtype: int64

In [42]:
pd.DataFrame(LS_knn.transduction_).value_counts()

0    12208
1     2011
dtype: int64

In [43]:
# Attach results back to the original df2 dataframe
#df2['LS_rbf_label']=LS_rbf.transduction_
#df2['LS_knn_label']=LS_knn.transduction_
#df2

In [49]:
# Look at classification report to evaluate the model
print('*************** Evaluation of LS rbf model ***************')
print(classification_report(df2['RainTomorrowFlag'], LS_rbf.transduction_))
print('--------------------------------------------------------')
print('*************** Evaluation of LS knn model ***************')
print(classification_report(df2['RainTomorrowFlag'], LS_knn.transduction_))
print('--------------------------------------------------------')

*************** Evaluation of LS rbf model ***************
              precision    recall  f1-score   support

           0       0.82      0.85      0.84     11046
           1       0.41      0.35      0.38      3173

    accuracy                           0.74     14219
   macro avg       0.61      0.60      0.61     14219
weighted avg       0.73      0.74      0.73     14219

--------------------------------------------------------
*************** Evaluation of LS knn model ***************
              precision    recall  f1-score   support

           0       0.81      0.90      0.85     11046
           1       0.45      0.28      0.35      3173

    accuracy                           0.76     14219
   macro avg       0.63      0.59      0.60     14219
weighted avg       0.73      0.76      0.74     14219

--------------------------------------------------------


### Step 4 - Label propagation

In [50]:
model_LP_rbf = LabelPropagation(kernel='rbf', # {'knn', 'rbf'} default='rbf'
                              gamma=20, # default=20, Parameter for rbf kernel.
                              #n_neighbors=7, # default=7, Parameter for knn kernel which is a strictly positive integer.
                              max_iter=1000, # default=1000, Maximum number of iterations allowed.
                              tol=0.001, # default=1e-3, Convergence tolerance: threshold to consider the system at steady state.
                              n_jobs=-1, # default=None, The number of parallel jobs to run. -1 means using all processors. 
                             )

model_LP_knn = LabelPropagation(kernel='knn', # {'knn', 'rbf'} default='rbf'
                              #gamma=20, # default=20, Parameter for rbf kernel.
                              n_neighbors=7, # default=7, Parameter for knn kernel which is a strictly positive integer.
                              max_iter=1000, # default=30, Maximum number of iterations allowed.
                              tol=0.001, # default=1e-3, Convergence tolerance: threshold to consider the system at steady state.
                              n_jobs=-1, # default=None, The number of parallel jobs to run. -1 means using all processors. 
                             )

In [51]:
# Fit the models
LP_rbf=model_LP_rbf.fit(X, y)

# Print results
print("Input array X: ", LP_rbf.X_)
print("Classes: ", LP_rbf.classes_)
print("Label Distributions: ", LP_rbf.label_distributions_)
print("Transduction Label: ", LP_rbf.transduction_)
print("No. of features: ", LP_rbf.n_features_in_)
print("Feature names: ", LP_rbf.feature_names_in_)
print("No. of iterations: ", LP_rbf.n_iter_)

Input array X:  [[41.         98.        ]
 [48.         73.        ]
 [52.         25.        ]
 ...
 [39.98429166 88.        ]
 [31.         60.        ]
 [78.         72.        ]]
Classes:  [0 1]
Label Distributions:  [[6.04877172e-01 3.95122828e-01]
 [1.71590571e-06 9.99998284e-01]
 [1.00000000e+00 2.47766254e-41]
 ...
 [1.34920139e-09 9.99999999e-01]
 [6.66666669e-01 3.33333331e-01]
 [5.00000000e-01 5.00000000e-01]]
Transduction Label:  [0 1 0 ... 1 0 0]
No. of features:  2
Feature names:  ['WindGustSpeed' 'Humidity3pm']
No. of iterations:  1000




In [52]:
# Fit Label Spreading KNN model
LP_knn=model_LP_knn.fit(X, y)

# Print results
print("Input array X: ", LP_knn.X_)
print("Classes: ", LP_knn.classes_)
print("Label Distributions: ", LP_knn.label_distributions_)
print("Transduction Label: ", LP_knn.transduction_)
print("No. of features: ", LP_knn.n_features_in_)
print("Feature names: ", LP_knn.feature_names_in_)
print("No. of iterations: ", LP_knn.n_iter_)

Input array X:  [[41.         98.        ]
 [48.         73.        ]
 [52.         25.        ]
 ...
 [39.98429166 88.        ]
 [31.         60.        ]
 [78.         72.        ]]
Classes:  [0 1]
Label Distributions:  [[1.         0.        ]
 [0.83842239 0.16157761]
 [1.         0.        ]
 ...
 [0.         1.        ]
 [1.         0.        ]
 [0.46283078 0.53716922]]
Transduction Label:  [0 0 0 ... 1 0 1]
No. of features:  2
Feature names:  ['WindGustSpeed' 'Humidity3pm']
No. of iterations:  914


In [53]:
# Look at classification report to evaluate the model
print('*************** Evaluation of LP rbf model ***************')
print(classification_report(df2['RainTomorrowFlag'], LP_rbf.transduction_))
print('--------------------------------------------------------')
print('*************** Evaluation of LP knn model ***************')
print(classification_report(df2['RainTomorrowFlag'], LP_knn.transduction_))
print('--------------------------------------------------------')

*************** Evaluation of LP rbf model ***************
              precision    recall  f1-score   support

           0       0.82      0.85      0.84     11046
           1       0.41      0.35      0.38      3173

    accuracy                           0.74     14219
   macro avg       0.62      0.60      0.61     14219
weighted avg       0.73      0.74      0.73     14219

--------------------------------------------------------
*************** Evaluation of LP knn model ***************
              precision    recall  f1-score   support

           0       0.82      0.90      0.86     11046
           1       0.47      0.29      0.36      3173

    accuracy                           0.77     14219
   macro avg       0.64      0.60      0.61     14219
weighted avg       0.74      0.77      0.75     14219

--------------------------------------------------------
