# Semi supervised learning 

#### aim of this notebook : build a classifer for defaults (that is classify a comment as a review related to a default, issue)
### first build a classifier in supervised approach using labeled data 
### second build a classifer based on labeled data + unlabeled data to which we propagated labels 


In [1]:
import pandas as pd
from tqdm import tqdm, tqdm_notebook # progress bars in Jupyter
#import newspaper # download newspapers' data easily
from time import time # measure the computation time of a python code
import pandas as pd # the most basic & powerful data manipulation tool
import numpy as np # Here, mostly used for np.nan
import langdetect # detect the language of text
import stop_words # handles stop words in many languages without having to rebuild them everytime
import spacy # NLP library for POS tagging
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re
import itertools
# For spacy use "pip install spacy", then "python -m spacy download en" to download English text mining modules

In [2]:
tqdm.pandas()
#tqdm_notebook()

### Read data

In [4]:
df = pd.read_csv('labeled_data.csv', engine='python') # label data only -> used for supervised model 

dfu = pd.read_csv('data_unlabeled.csv', encoding = 'utf-8') 
# unlabeled data -> used to together with lable data for semi supervised learning 

In [5]:
print(df.shape)
print(df.head(1))

(10997, 15)
                                               text  apps_update  \
0  dope video brian iâve been subscribe for years          0.0   

   battery_life_charging  battery_overheat  camera  connectivity  \
0                    0.0               0.0     0.0           0.0   

   customerservice  locking_system  memory_storage  screen  software_bugs  \
0              0.0             0.0             0.0     0.0            0.0   

   sound  system  water_damage  issue  
0    0.0     0.0           0.0    0.0  


In [6]:
df[[c for c in df.columns if c not in ['text', 'tokens']]].sum().map(int)

apps_update                82
battery_life_charging     147
battery_overheat           22
camera                    100
connectivity              109
customerservice           109
locking_system            206
memory_storage            156
screen                    316
software_bugs             140
sound                      43
system                    343
water_damage                6
issue                    1498
dtype: int64

In [18]:
del df['issue']
del df['water_damage']
del df['sound']
del df['battery_overheat']

In [19]:
df[[c for c in df.columns if c not in ['text', 'tokens']]].sum().map(int)

apps_update               82
battery_life_charging    147
camera                   100
connectivity             109
customerservice          109
locking_system           206
memory_storage           156
screen                   316
software_bugs            140
system                   343
dtype: int64

Let's see what we have for 'screen'

In [131]:
df.loc[df.screen==1].head()

Unnamed: 0,text,apps_update,battery_life_charging,camera,connectivity,customerservice,locking_system,memory_storage,screen,software_bugs,system
5,it crashes for nothing and strikes with someth...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
19,only drawback is that it's very difficult for ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
133,have horizontal lines that run across the scre...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
156,a small drop the screen cracked and we had a c...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
217,only thing is that ive got big hands so i mis...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Create features and prepare the data into a NMF matrix before Machine Learning

### one important thing to have in mind when building a model : to make feature engineering separately on train and test. If you don't do that, you will incoporate info from the test set into the train

In [7]:
from gensim.models import Phrases
from gensim import corpora
import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

#nlp = spacy.load('en')

In [8]:
## Function to clean and process the reviews
def cleaning_data(df) :
    STOPWORDS = stop_words.get_stop_words(language='en')
    #df.drop_duplicates(inplace= True) # Drop duplicated sentences
    df = df[~df['text'].isnull()] # Remove empty sentences
        
    # Remove special characters and punctucation
    df['clean_review']= [ re.sub('[^A-Za-z]+',' ', e ) for e in df['text'].apply(lambda x : x.lower())]
    
    # Remove empty clean_review
    df = df[~df['clean_review'].isnull()]
    df = df[~(df['clean_review']==' ')]
    df.reset_index(inplace=True, drop=True) # Reset index
    
    df['tokens'] = df['clean_review'].map(word_tokenize)
    df['nb_tokens'] = df['tokens'].map(len)
    
    ## keep only sentences with at least 3 tokens
    df = df[df['nb_tokens']>2]
    
    # remove stopwords
    df['tokens'] = df['tokens'].apply(lambda x: [i for i in x if i not in STOPWORDS])

    stemmer = SnowballStemmer("english")
    df['stemmed_text'] = df["tokens"].apply(lambda x: [stemmer.stem(y) for y in x])
    df['joined_stemmed_text'] = [' '.join(word for word in word_list) for word_list in df.stemmed_text ]

    return df




In [21]:
## split between train and test at the beginning 
# we will use the same test set for supervised and semi supervised learning, so that we can compare the performances of 
# both approaches 
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

In [22]:
# Preparing data
df_train = cleaning_data(df_train)
df_test = cleaning_data(df_test)
dfu = cleaning_data(dfu)

In [23]:
## in order to have the same features on train data sets (for both supervised and semi-sup) and test data sets
# build the tf idf with vocab which is the union the 3 above data sets 
vocab = list(set(itertools.chain(*dfu.stemmed_text.tolist()))|set(itertools.chain(*df_test.stemmed_text.tolist()))|set(itertools.chain(*df_train.stemmed_text.tolist())))
vocab_dict = dict((y, x) for x, y in enumerate(vocab))
print(len(vocab))

17547


In [24]:
# build tf idf matrix separately for train and test and unlabeled data sets 
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1,3), use_idf=True, vocabulary = vocab_dict)
td_train = tfidf_vectorizer.fit_transform(df_train.joined_stemmed_text.tolist())
td_test = tfidf_vectorizer.transform(df_test.joined_stemmed_text.tolist())
td_u = tfidf_vectorizer.transform(dfu.joined_stemmed_text.tolist())
#td_test = tfidf_vectorizer.fit_transform(df_test.joined_stemmed_text.tolist())
#td_u = tfidf_vectorizer.fit_transform(dfu.joined_stemmed_text.tolist())

In [89]:
#td_test

<3286x17547 sparse matrix of type '<class 'numpy.float64'>'
	with 25395 stored elements in Compressed Sparse Row format>

Try without the NMF. Just a tf-idf matrix as X.

In [90]:
#X_train = pd.DataFrame(td_train)
#X_test = pd.DataFrame(td_test)
#X_u = pd.DataFrame(td_u)

In [222]:
## same with NMF dimensionality reduction 
## the NMF decomposes this Term Document matrix into the product of 2 smaller matrices: W and H
n_dimensions = 20 # This can also be interpreted as topics in this case. This is the "beauty" of NMF. 10 is arbitrary
nmf_model = NMF(n_components=n_dimensions, random_state=42, alpha=.1, l1_ratio=.5)

#X_u = pd.DataFrame(nmf_model.fit_transform(td_u))

X_train = pd.DataFrame(nmf_model.fit_transform(td_train))
X_test = pd.DataFrame(nmf_model.transform(td_test))
X_u = pd.DataFrame(nmf_model.transform(td_u))
#X_test = pd.DataFrame(nmf_model.fit_transform(td_test))
#X_u = pd.DataFrame(nmf_model.fit_transform(td_u))

Here I decided to reduce the number of topics to 10 instead of 50 to see if it improves our performance.

In [223]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.000000,0.009067,0.000000,0.000000,0.000000,0.000000,0.000000,0.005009,0.000000,0.000000,0.000000,0.000000,0.007727,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000706,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.007678,0.000000,0.000000,0.000000,0.005445,0.000000,0.022895,0.000000,0.000000,0.000000,0.000000,0.002714
2,0.000000,0.012111,0.000000,0.000000,0.000000,0.000000,0.000000,0.004991,0.000000,0.000626,0.000000,0.000000,0.010724,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005905
3,0.000850,0.044951,0.062886,0.002955,0.085107,0.000933,0.003366,0.004986,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000565,0.000000,0.005773,0.000000,0.000000
4,0.000000,0.004256,0.000000,0.000375,0.000000,0.000000,0.000000,0.000559,0.000000,0.000000,0.000000,0.000000,0.001450,0.000000,0.000000,0.000000,0.000000,0.029272,0.000000,0.004467
5,0.032868,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000991,0.007071
6,0.022364,0.030547,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.065530,0.053299,0.000000,0.000000,0.000000,0.000000,0.000000,0.021669,0.000000,0.000000
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000292,0.000000,0.000098,0.000000,0.000000,0.000000,0.000000,0.000000,0.051730,0.008199
8,0.000000,0.000000,0.133219,0.000000,0.142543,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### So far I've tried:

- Keeping 'fit' to X_train, X_test and X_u gives very low performances (particularly for f1 for our relevant labeling. It basically labels 0 or only one of the testing data as 'relevant'.

- Putting 'fit' only for X_train. Gives the best overall results: tf = 0.09 for the Normal classifier for our relevant topic. The unsupervised propagation with nn = 10 does not improve performance (it a actually decrease them if we consider the relevant category: 0.06. However, if we lower the threshold we get to 0.17

- Increasing the the number of topics of NMF to 100 (instead of 50): It increases the performances: 0.11 for Normal Classifier. For unsupervised propagation it decreases f1 to 0.04.  If we lower the threshold we get 0.09

NOTA: So far both last solutions give an overall f1 of 0.96 for Normal, unsupervised, and threshold reduced (against 0.80 for 'fit' eveywhere).

- Putting 'fit' only for X_u (because higher number of comments) with topics = 50. Increases the performances: f1 = 0.11 for Normal (still with 0.96 overall).  But only 0.02 for unsupervised (still 0.96 overall). However: it increases the f1 of the lowered threshold to 0.19! (still 0.96 overall)

#### -> Next steps: change nn to 20? Try to find a Classifier that puts more weight on the relevant category during the optimization.

- With 'fit' only on X_train. Topics = 50. (Normal is the same of course) With nn = 20: f1 = 0.06 for unsupervised. (0.96 overall). However 0.22 for lower threshold! (0.96 overall)

- Topics = 20, nn = 50: f1 = 0.11 for Normal (0.96 overall) 0.02 for unsupervised (0.96 overall) and 0.23 for lower threshold! (0.96 overall)

- Topics = 20, nn = 5: f1 = 0.10 for unsupervised (0.96 overall) and only 0.10 with lowered threshold. (0.96 overall)

# Machine Learning approach

In [224]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

Let's try with "screen first"

In [225]:
y_train = df_train.screen.map(int)
y_test = df_test.screen.map(int)

In [226]:
# get the labels for both train and test 
#for i in df.columns if i not in ['text', 'tokens']
#    y_train[i] = df_train.columns[i].map(int)
#    y_test[i] = df_test.columns[i].map(int)

In [227]:
# lets look at the number of positive in the data sets 
print(len(X_train), '(Number of comments in X_train)')
print(sum(y_train), '(Number of relevant labels in X_train)')

print(len(X_test), '(Number of comments in X_test)')
print(sum(y_test), '(Number of relevant labels in X_test)')

7656 (Number of comments in X_train)
222 (Number of relevant labels in X_train)
3286 (Number of comments in X_test)
94 (Number of relevant labels in X_test)


In [228]:
# lets estimate a gradient boosting classifier 
model = GradientBoostingClassifier(n_estimators=100, random_state=42, learning_rate=0.1)

In [229]:
model.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)

Here with 'screen' again

In [230]:
print(confusion_matrix(y_train, model.predict(X_train)))
print(confusion_matrix(y_test, model.predict(X_test)))

[[7426    8]
 [ 158   64]]
[[3180   12]
 [  88    6]]


Here we see that only 5 comments are labeled as "screen" by our prediction model on the testing set. And 89 that should have been detected did not get detected! This is pretty pretty bad. The reason might be that our Gradient Boosting method focuses on optimizing the prediction error, which is not the metric that makes sense in our case.

In [231]:
print(classification_report(y_test, model.predict(X_test)))

             precision    recall  f1-score   support

          0       0.97      1.00      0.98      3192
          1       0.33      0.06      0.11        94

avg / total       0.95      0.97      0.96      3286



### semi supervised learning 

In [239]:
from sklearn.semi_supervised import LabelPropagation
label_prop_model = LabelPropagation(kernel = 'knn', n_neighbors=5, max_iter = 3000)
label_prop_model.fit(X_train, y_train)
#label_prop_model.fit(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))

LabelPropagation(alpha=None, gamma=20, kernel='knn', max_iter=3000, n_jobs=1,
         n_neighbors=5, tol=0.001)

What distance is used here? Because we are using a TF-IDF Matrix... Euclidian distance does not make sense.
Here we are actually using it on the NMF. So the number of dimension is way lower.

In [240]:
y_semi_proba = label_prop_model.predict_proba(X_u) # first column gives the proba of 0, second column gives the proba of 1 
y_semi = pd.Series(label_prop_model.predict(X_u))
print(y_semi.value_counts())

0    83183
1      735
dtype: int64


In [241]:
proba_1 = y_semi_proba[:,1] # get the proba of 1 
pd.Series(proba_1).describe()

count    83918.000000
mean         0.028585
std          0.097246
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
dtype: float64

In [242]:
# with n neigh = 10
X_train_semi = pd.concat([X_train, X_u])
y_train_semi = pd.concat([y_train, y_semi])
model.fit(X_train_semi, y_train_semi)

print(confusion_matrix(y_train_semi, model.predict(X_train_semi)))
print(confusion_matrix(y_test, model.predict(X_test)))

[[90556    61]
 [  667   290]]
[[3186    6]
 [  89    5]]


In [243]:
print(classification_report(y_test, model.predict(X_test)))

             precision    recall  f1-score   support

          0       0.97      1.00      0.99      3192
          1       0.45      0.05      0.10        94

avg / total       0.96      0.97      0.96      3286



##### We see that here the Label Propagation does not really improve our model... (or a bit only).

Here we see that with a 50% threshold it's maybe too strict for this case... Maybe we should lower this.

In [244]:
# try to spread more labels (use thereshold lower than 0.5 in order to predict more labels)
# here we spread the same proportion of 1 in the unlabeled data set as in the labeled train data set  
y_semi_bis = pd.Series([1 if x > pd.Series(proba_1).quantile(q=1-np.mean(y_train)) else 0 for x in proba_1])
y_train_semi_bis = pd.concat([y_train, y_semi_bis])
model.fit(X_train_semi, y_train_semi_bis)
print(confusion_matrix(y_train_semi_bis, model.predict(X_train_semi)))
print(confusion_matrix(y_test, model.predict(X_test)))

[[90556    61]
 [  667   290]]
[[3186    6]
 [  89    5]]


In [245]:
print(classification_report(y_test, model.predict(X_test)))

             precision    recall  f1-score   support

          0       0.97      1.00      0.99      3192
          1       0.45      0.05      0.10        94

avg / total       0.96      0.97      0.96      3286



Lowering the threshold improves the f1 score for the for the category.

# Let's try XGBoost

In [47]:
import xgboost as xgb

In [48]:
# lets estimate a XG boosting classifier 
XGmodel = xgb.XGBClassifier(n_estimators=100, random_state=42, learning_rate=0.1)

In [49]:
XGmodel.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [50]:
print(confusion_matrix(y_train, XGmodel.predict(X_train)))
print(confusion_matrix(y_test, XGmodel.predict(X_test)))

[[7430    4]
 [ 198   24]]
[[3188    4]
 [  93    1]]


Here we see that only one comment was label as "screen" by our prediction model on the testing set. And 431 that should have been detected did not get detected! This is pretty pretty bad. The reason might be that our Gradient Boosting method focuses on optimizing the prediction error, which is not the metric that makes sense in our case.

In [51]:
print(classification_report(y_test, XGmodel.predict(X_test)))

             precision    recall  f1-score   support

          0       0.97      1.00      0.99      3192
          1       0.20      0.01      0.02        94

avg / total       0.95      0.97      0.96      3286



### semi supervised learning combined to XGBoost

In [52]:
# with n neigh = 10
XGmodel.fit(X_train_semi, y_train_semi)

print(confusion_matrix(y_train_semi, XGmodel.predict(X_train_semi)))
print(confusion_matrix(y_test, XGmodel.predict(X_test)))

[[90955     1]
 [  617     1]]
[[3191    1]
 [  94    0]]


In [53]:
print(classification_report(y_test, XGmodel.predict(X_test)))

             precision    recall  f1-score   support

          0       0.97      1.00      0.99      3192
          1       0.00      0.00      0.00        94

avg / total       0.94      0.97      0.96      3286



### This is shit. It does not any label any comment as 'screen'...

In [54]:
# try to spread more labels (use thereshold lower than 0.5 in order to predict more labels)
# here we spread the same proportion of 1 in the unlabeled data set as in the labeled train data set  
y_semi_bis = pd.Series([1 if x > pd.Series(proba_1).quantile(q=1-np.mean(y_train)) else 0 for x in proba_1])
y_train_semi_bis = pd.concat([y_train, y_semi_bis])
XGmodel.fit(X_train_semi, y_train_semi_bis)
print(confusion_matrix(y_train_semi_bis, XGmodel.predict(X_train_semi)))
print(confusion_matrix(y_test, XGmodel.predict(X_test)))

[[88818   211]
 [ 1275  1270]]
[[3165   27]
 [  84   10]]


In [55]:
print(classification_report(y_test, XGmodel.predict(X_test)))

             precision    recall  f1-score   support

          0       0.97      0.99      0.98      3192
          1       0.27      0.11      0.15        94

avg / total       0.95      0.97      0.96      3286



XGBoost performs worse than the normal GradientBoosting...