# Semi supervised learning 

#### aim of this notebook : build a classifer for relevant / not relevant (that is classify a comment as a review related to a default, issue)
### first build a classifier in supervised approach using labeled data 
### second build a classifer based on labeled data + unlabeled data to which we propagated labels 


In [2]:
import pandas as pd
from tqdm import tqdm, tqdm_notebook # progress bars in Jupyter
#import newspaper # download newspapers' data easily
from time import time # measure the computation time of a python code
import pandas as pd # the most basic & powerful data manipulation tool
import numpy as np # Here, mostly used for np.nan
import langdetect # detect the language of text
import stop_words # handles stop words in many languages without having to rebuild them everytime
import spacy # NLP library for POS tagging
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re
import itertools
# For spacy use "pip install spacy", then "python -m spacy download en" to download English text mining modules

In [3]:
tqdm.pandas()
#tqdm_notebook()

### Read data

In [4]:
df = pd.read_csv('labeled_data.csv', engine='python') # label data only -> used for supervised model 

dfu = pd.read_csv('data_unlabeled.csv', encoding = 'utf-8') 
# unlabeled data -> used to together with lable data for semi supervised learning 

In [5]:
print(df.shape)
print(df.head(1))

(10997, 15)
                                               text  apps_update  \
0  dope video brian iâve been subscribe for years          0.0   

   battery_life_charging  battery_overheat  camera  connectivity  \
0                    0.0               0.0     0.0           0.0   

   customerservice  locking_system  memory_storage  screen  software_bugs  \
0              0.0             0.0             0.0     0.0            0.0   

   sound  system  water_damage  issue  
0    0.0     0.0           0.0    0.0  


In [6]:
df[[c for c in df.columns if c not in ['text', 'tokens']]].sum().map(int)

apps_update                82
battery_life_charging     147
battery_overheat           22
camera                    100
connectivity              109
customerservice           109
locking_system            206
memory_storage            156
screen                    316
software_bugs             140
sound                      43
system                    343
water_damage                6
issue                    1498
dtype: int64

# Create features

### one important thing to have in mind when building a model : to make feature engineering separately on train and test. If you don't do that, you will incoporate info from the test set into the train

In [7]:
from gensim.models import Phrases
from gensim import corpora
import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

#nlp = spacy.load('en')

In [8]:
## Function to clean and process the reviews
def cleaning_data(df) :
    STOPWORDS = stop_words.get_stop_words(language='en')
    #df.drop_duplicates(inplace= True) # Drop duplicated sentences
    df = df[~df['text'].isnull()] # Remove empty sentences
        
    # Remove special characters and punctucation
    df['clean_review']= [ re.sub('[^A-Za-z]+',' ', e ) for e in df['text'].apply(lambda x : x.lower())]
    
    # Remove empty clean_review
    df = df[~df['clean_review'].isnull()]
    df = df[~(df['clean_review']==' ')]
    df.reset_index(inplace=True, drop=True) # Reset index
    
    df['tokens'] = df['clean_review'].map(word_tokenize)
    df['nb_tokens'] = df['tokens'].map(len)
    
    ## keep only sentences with at least 3 tokens
    df = df[df['nb_tokens']>2]
    
    # remove stopwords
    df['tokens'] = df['tokens'].apply(lambda x: [i for i in x if i not in STOPWORDS])

    stemmer = SnowballStemmer("english")
    df['stemmed_text'] = df["tokens"].apply(lambda x: [stemmer.stem(y) for y in x])
    df['joined_stemmed_text'] = [' '.join(word for word in word_list) for word_list in df.stemmed_text ]

    return df




In [9]:
df.issue.value_counts()

0.0    9499
1.0    1498
Name: issue, dtype: int64

In [10]:
## split between train and test at the beginning 
# we will use the same test set for supervised and semi supervised learning, so that we can compare the performances of 
# both approaches 
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42, stratify=df.issue)

In [11]:
# Preparing data
df_train = cleaning_data(df_train)
df_test = cleaning_data(df_test)
dfu = cleaning_data(dfu)

In [12]:
## in order to have the same features on train data sets (for both supervised and semi-sup) and test data sets
# build the tf idf with vocab which is the union the 3 above data sets 
vocab = list(set(itertools.chain(*dfu.stemmed_text.tolist()))|set(itertools.chain(*df_test.stemmed_text.tolist()))|set(itertools.chain(*df_train.stemmed_text.tolist())))
vocab_dict = dict((y, x) for x, y in enumerate(vocab))
print(len(vocab))

17547


In [13]:
# build tf idf matrix separately for train and test and unlabeled data sets 
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1,3), use_idf=True, vocabulary = vocab_dict)
td_train = tfidf_vectorizer.fit_transform(df_train.joined_stemmed_text.tolist())
td_test = tfidf_vectorizer.transform(df_test.joined_stemmed_text.tolist())
td_u = tfidf_vectorizer.transform(dfu.joined_stemmed_text.tolist())
#td_test = tfidf_vectorizer.fit_transform(df_test.joined_stemmed_text.tolist())
#td_u = tfidf_vectorizer.fit_transform(dfu.joined_stemmed_text.tolist())

In [14]:
# same with NMF dimensionality reduction 
# the NMF decomposes this Term Document matrix into the product of 2 smaller matrices: W and H
n_dimensions = 50 # This can also be interpreted as topics in this case. This is the "beauty" of NMF. 10 is arbitrary
nmf_model = NMF(n_components=n_dimensions, random_state=42, alpha=.1, l1_ratio=.5)

X_train = pd.DataFrame(nmf_model.fit_transform(td_train))
X_test = pd.DataFrame(nmf_model.transform(td_test))
X_u = pd.DataFrame(nmf_model.transform(td_u))
#X_test = pd.DataFrame(nmf_model.fit_transform(td_test))
#X_u = pd.DataFrame(nmf_model.fit_transform(td_u))

# Machine Learning approach

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [31]:
y_train = df_train.issue.map(int)
y_test = df_test.issue.map(int)

In [20]:
y_train

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
7664    0
7665    0
7666    0
7667    0
7668    0
7669    0
7670    0
7671    0
7672    1
7673    0
7674    0
7675    0
7676    0
7677    0
7678    0
7679    0
7680    0
7681    0
7682    0
7683    0
7684    0
7685    0
7686    0
7687    0
7688    0
7689    0
7690    0
7691    0
7692    0
7693    0
Name: screen, Length: 7658, dtype: int64

In [32]:
# lets look at the number of positive in the data sets 
print(len(X_train))
print(len(y_train))
print(sum(y_train))
print(sum(y_test))

7658
7658
1047
449


In [33]:
# lets estimate a gradient boosting classifier 
model = GradientBoostingClassifier(n_estimators=100, random_state=42, learning_rate=0.1)

In [34]:
model.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)

In [30]:
df.loc[df.issue==1].head()

Unnamed: 0,text,apps_update,battery_life_charging,battery_overheat,camera,connectivity,customerservice,locking_system,memory_storage,screen,software_bugs,sound,system,water_damage,issue
1,"finally, i really hated that apple didnÃ¢â¬â...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,"so many crashes and battery drains,...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5,it crashes for nothing and strikes with someth...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
19,only drawback is that it's very difficult for ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
26,slower than my windows phone from 2013.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [35]:
print(confusion_matrix(y_train, model.predict(X_train)))
print(confusion_matrix(y_test, model.predict(X_test)))

[[6605    6]
 [ 953   94]]
[[2819   16]
 [ 437   12]]


Here we see that only 12 comments are labeled as "issue" by our prediction model on the testing set. And 437 that should have been detected did not get detected! This is pretty pretty bad. The reason might be that our Gradient Boosting method focuses on optimizing the prediction error, which is not the metric that makes sense in our case.

In [36]:
print(classification_report(y_test, model.predict(X_test)))

             precision    recall  f1-score   support

          0       0.87      0.99      0.93      2835
          1       0.43      0.03      0.05       449

avg / total       0.81      0.86      0.81      3284



### semi supervised learning 

In [27]:
from sklearn.semi_supervised import LabelPropagation
label_prop_model = LabelPropagation(kernel = 'knn', n_neighbors=10, max_iter = 3000)
label_prop_model.fit(X_train, y_train)
#label_prop_model.fit(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))

LabelPropagation(alpha=None, gamma=20, kernel='knn', max_iter=3000, n_jobs=1,
         n_neighbors=10, tol=0.001)

What distance is used here? Because we are using a TF-IDF Matrix... Euclidian distance does not make sense.
Here we are actually using it on the NMF. So the number of dimension is way lower.

In [28]:
y_semi_proba = label_prop_model.predict_proba(X_u) # first column gives the proba of 0, second column gives the proba of 1 
y_semi = pd.Series(label_prop_model.predict(X_u))
print(y_semi.value_counts())

0    83783
1      135
dtype: int64


In [24]:
proba_1 = y_semi_proba[:,1] # get the proba of 1 
pd.Series(proba_1).describe()

count    83918.000000
mean         0.126843
std          0.145547
min          0.000000
25%          0.000000
50%          0.100000
75%          0.200000
max          1.000000
dtype: float64

Here we see that with a 50% threshold it's maybe too strict for this case... Maybe we should lower this.

In [25]:
# with n neigh = 10
X_train_semi = pd.concat([X_train, X_u])
y_train_semi = pd.concat([y_train, y_semi])
model.fit(X_train_semi, y_train_semi)

print(confusion_matrix(y_train_semi, model.predict(X_train_semi)))
print(confusion_matrix(y_test, model.predict(X_test)))

[[88827   226]
 [ 1550   973]]
[[2793   42]
 [ 436   13]]


In [26]:
print(classification_report(y_test, model.predict(X_test)))

             precision    recall  f1-score   support

          0       0.86      0.99      0.92      2835
          1       0.24      0.03      0.05       449

avg / total       0.78      0.85      0.80      3284



In [27]:
# try to spread more labels (use thereshold lower than 0.5 in order to predict more labels)
# here we spread the same proportion of 1 in the unlabeled data set as in the labeled train data set  
y_semi_bis = pd.Series([1 if x > pd.Series(proba_1).quantile(q=1-np.mean(y_train)) else 0 for x in proba_1])
y_train_semi_bis = pd.concat([y_train, y_semi_bis])
model.fit(X_train_semi, y_train_semi_bis)
print(confusion_matrix(y_train_semi_bis, model.predict(X_train_semi)))
print(confusion_matrix(y_test, model.predict(X_test)))

[[82674   759]
 [ 5711  2432]]
[[2768   67]
 [ 424   25]]


In [28]:
print(classification_report(y_test, model.predict(X_test)))

             precision    recall  f1-score   support

          0       0.87      0.98      0.92      2835
          1       0.27      0.06      0.09       449

avg / total       0.79      0.85      0.81      3284



##### We see that here the Label Propagation does not really improve our model... (or a bit only).

# Let's try XGBoost

In [29]:
import xgboost as xgb

In [30]:
# lets estimate a XG boosting classifier 
XGmodel = xgb.XGBClassifier(n_estimators=100, random_state=42, learning_rate=0.1)

In [31]:
XGmodel.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [32]:
print(confusion_matrix(y_train, XGmodel.predict(X_train)))
print(confusion_matrix(y_test, XGmodel.predict(X_test)))

[[6603    8]
 [ 998   49]]
[[2827    8]
 [ 443    6]]


Here we see that only one comment was label as "issue" by our prediction model on the testing set. And 431 that should have been detected did not get detected! This is pretty pretty bad. The reason might be that our Gradient Boosting method focuses on optimizing the prediction error, which is not the metric that makes sense in our case.

In [33]:
print(classification_report(y_test, XGmodel.predict(X_test)))

             precision    recall  f1-score   support

          0       0.86      1.00      0.93      2835
          1       0.43      0.01      0.03       449

avg / total       0.80      0.86      0.80      3284



### semi supervised learning combined to XGBoost

In [34]:
# with n neigh = 10
XGmodel.fit(X_train_semi, y_train_semi)

print(confusion_matrix(y_train_semi, XGmodel.predict(X_train_semi)))
print(confusion_matrix(y_test, XGmodel.predict(X_test)))

[[88854   199]
 [ 1659   864]]
[[2816   19]
 [ 441    8]]


In [35]:
print(classification_report(y_test, XGmodel.predict(X_test)))

             precision    recall  f1-score   support

          0       0.86      0.99      0.92      2835
          1       0.30      0.02      0.03       449

avg / total       0.79      0.86      0.80      3284



In [36]:
# try to spread more labels (use thereshold lower than 0.5 in order to predict more labels)
# here we spread the same proportion of 1 in the unlabeled data set as in the labeled train data set  
y_semi_bis = pd.Series([1 if x > pd.Series(proba_1).quantile(q=1-np.mean(y_train)) else 0 for x in proba_1])
y_train_semi_bis = pd.concat([y_train, y_semi_bis])
XGmodel.fit(X_train_semi, y_train_semi_bis)
print(confusion_matrix(y_train_semi_bis, XGmodel.predict(X_train_semi)))
print(confusion_matrix(y_test, XGmodel.predict(X_test)))

[[82833   600]
 [ 5936  2207]]
[[2777   58]
 [ 425   24]]


In [46]:
print(classification_report(y_test, XGmodel.predict(X_test)))

             precision    recall  f1-score   support

          0       0.87      0.98      0.92      2848
          1       0.27      0.04      0.08       432

avg / total       0.79      0.86      0.81      3280

