## Treating the Labels as Noisy
### Learning from Noisy Labels using CleanLab

Link: https://github.com/cgnorthcutt/cleanlab/

In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
train = pd.read_json('data/raw/train.json')
train.head()

Unnamed: 0,uid,sentiment,text
0,3,negative,@ AdilNisarButt pakistan ka ghra tauq he Pakis...
1,41,negative,Madarchod mulle ye mathura me Nahi dikha tha j...
2,48,positive,@ narendramodi Manya Pradhan Mantri mahoday Sh...
3,64,positive,@ Atheist _ Krishna Jcb full trend me chal rah...
4,66,positive,@ AbhisharSharma _ @ RavishKumarBlog Loksabha ...


In [3]:
num_classes = len(set(list(train['sentiment']))); num_classes

3

# Refering to : [Twitter-Airlines](https://github.com/martinpella/twitter-airlines/blob/master/shallow_learning.ipynb) for cleaning data. 

In [4]:
# !pip install --upgrade snowballstemmer
# !pip install --upgrade nltk
# !pip install --upgrade scikit-learn

In [5]:
from utils import TextCleaner, CleanTwitter

In [6]:
%time X_train, X_test, y_train, y_test, tfidf_train, tfidf_test = CleanTwitter(train)

CPU times: user 24.4 s, sys: 77.2 ms, total: 24.4 s
Wall time: 24.5 s


# LearningWithNoisyLabels : [IrisSimple](https://github.com/cgnorthcutt/cleanlab/blob/master/examples/iris_simple_example.ipynb)?

In [7]:
!pip install --upgrade cleanlab

Requirement already up-to-date: cleanlab in /Users/meghanabhange/anaconda3/envs/Hinglish/lib/python3.7/site-packages (0.1.0)


In [8]:
from cleanlab.classification import LearningWithNoisyLabels
from cleanlab.noise_generation import generate_noise_matrix_from_trace
from cleanlab.noise_generation import generate_noisy_labels
from cleanlab.util import value_counts
from cleanlab.latent_algebra import compute_inv_noise_matrix
import cleanlab

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

    while running methods in cleanlab.pruning, install tqdm
    via "pip install tqdm".


In [9]:
seed = 37
np.random.seed(seed = seed)

In [10]:
# Not sure if this is how you do it? HOW TO CHANGE THESE VALUES?

# Set the sparsity of the noise matrix.
FRAC_ZERO_NOISE_RATES = 0.5
# A proxy for the fraction of labels that are correct.
avg_trace = 0.67 # ~33% wrong labels. Increasing makes the problem easier.

p_y = value_counts(y_train) # probability of y estimates
noise_matrix = generate_noise_matrix_from_trace(
        K = num_classes,
        trace = num_classes * avg_trace, 
        py = p_y,
        frac_zero_noise_rates = FRAC_ZERO_NOISE_RATES,
    )

# Create noisy labels
s = generate_noisy_labels(y_train, noise_matrix)

In [11]:
noise_matrix

array([[0.57010869, 0.        , 0.        ],
       [0.        , 0.91519804, 0.47530674],
       [0.42989131, 0.08480196, 0.52469326]])

In [12]:
print('WITHOUT confident learning,', end=" ")
m = LogisticRegression(C=4, dual=False, multi_class="auto", solver="lbfgs", max_iter=1000)
_ = m.fit(tfidf_train, y_train)
pred = m.predict(tfidf_test)
print("test accuracy:", round(accuracy_score(pred, y_test), 5))

WITHOUT confident learning, test accuracy: 0.59397


In [13]:
print('WITH confident learning (without noise matrix given),', end=" ")
m_rp = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
rp = LearningWithNoisyLabels(clf = m_rp)
_ = rp.fit(tfidf_train, y_train)
pred = rp.predict(tfidf_test)
print("test accuracy:", round(accuracy_score(pred, y_test),5))

WITH confident learning (without noise matrix given), test accuracy: 0.60322


In [14]:
print('WITH confident learning (noise matrix given),', end=" ")
m2_rp = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
rp = LearningWithNoisyLabels(clf = m2_rp)
_ = rp.fit(tfidf_train, y_train, noise_matrix=noise_matrix)
pred = rp.predict(tfidf_test)
print("test accuracy:", round(accuracy_score(pred, y_test),5))

WITH confident learning (noise matrix given), test accuracy: 0.61089


In [15]:
print('WITH confident learning (noise / inverse noise matrix given),', end=" ")
m3_rp = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
rp = LearningWithNoisyLabels(clf = m3_rp)
_ = rp.fit(tfidf_train, y_train, noise_matrix = noise_matrix, inverse_noise_matrix=compute_inv_noise_matrix(p_y, noise_matrix))
pred = rp.predict(tfidf_test)
print("test accuracy:", round(accuracy_score(pred, y_test),5))

WITH confident learning (noise / inverse noise matrix given), test accuracy: 0.6101


In [16]:
print('WITH confident learning (using latent noise matrix estimation),', end=" ")
m = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
rp = LearningWithNoisyLabels(clf = m)
_ = rp.fit(tfidf_train, y_train)
pred = rp.predict(tfidf_test)
print("test accuracy:", round(accuracy_score(pred, y_test),5))

WITH confident learning (using latent noise matrix estimation), test accuracy: 0.60508


In [17]:
print('WITH confident learning (using calibrated confident joint),', end=" ")
m = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
rp = LearningWithNoisyLabels(clf=m)
_ = rp.fit(tfidf_train, y_train)
pred = rp.predict(tfidf_test)
print("test accuracy:", round(accuracy_score(pred, y_test),5))

WITH confident learning (using calibrated confident joint), test accuracy: 0.60878


In [18]:
sum(s != y_train)

3600

## Grid Search

In [19]:
# !pip install --upgrade hypopt # this is not a typo, hypopt and hyperopt are different packages

In [20]:
# from hypopt.model_selection import GridSearch
from sklearn.model_selection import ParameterGrid

In [21]:
param_grid = {
    "prune_method": ["prune_by_noise_rate", "prune_by_class", "both"],
    "converge_latent_estimates": [True, False],
}
# Fit LearningWithNoisyLabels across all parameter settings.
params = ParameterGrid(param_grid)
scores = []
for param in params:
    clf = LogisticRegression(solver = 'lbfgs', multi_class = 'auto', max_iter = 1000)
    rp = LearningWithNoisyLabels(clf = clf, **param)
    _ = rp.fit(tfidf_train, s) # s is the noisy y_train labels
    scores.append(accuracy_score(rp.predict(tfidf_test), y_test))

# Print results sorted from best to least
for i in np.argsort(scores)[::-1]:
    print("Param settings:", params[i])
    print(
        "Hinglish dataset test accuracy (using confident learning):\t", 
        round(scores[i], 5),
        "\n"
    )

Param settings: {'prune_method': 'prune_by_class', 'converge_latent_estimates': False}
Hinglish dataset test accuracy (using confident learning):	 0.45995 

Param settings: {'prune_method': 'both', 'converge_latent_estimates': False}
Hinglish dataset test accuracy (using confident learning):	 0.45969 

Param settings: {'prune_method': 'both', 'converge_latent_estimates': True}
Hinglish dataset test accuracy (using confident learning):	 0.45942 

Param settings: {'prune_method': 'prune_by_noise_rate', 'converge_latent_estimates': True}
Hinglish dataset test accuracy (using confident learning):	 0.45731 

Param settings: {'prune_method': 'prune_by_class', 'converge_latent_estimates': True}
Hinglish dataset test accuracy (using confident learning):	 0.45467 

Param settings: {'prune_method': 'prune_by_noise_rate', 'converge_latent_estimates': False}
Hinglish dataset test accuracy (using confident learning):	 0.4507 



# 200 Most Likely Wrong Samples

In [22]:
# !pip install git+https://github.com/cgnorthcutt/rankpruning.git

In [23]:
psx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(tfidf_train, s, clf=LogisticRegression(max_iter=1000, multi_class='auto', solver='lbfgs'))

In [24]:
s = np.asarray(s)
psx = np.asarray(psx)

In [25]:
from cleanlab.pruning import get_noise_indices
import cleanlab

jc, psx = cleanlab.latent_estimation.estimate_confident_joint_and_cv_pred_proba(tfidf_train, y_train, rp)

ordered_label_errors = get_noise_indices(
    s = s,
    psx = psx,
    sorted_index_method='normalized_margin',
 )

In [26]:
ordered_label_errors

array([3647, 1849,  417, ..., 2706, 7614, 1452])

In [27]:
most_likely_200 = ordered_label_errors[:200]
pd.DataFrame({"text": X_train[most_likely_200], "label" : y_train[most_likely_200], "index" : most_likely_200})

Unnamed: 0,text,label,index
0,husne ylcnk romagnetique eloraxiong i love you,1,3647
1,anuview badhai badhai badhai ji haan you are u...,2,1849
2,happy birthday to my favorite gemini i love yo...,2,417
3,ngo laureen thanks abi i love you most,2,4065
4,narendramodi modiji very very very happy for y...,2,587
...,...,...,...
195,rt v basilio happy birthday simplyglyzz i miss...,2,6764
196,shri shankarlalwani ji ko bahut bahut badhai a...,2,488
197,sanjayazadsln sanjay chor ye kaise sabit ho ra...,1,3236
198,wow its really super world cup fananthem by ub...,2,3698
