## Treating the Labels as Noisy
### Learning from Noisy Labels using CleanLab

Link: https://github.com/cgnorthcutt/cleanlab/

In [None]:
import json
import pandas as pd
import numpy as np

In [None]:
from pathlib import Path

# Data directories used by the notebook, all located under ../data
# (relative to the current working directory).
datapath = Path("../data")
data_raw, data_interim, data_processed, cleanlab_datapath = (
    datapath.joinpath(subdir)
    for subdir in ("raw", "interim", "processed", "cleanlab")
)

In [None]:
# Read train-large.json from the interim data folder and preview the first rows.
train = pd.read_json(data_interim.joinpath("train-large.json"))
train.head()

In [None]:
# Number of distinct values in the 'sentiment' column.
num_classes = len(set(train["sentiment"]))
num_classes

# Referring to: [Twitter-Airlines](https://github.com/martinpella/twitter-airlines/blob/master/shallow_learning.ipynb) for cleaning data.

In [None]:
# !pip install --upgrade snowballstemmer
# !pip install --upgrade nltk
# !pip install --upgrade scikit-learn

In [None]:
from utils import TextCleaner, CleanTwitter

In [None]:
# Run the project's CleanTwitter helper on the loaded frame, timing the call with %time.
# NOTE(review): judging by the target names this presumably yields text splits, label
# splits and TF-IDF features for train/test — confirm against utils.CleanTwitter.
%time X_train, X_test, y_train, y_test, tfidf_train, tfidf_test = CleanTwitter(train)

# LearningWithNoisyLabels : [IrisSimple](https://github.com/cgnorthcutt/cleanlab/blob/master/examples/iris_simple_example.ipynb)?

In [None]:
!pip install --upgrade cleanlab

In [None]:
from cleanlab.classification import LearningWithNoisyLabels
from cleanlab.noise_generation import generate_noise_matrix_from_trace
from cleanlab.noise_generation import generate_noisy_labels
from cleanlab.util import value_counts
from cleanlab.latent_algebra import compute_inv_noise_matrix
import cleanlab

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Seed NumPy's global random number generator.
seed = 37
np.random.seed(seed)

In [None]:
# Build a synthetic noise matrix and use it to create a corrupted copy of y_train.
# NOTE(review): the original author was unsure how the two settings below should be
# chosen; they look like free experiment parameters — confirm the intended values.

# Set the sparsity of the noise matrix.
FRAC_ZERO_NOISE_RATES = 0.5
# A proxy for the fraction of labels that are correct.
avg_trace = 0.67  # ~33% wrong labels. Increasing makes the problem easier.

# Class prior estimate for y, passed to the generator as `py`.
# NOTE(review): value_counts may return raw counts rather than probabilities — confirm.
p_y = value_counts(y_train)
noise_matrix = generate_noise_matrix_from_trace(
    K=num_classes,
    trace=num_classes * avg_trace,
    py=p_y,
    frac_zero_noise_rates=FRAC_ZERO_NOISE_RATES,
)

# Create noisy labels: s is y_train corrupted according to noise_matrix.
s = generate_noisy_labels(y_train, noise_matrix)

In [None]:
# Display the generated noise matrix.
noise_matrix

In [None]:
# Baseline: a plain logistic regression fitted directly on y_train.
print("WITHOUT confident learning,", end=" ")
m = LogisticRegression(
    solver="lbfgs", multi_class="auto", max_iter=1000, C=4, dual=False
)
_ = m.fit(tfidf_train, y_train)
pred = m.predict(tfidf_test)
test_accuracy = accuracy_score(pred, y_test)
print("test accuracy:", round(test_accuracy, 5))

In [None]:
# Confident learning on y_train, letting cleanlab estimate the noise itself.
print("WITH confident learning (without noise matrix given),", end=" ")
m_rp = LogisticRegression(max_iter=1000, multi_class="auto", solver="lbfgs")
rp = LearningWithNoisyLabels(clf=m_rp)
_ = rp.fit(tfidf_train, y_train)
pred = rp.predict(tfidf_test)
test_accuracy = accuracy_score(pred, y_test)
print("test accuracy:", round(test_accuracy, 5))

In [None]:
# Confident learning with the true noise matrix supplied.
# noise_matrix describes how the synthetic labels `s` were generated from y_train,
# so it is only meaningful together with `s`. Fitting on y_train (as before) would
# apply a noise model that does not belong to those labels.
print("WITH confident learning (noise matrix given),", end=" ")
m2_rp = LogisticRegression(solver="lbfgs", multi_class="auto", max_iter=1000)
rp = LearningWithNoisyLabels(clf=m2_rp)
_ = rp.fit(tfidf_train, s, noise_matrix=noise_matrix)
pred = rp.predict(tfidf_test)
print("test accuracy:", round(accuracy_score(pred, y_test), 5))

In [None]:
# Confident learning with both the noise matrix and its inverse supplied.
# Both matrices describe the synthetic corruption that produced `s` from y_train,
# so the model must be fitted on `s`; pairing them with y_train would apply a
# noise model that does not belong to those labels.
print("WITH confident learning (noise / inverse noise matrix given),", end=" ")
m3_rp = LogisticRegression(solver="lbfgs", multi_class="auto", max_iter=1000)
rp = LearningWithNoisyLabels(clf=m3_rp)
_ = rp.fit(
    tfidf_train,
    s,
    noise_matrix=noise_matrix,
    inverse_noise_matrix=compute_inv_noise_matrix(p_y, noise_matrix),
)
pred = rp.predict(tfidf_test)
print("test accuracy:", round(accuracy_score(pred, y_test), 5))

In [None]:
# Confident learning on y_train with default settings (no noise matrix supplied).
print("WITH confident learning (using latent noise matrix estimation),", end=" ")
m = LogisticRegression(max_iter=1000, multi_class="auto", solver="lbfgs")
rp = LearningWithNoisyLabels(clf=m)
_ = rp.fit(tfidf_train, y_train)
pred = rp.predict(tfidf_test)
test_accuracy = accuracy_score(pred, y_test)
print("test accuracy:", round(test_accuracy, 5))

In [None]:
# Same default-settings fit on y_train, reported under the "calibrated confident joint" label.
print('WITH confident learning (using calibrated confident joint),', end=" ")
m = LogisticRegression(max_iter=1000, multi_class='auto', solver='lbfgs')
rp = LearningWithNoisyLabels(clf=m)
_ = rp.fit(tfidf_train, y_train)
pred = rp.predict(tfidf_test)
test_accuracy = accuracy_score(pred, y_test)
print("test accuracy:", round(test_accuracy, 5))

In [None]:
# Count the positions where the synthetic labels s differ from y_train.
(s != y_train).sum()

## Grid Search

In [None]:
# !pip install --upgrade hypopt # this is not a typo, hypopt and hyperopt are different packages

In [None]:
# from hypopt.model_selection import GridSearch
from sklearn.model_selection import ParameterGrid

In [None]:
# LearningWithNoisyLabels settings to compare.
param_grid = {
    "prune_method": ["prune_by_noise_rate", "prune_by_class", "both"],
    "converge_latent_estimates": [True, False],
}
params = ParameterGrid(param_grid)

# Fit LearningWithNoisyLabels across all parameter settings.
# The same seed is passed to every wrapper so each setting is evaluated under the
# same random state; without it, score differences between settings are confounded
# by whatever state the global RNG happens to be in.
scores = []
for param in params:
    clf = LogisticRegression(solver="lbfgs", multi_class="auto", max_iter=1000)
    rp = LearningWithNoisyLabels(clf=clf, seed=seed, **param)
    _ = rp.fit(tfidf_train, s)  # s is the noisy y_train labels
    scores.append(accuracy_score(rp.predict(tfidf_test), y_test))

# Print results sorted from best to worst.
for i in np.argsort(scores)[::-1]:
    print("Param settings:", params[i])
    print(
        "Hinglish dataset test accuracy (using confident learning):\t",
        round(scores[i], 5),
        "\n",
    )

# 200 Most Likely Wrong Samples

In [None]:
# !pip install git+https://github.com/cgnorthcutt/rankpruning.git

In [None]:
# Cross-validated predicted probabilities for the noisy labels s,
# using a fresh logistic regression as the classifier.
psx_clf = LogisticRegression(solver="lbfgs", multi_class="auto", max_iter=1000)
psx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
    tfidf_train, s, clf=psx_clf
)

In [None]:
# Make sure labels and predicted probabilities are NumPy arrays.
s = np.asarray(s)
psx = np.asarray(psx)

# Widen the DataFrame display so long texts and up to 200 rows are shown.
# The fully-qualified option name is required: the bare "max_rows" pattern matches
# more than one option in recent pandas versions and raises OptionError.
pd.set_option("display.max_colwidth", 201)
pd.set_option("display.max_rows", 200)

In [None]:
from cleanlab.pruning import get_noise_indices
import cleanlab

# Estimate the confident joint and cross-validated predicted probabilities for the
# real training labels, then rank the examples most likely to be mislabeled.
# The labels given to get_noise_indices must be the same labels psx was computed
# from; previously psx came from y_train while the synthetic labels `s` were passed,
# so the ranking mostly reflected the injected noise rather than real label errors.
# NOTE(review): `rp` is whatever LearningWithNoisyLabels instance was assigned last
# in the notebook and is used here as the classifier — confirm this is intended
# rather than a plain LogisticRegression.
labels = np.asarray(y_train)
jc, psx = cleanlab.latent_estimation.estimate_confident_joint_and_cv_pred_proba(
    tfidf_train, labels, rp
)

ordered_label_errors = get_noise_indices(
    s=labels,
    psx=psx,
    confident_joint=jc,  # reuse the joint estimated above instead of recomputing it
    sorted_index_method="normalized_margin",
)

In [None]:
# Display the indices returned by get_noise_indices.
ordered_label_errors

In [None]:
# Show the 200 training examples ranked as most likely mislabeled.
# ordered_label_errors holds positions into the training arrays, so index through
# NumPy arrays: if X_train / y_train are pandas Series with a non-default index,
# plain [] indexing would look the values up by label and return the wrong rows.
most_likely_200 = ordered_label_errors[:200]
pd.DataFrame(
    {
        "text": np.asarray(X_train)[most_likely_200],
        "label": np.asarray(y_train)[most_likely_200],
        "index": most_likely_200,
    }
)