# Cleanlab
## Treating the Labels as Noisy
### Learning from Noisy Labels
Link: https://github.com/cgnorthcutt/cleanlab/

In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
# Load the labelled tweets (columns: sentiment, text, uid) into a DataFrame
# and preview the first five rows.
train = pd.read_json(path_or_buf='train.json')
train.head(n=5)

Unnamed: 0,sentiment,text,uid
0,negative,@ AdilNisarButt pakistan ka ghra tauq he Pakis...,3
1,negative,Madarchod mulle ye mathura me Nahi dikha tha j...,41
2,positive,@ narendramodi Manya Pradhan Mantri mahoday Sh...,48
3,positive,@ Atheist _ Krishna Jcb full trend me chal rah...,64
4,positive,@ AbhisharSharma _ @ RavishKumarBlog Loksabha ...,66


In [3]:
# Number of distinct sentiment classes in the training data; passed as K when
# the noise matrix is generated further down.
# nunique() skips missing values, so a NaN label is not counted as a class.
num_classes = train['sentiment'].nunique()

# Referring to [Twitter-Airlines](https://github.com/martinpella/twitter-airlines/blob/master/shallow_learning.ipynb) for cleaning the data.

In [4]:
from utils import TextCleaner, CleanTwitter

In [5]:
# CleanTwitter is imported from the local `utils` module (not shown in this
# notebook) and is unpacked into six values. Judging by the names these are
# the train/test text splits, their labels, and a TF-IDF feature matrix for
# each split -- TODO confirm against utils.CleanTwitter.
# NOTE(review): the raw `sentiment` column holds strings; presumably the
# labels returned here are integer-encoded, as the cleanlab calls below are
# given y_train directly -- verify.
X_train, X_test, y_train, y_test, tfidf_train, tfidf_test = CleanTwitter(train)

# LearningWithNoisyLabels: following the cleanlab [IrisSimple](https://github.com/cgnorthcutt/cleanlab/blob/master/examples/iris_simple_example.ipynb) example

In [6]:
from cleanlab.classification import LearningWithNoisyLabels
from cleanlab.noise_generation import generate_noise_matrix_from_trace
from cleanlab.noise_generation import generate_noisy_labels
from cleanlab.util import value_counts
from cleanlab.latent_algebra import compute_inv_noise_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
# Base classifier handed to cleanlab. scikit-learn implements the dual
# formulation (dual=True) only for the liblinear solver with an L2 penalty,
# so the solver is pinned explicitly: on releases whose default solver is
# lbfgs, dual=True would otherwise be rejected when the model is fitted.
m = LogisticRegression(C=4, dual=True, solver='liblinear')
# cleanlab wrapper around `m`; it is fitted on the noisy labels in the
# evaluation cell below.
rp = LearningWithNoisyLabels(clf=m)

In [None]:
# Simulate label noise: build a random noise matrix and use it to corrupt the
# training labels. The test labels (y_test) are left untouched.

# Fixed seed so that Restart & Run All reproduces the same noisy labels.
SEED = 0

# Set the sparsity of the noise matrix.
FRAC_ZERO_NOISE_RATES = 0.5
# A proxy for the fraction of labels that are correct.
avg_trace = 0.65  # ~35% wrong labels. Increasing makes the problem easier.
# Class frequencies of the clean training labels, via cleanlab's value_counts.
py = value_counts(y_train)
noise_matrix = generate_noise_matrix_from_trace(
    K=num_classes,
    trace=num_classes * avg_trace,
    py=py,
    frac_zero_noise_rates=FRAC_ZERO_NOISE_RATES,
    seed=SEED,
)

# Create noisy labels.
# Seed NumPy's global RNG first; generate_noisy_labels takes no seed argument
# and presumably draws from that global state -- TODO confirm.
np.random.seed(SEED)
s = generate_noisy_labels(y_train, noise_matrix)

In [None]:
def report_test_accuracy(label, model, noisy_labels, **fit_kwargs):
    """Fit `model` on the noisy training labels and print its test accuracy.

    Parameters
    ----------
    label : str
        Text printed in front of the accuracy, on the same line.
    model : estimator
        Object exposing ``fit(X, y, **fit_kwargs)`` and ``predict(X)``.
    noisy_labels : array-like
        Training labels passed to ``model.fit``.
    **fit_kwargs
        Extra keyword arguments forwarded to ``model.fit`` (for example
        ``noise_matrix`` / ``inverse_noise_matrix``).

    Returns
    -------
    The predictions of the fitted model on ``tfidf_test``.
    """
    print(label, end=" ")
    model.fit(tfidf_train, noisy_labels, **fit_kwargs)
    predictions = model.predict(tfidf_test)
    # accuracy_score is documented as (y_true, y_pred); the value is the same
    # in either order, but the conventional order is used here.
    print("test accuracy:", round(accuracy_score(y_test, predictions), 2))
    return predictions


# Baseline: plain logistic regression trained directly on the noisy labels.
m = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
pred = report_test_accuracy('WITHOUT confident learning,', m, s)

print("\nNow we show the improvement using confident learning to characterize the noise")
print("and learn on the data that is (with high confidence) labeled correctly.")
print()

# These two runs reuse the `rp` wrapper created in an earlier cell and supply
# the true noise matrix that was used to corrupt the labels.
pred = report_test_accuracy(
    'WITH confident learning (noise matrix given),',
    rp,
    s,
    noise_matrix=noise_matrix,
)
pred = report_test_accuracy(
    'WITH confident learning (noise / inverse noise matrix given),',
    rp,
    s,
    noise_matrix=noise_matrix,
    inverse_noise_matrix=compute_inv_noise_matrix(py, noise_matrix),
)

# From here on no noise matrix is supplied to fit().
m = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
rp = LearningWithNoisyLabels(clf=m)
pred = report_test_accuracy(
    'WITH confident learning (using latent noise matrix estimation),',
    rp,
    s,
)

# NOTE(review): this run is configured exactly like the previous one (same
# classifier settings, same fit arguments); only the printed label differs.
# If a different method was intended here, it still needs to be wired in.
m = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
rp = LearningWithNoisyLabels(clf=m)
pred = report_test_accuracy(
    'WITH confident learning (using calibrated confident joint),',
    rp,
    s,
)