In [1]:
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# GoEmotions CSV (file 1 of the full dataset) hosted on Google Cloud Storage.
# read_csv is given the URL directly, so this cell needs network access each time it runs.
url = 'https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv'
df = pd.read_csv(url)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# List every column name so the available label columns are visible.
df.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [4]:
# Count how many rows carry each value of the `excitement` label.
df.excitement.value_counts()

0    68100
1     1900
Name: excitement, dtype: int64

In [5]:
# Features are the raw text column; the target is the `excitement` label column.
X = df['text']
y = df['excitement']

# Token counts feeding a logistic regression with balanced class weights.
pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)

In [6]:
%%time
# Fit the vectorizer and classifier on the full dataset (no train/test split is made here).
pipe.fit(X,y)

CPU times: total: 953 ms
Wall time: 3.08 s


In [7]:
# Per-class probabilities for every row, computed on the same rows the pipeline was fit on.
pipe.predict_proba(X)

array([[0.81908219, 0.18091781],
       [0.87343858, 0.12656142],
       [0.99887482, 0.00112518],
       ...,
       [0.95765476, 0.04234524],
       [0.89401878, 0.10598122],
       [0.97989132, 0.02010868]])

In [8]:
# Column 0 of predict_proba corresponds to the first entry of the classifier's classes_.
probas = pipe.predict_proba(X)[:, 0]
# Keep the rows whose probability falls strictly inside the 0.45-0.55 band,
# i.e. the predictions closest to the decision boundary.
filtered = df.loc[(probas < 0.55) & (probas > 0.45)]
filtered.loc[:, ['text', 'excitement']].head(10)

Unnamed: 0,text,excitement
8,that's adorable asf,0
46,"If there’s a pattern, yes.",0
107,My fans on patreon will be rewarded soon,0
154,"Ones with close ties to SA, anyway. An escaped...",0
158,I really like this ring so I’m glad to hear that.,0
262,OMG THOSE TINY SHOES! *desire to boop snoot in...,0
362,This. I relate to this. So much. Almost too much.,0
427,"Wow, the Wizards are horrible",0
477,All the people in this thread are fucking disg...,0
590,You should come. You'd enjoy it.,0


In [9]:
pred = pipe.predict(X)
# Shape of the subset of rows where the given label and the prediction differ.
df.loc[df['excitement'].ne(pred)].shape

(5315, 37)

In [10]:
def get_confidence(X,y,mod):
    """Return the probability the model assigns to each example's given label.

    Parameters
    ----------
    X : inputs accepted by ``mod.predict_proba``.
    y : iterable of labels, one per row of ``X``. Labels are matched to rows
        by position, so a pandas Series with any index works.
    mod : fitted classifier exposing ``predict_proba`` and ``classes_``.

    Returns
    -------
    list
        One value per example: the predicted probability of that example's
        label in ``y``.

    Raises
    ------
    ValueError
        If ``y`` and the predicted probabilities differ in length.
    KeyError
        If a label in ``y`` is not present in ``mod.classes_``.
    """
    probas = mod.predict_proba(X)
    # Iterate labels positionally. Indexing a pandas Series with y[i] is a
    # label-based lookup, which pairs rows with the wrong label (or raises)
    # whenever the index is not the default 0..n-1 range.
    labels = list(y)
    if len(labels) != len(probas):
        raise ValueError(
            f"y has {len(labels)} labels but predict_proba returned {len(probas)} rows"
        )
    # Build the label -> column lookup once instead of once per row.
    column_of = {label: j for j, label in enumerate(mod.classes_)}
    return [row[column_of[label]] for row, label in zip(probas, labels)]

# Store, for each row, the fitted pipeline's probability of that row's given label.
df['confidence'] = get_confidence(X=X, y=y, mod=pipe)
# Lowest-confidence rows first: these are where the model disagrees most with the label.
df.sort_values(by='confidence')

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,confidence
5676,I am inexplicably excited by [NAME]. I get so ...,eekgi19,AsTheCoolKidsSay,Gunners,t3_ai1ge0,t3_ai1ge0,1.548019e+09,37,False,0,...,0,0,0,0,0,0,0,0,1,0.000148
42757,Omg this is so amazing ! Keep up the awesome w...,eczwg8f,AceLynxx,TheWalkingDeadGame,t3_abfivk,t3_abfivk,1.546322e+09,5,False,1,...,0,0,0,0,0,0,0,0,0,0.000187
28707,Omg this is so amazing ! Keep up the awesome w...,eczwg8f,AceLynxx,TheWalkingDeadGame,t3_abfivk,t3_abfivk,1.546322e+09,70,False,1,...,0,0,0,0,0,0,0,0,0,0.000187
24756,Sounds like a fun game. Our home game around h...,edt2ngm,22pablo,poker,t3_aevm5l,t3_aevm5l,1.547216e+09,24,False,0,...,0,0,0,0,0,0,0,0,1,0.000262
44459,So no replays for arsenal penalty calls.. Cool...,eeypghm,iamthesap,Gunners,t3_ajsvk0,t3_ajsvk0,1.548452e+09,51,False,0,...,0,0,0,0,0,0,0,0,0,0.000595
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14425,"You got the true ending don't worry, you would...",edf5hi9,wackbacksack,Persona5,t3_adaedu,t3_adaedu,1.546812e+09,49,False,0,...,0,0,0,0,0,0,0,0,0,1.000000
49519,"Low sex drive is not uncommon, but one must ma...",eeongda,Jonny__Whishbone07,Marriage,t3_aiiu6m,t3_aiiu6m,1.548157e+09,41,False,0,...,0,0,0,0,0,0,0,0,1,1.000000
10029,"Low sex drive is not uncommon, but one must ma...",eeongda,Jonny__Whishbone07,Marriage,t3_aiiu6m,t3_aiiu6m,1.548157e+09,61,False,0,...,0,0,0,0,0,0,0,0,0,1.000000
19818,That unfortunately still doesn’t stop the thou...,ef7w8xr,theguyfromuncle420,self,t3_akqzow,t1_ef7w17h,1.548722e+09,52,False,1,...,0,0,0,0,0,0,0,0,0,1.000000


In [11]:
from cleanlab.pruning import get_noise_indices

# Ask cleanlab for the indices of likely label errors, ordered according to
# sorted_index_method='prob_given_label'.
# NOTE(review): psx is computed on the same rows the pipeline was fit on
# (in-sample). cleanlab's documentation recommends out-of-sample, i.e.
# cross-validated, probabilities - consider
# sklearn.model_selection.cross_val_predict(..., method='predict_proba').
ordered_errors = get_noise_indices(
    s=y,
    psx=pipe.predict_proba(X),
    sorted_index_method='prob_given_label'
)
# Number of rows flagged as potential label errors.
ordered_errors.shape

(1857,)

In [12]:
# First 20 flagged rows, in the order returned by get_noise_indices.
df[['text', 'excitement']].iloc[ordered_errors].head(20)

Unnamed: 0,text,excitement
5676,I am inexplicably excited by [NAME]. I get so ...,0
42757,Omg this is so amazing ! Keep up the awesome w...,0
28707,Omg this is so amazing ! Keep up the awesome w...,0
24756,Sounds like a fun game. Our home game around h...,0
44459,So no replays for arsenal penalty calls.. Cool...,0
20823,"Wow, your posting history is a real... interes...",0
69395,"Wow, your posting history is a real... interes...",0
2001,No different than people making a big deal abo...,0
30921,"Hey congrats!! That's amazing, you've done suc...",0
39475,"I just read your list and now I can't wait, ei...",0


In [14]:
from cleanlab.classification import LearningWithNoisyLabels

# A new, unfitted pipeline with the same steps as `pipe`, so the noise-aware
# wrapper trains its own model and `pipe` is left untouched for comparison.
# (LogisticRegression is already imported in the notebook's first cell.)
fresh_pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(class_weight='balanced', max_iter=1000)
)
# A fixed seed makes the wrapper's internal cross-validation split, and
# therefore the set of pruned examples, repeatable across runs.
lnl = LearningWithNoisyLabels(clf=fresh_pipe, seed=42)
# Labels are passed as a plain array via `.values`.
lnl.fit(X=X, s=y.values)

In [16]:
# Rows where the noise-aware model and the plain pipeline predict different labels.
disagreement = df.loc[lnl.predict(X) != pipe.predict(X)]
# Fixed random_state so the displayed rows are the same on every run;
# n is capped so the cell does not raise when fewer than 5 rows disagree.
disagreement[['text', 'excitement']].sample(n=min(5, len(disagreement)), random_state=42)

Unnamed: 0,text,excitement
25919,This is hilarious if you read it in [NAME] voice,0
47961,It's actually insane how she looks like [NAME].,1
34370,Welcome to the subreddit!,0
24636,my mom caught be masturbating to dominatrix po...,0
58625,Thank you! Looks delicious.,0
