In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import sys
import os
cwd = os.getcwd()
sys.path.insert(0, cwd + "/..")

In [3]:
from tqdm import tqdm
import matplotlib.pylab as plt
import numpy as np

# Loads a Dataset

In [4]:
from koala.utils import convert_to_samples

In [5]:
from datasets import load_dataset
NAME_DATASET = "ag_news"
def change_label(sample, dict_names):
    sample.label = dict_names[sample.label]
    return sample

if NAME_DATASET == "ag_news":
    dict_names_labels = {0: "world",
                        1: "sports",
                        2: "business",
                        3: "science/tech"}
    
    datasets = load_dataset('ag_news')
    dataset_train = datasets["train"]
    dataset_test = datasets["test"]
    train_samples = convert_to_samples(dataset_train)
    test_samples = convert_to_samples(dataset_test)
    train_samples = [change_label(s, dict_names_labels) for s in train_samples]
    test_samples = [change_label(s, dict_names_labels) for s in test_samples]
elif NAME_DATASET == "per_sent":
    dict_names_labels = {0: "negative",
                        1: "neutral",
                        2: "positive"}
    
    datasets = load_dataset("per_sent")
    # We echange test and train because test is not balanced and it's too small to be balanced
    dataset_train = datasets["test_random"]
    dataset_test = datasets["train"]
    
    train_samples = convert_to_samples(dataset_train, 'DOCUMENT', 'TRUE_SENTIMENT')
    test_samples = convert_to_samples(dataset_test,  'DOCUMENT', 'TRUE_SENTIMENT')
    train_samples = [change_label(s, dict_names_labels) for s in train_samples]
    test_samples = [change_label(s, dict_names_labels) for s in test_samples]
    test_samples = balance_data(test_samples)
elif NAME_DATASET == "hate_speech_offensive":
    dict_names_labels = {0: "hate_speech",
                    1: "offensive",
                    2: "neither"}
    
    datasets = load_dataset("hate_speech_offensive")
    dataset_train = datasets["train"].filter(lambda example, indice: indice < 1000, with_indices=True)
    dataset_test = datasets["train"].filter(lambda example, indice: 1000< indice, with_indices=True)
    
    train_samples = convert_to_samples(dataset_train, 'tweet', 'class')
    test_samples = convert_to_samples(dataset_test,  'tweet', 'class')
    train_samples = [change_label(s, dict_names_labels) for s in train_samples]
    
    test_samples = [change_label(s, dict_names_labels) for s in test_samples]
    test_samples = balance_data(test_samples)
elif NAME_DATASET == "yahoo_answers_topics":
    datasets = load_dataset("yahoo_answers_topics")
    dataset_train = datasets["train"].filter(lambda example, indice: indice < 1000, with_indices=True)
    dataset_test = datasets["test"].filter(lambda example, indice: indice < 2000, with_indices=True)
    train_samples = convert_to_samples(dataset_train, "question_title", "topic")
    test_samples = convert_to_samples(dataset_test,"question_title", "topic")
    test_samples = balance_data(test_samples)


Using custom data configuration default
Reusing dataset ag_news (/home/rodri/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)


In [17]:
labels = ["sports", "business"]
train_samples = [s for s in train_samples if s.label in ["sports", "business"]]
test_samples = [s for s in test_samples if s.label in ["sports", "business"]]

In [187]:
train_samples_small =train_samples[:200]
np.random.shuffle(train_samples_small)

In [188]:
len(train_samples), len(test_samples)

(60000, 3800)

In [189]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1000, lowercase=True)

In [190]:
X = vectorizer.fit_transform([s.text for s in train_samples_small])

In [191]:
X.shape

(200, 1000)

In [192]:
y = [labels.index(s.label) for s in train_samples_small]

In [193]:
np.unique(y, return_counts=True)

(array([0, 1]), array([101,  99]))

In [206]:
from sklearn.linear_model import OrthogonalMatchingPursuit
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=200)

In [207]:
omp = omp.fit(X.toarray(), y)

dependence in the dictionary. The requested precision might not have been met.

  copy_X=copy_X, return_path=return_path)


In [208]:
coef = omp.coef_
len(coef.nonzero()[0])

119

In [209]:
labels

['sports', 'business']

In [210]:
word_index = vectorizer.get_feature_names()
[(word_index[i], coef[i]) for i in coef.nonzero()[0]]

[('17', 0.1657167681779543),
 ('200', -0.2864468429227494),
 ('afp', -0.1263572439590028),
 ('africa', 0.6024885380281998),
 ('amateur', -0.33273616284940427),
 ('another', -0.9807416376401726),
 ('ap', -0.009457443522249193),
 ('apple', 0.11635232232799393),
 ('arsenal', -0.4976141965793186),
 ('athens', -0.8354229899620476),
 ('athletes', 0.33556619394619225),
 ('atp', -0.0191475483340566),
 ('bacsik', -0.46435615939824515),
 ('before', 0.8416250736806827),
 ('betting', -0.5000000000000062),
 ('birdie', 0.009457443522254479),
 ('bryant', -0.3872473985844241),
 ('burn', -0.5263461103189893),
 ('canada', 0.19424711886940715),
 ('carlyle', 0.45290035397796197),
 ('chad', -0.23270464465598714),
 ('charley', -0.7991691925215543),
 ('claims', 1.7504301773016562),
 ('clear', 0.2572295856805437),
 ('clubbed', -0.39799406757696887),
 ('coach', -0.2353892787714339),
 ('colander', 0.6099150592135548),
 ('collaboration', 0.1839510791243521),
 ('companies', -0.05063926989389955),
 ('company', 5.1

In [211]:
X_test = vectorizer.transform([s.text for s in test_samples])

In [212]:
y_test = [labels.index(s.label) for s in test_samples]

In [213]:
np.unique(y_test, return_counts=True)

(array([0, 1]), array([1900, 1900]))

In [214]:
X_test.shape

(3800, 1000)

In [215]:
y_predicted_test = X_test * np.transpose(coef)
y_predicted_test = [0 if x < 0.5 else 1 for x in y_predicted_test]

In [216]:
from sklearn import metrics
print(metrics.classification_report(list(y_test), list(y_predicted_test)))

              precision    recall  f1-score   support

           0       0.51      0.96      0.66      1900
           1       0.63      0.07      0.12      1900

    accuracy                           0.51      3800
   macro avg       0.57      0.51      0.39      3800
weighted avg       0.57      0.51      0.39      3800



In [217]:
metrics.accuracy_score(y_test, y_predicted_test)

0.513421052631579