In [1]:
from base import BoWS, OneVsAllGridClassifier
import io

from sklearn.metrics import f1_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from glob import glob

from os import path
import os

from collections import Counter


import gc

In [2]:
def read_texts(filename):
    r"""Return all lines of ``filename`` as a list of strings.

    The file is opened in text mode with ``newline='\n'``: lines are split on
    ``\n`` only and no newline translation is applied, so every returned item
    keeps its original line ending (including a ``\r`` if one is present).
    """
    with io.open(filename, newline='\n') as handle:
        lines = [line for line in handle]
    return lines

def get_array(X, idxs):
    """Collect ``X[idx]`` for every ``idx`` in ``idxs``.

    The result is a new list whose order (and any repetitions) follows
    ``idxs``; ``X`` itself is only indexed, never modified.
    """
    selected = []
    for position in idxs:
        selected.append(X[position])
    return selected
def read_dataset(pathname):
    """Load the documents and labels stored under the directory ``pathname``.

    Reads ``texts.txt`` (one raw line per item, line endings kept) and
    ``score.txt`` (one integer per line) from ``pathname``.

    Returns
    -------
    (texts, scores) : tuple of (list of str, list of int)
        ``scores`` holds each line of ``score.txt`` converted with ``int``.
    """
    texts = read_texts(path.join(pathname, 'texts.txt'))
    # int() ignores surrounding whitespace, so the trailing newline is harmless.
    scores = [int(raw) for raw in read_texts(path.join(pathname, 'score.txt'))]
    return texts, scores
def dump_svmlight_file_gz(X,y,filename):
    """Write ``(X, y)`` to ``filename`` as a gzip-compressed svmlight file.

    Parameters
    ----------
    X, y
        Feature matrix and targets, passed through unchanged to
        ``sklearn.datasets.dump_svmlight_file``.
    filename : str
        Destination path of the gzip file (created or overwritten).

    Feature indices are written one-based (``zero_based=False``).
    """
    # Imported locally: the notebook's import cell provides neither name, so
    # the original body failed with NameError as soon as it was called.
    import gzip
    from sklearn.datasets import dump_svmlight_file

    # dump_svmlight_file expects a file object opened in binary mode.
    with gzip.open(filename, 'wb') as filout:
        dump_svmlight_file(X, y, filout, zero_based=False)
def load_splits_ids(folddir):
    """Load train/test index splits from a text file.

    Every non-blank line describes one split in the form
    ``<train ids>;<test ids>``, where each side is a whitespace-separated
    list of integers. Blank lines (e.g. a trailing empty line) are skipped.

    Parameters
    ----------
    folddir : str
        Path of the splits file (it is passed straight to ``open``, so despite
        the name it must be a file, not a directory).

    Returns
    -------
    list of (list of int, list of int)
        One ``(train_index, test_index)`` pair per split, in file order.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly one ``;`` or contains a
        token that is not an integer.
    """
    splits = []
    # errors='ignore' drops undecodable bytes instead of failing on them.
    with open(folddir, encoding='utf8', errors='ignore') as filin:
        for line in filin:
            if not line.strip():
                # Nothing to parse; unpacking the split below would raise.
                continue
            train_part, test_part = line.split(';')
            train_index = [int(token) for token in train_part.split()]
            test_index = [int(token) for token in test_part.split()]
            splits.append((train_index, test_index))
    return splits

In [22]:
# Dataset used by the single-fold experiment in the following cells.
datasetdir = "../../../../Documentos/datasets/only_docs_scores/irony/"

# Raw documents and their integer labels, in file order.
texts, scores = read_dataset(datasetdir)

# Pre-computed (train ids, test ids) pairs, one per line of the splits file.
train_test_splits = load_splits_ids(
    path.join(datasetdir, 'representations', 'split_5.csv')
)

In [23]:
# Run the whole pipeline once, using only the first pre-computed split.
(train_ids, test_ids) = train_test_splits[0]
X_train = get_array(texts, train_ids)
y_train = get_array(scores, train_ids)

# Fit the project's BoWS representation on the training documents only.
bows = BoWS(min_df=2)
X_train_transformed = bows.fit_transform(X_train, y_train)

# Base estimator and the hyper-parameter grid handed to OneVsAllGridClassifier.
weak_clf = LogisticRegression(random_state=42, max_iter=1000)
weak_params = {'penalty': ['l1', 'l2'], 'class_weight': ['balanced', None], 'solver': ['liblinear'], 'C': [1, 10, 0.1, 0.01]}

# Alternative base estimator, kept for reference.
#weak_clf = LinearSVC(random_state=42)
#weak_params = {'C': [1, 10, 0.1, 0.01]}

# Seeded like weak_clf: an unseeded decision tree may break ties between
# equally good splits differently from run to run, which makes the reported
# scores non-reproducible.
meta_clf = DecisionTreeClassifier(random_state=42)
meta_params = { 'criterion': [ "gini", "entropy" ], 'max_depth': [None, 2, 4, 6], 'min_samples_split': [2,4,6], 'min_samples_leaf': [1, 2, 4, 6] }

oal = OneVsAllGridClassifier( weak_params, weak_clf, meta_params, meta_clf )

# Fit on the training fold; y_pred holds whatever fit_predict returns for it
# (presumably the training-set predictions -- confirm in base.py).
y_pred = oal.fit_predict(X_train_transformed, y_train)

# The test fold goes through the representation fitted on the training fold.
X_test = get_array(texts, test_ids)
X_test_transformed = bows.transform(X_test)

y_test = get_array(scores, test_ids)
gc.collect()

Building class representations: 100%|██████████| 63/63 [00:00<00:00, 1271.40it/s]
  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 3/3 [00:00<00:00, 299.62it/s]
Building class transformation: 100%|██████████| 63/63 [00:00<00:00, 450.03it/s]
Building class transformation: 100%|██████████| 63/63 [00:00<00:00, 427.66it/s]
Building class transformation: 100%|██████████| 63/63 [00:00<00:00, 527.75it/s]
Building class transformation: 100%|██████████| 18/18 [00:00<00:00, 347.31it/s]
Building class transformation: 100%|██████████| 18/18 [00:00<00:00, 644.42it/s]
Building class transformation: 100%|██████████| 18/18 [00:00<00:00, 2130.53it/s]


632

In [20]:
import json

In [21]:
# Appears to be a scratch check that JSON can be written under tmp_output/
# (the payload is a placeholder).
# Create the directory first: open(..., 'w') raises FileNotFoundError when the
# parent directory does not exist, e.g. on a fresh checkout.
os.makedirs('tmp_output', exist_ok=True)
with open('tmp_output/caralho.json', 'w') as fp:
    json.dump({'vai': 'tomar', 'no': 'cu'}, fp)

In [7]:
# Predictions of `oal` for the transformed test documents.
y_pred_test = oal.predict(X_test_transformed)

In [8]:
# Confusion matrix with rows/columns ordered as oal.classes_, divided by the
# number of test samples so each cell is a fraction of the test fold rather
# than a raw count.
confusion_matrix(y_test, y_pred_test, labels=oal.classes_)/len(y_test)

array([[0.33333333, 0.16666667, 0.        ],
       [0.16666667, 0.05555556, 0.        ],
       [0.22222222, 0.        , 0.05555556]])

In [10]:
# Overall F1 of the final predictions on the test fold, micro- then
# macro-averaged.
f1_micr, f1_macr = (
    f1_score(y_test, y_pred_test, average=averaging)
    for averaging in ('micro', 'macro')
)
print("F1_mi: %.3f and F1_ma: %.3f" % (f1_micr, f1_macr))

F1_mi: 0.444 and F1_ma: 0.376


In [11]:
# Score each per-class classifier separately. X_test_transformed is iterated
# and indexed by class label `c`, and `oal.transform_y(y_test, c)` supplies
# the matching targets for that class (presumably a one-vs-all relabelling --
# confirm in base.py).
for c in X_test_transformed:
    y_pred = oal.clf_by_class[c].predict(X_test_transformed[c])
    y_test_transformed = oal.transform_y(y_test, c)
    # f1_score expects (y_true, y_pred). The original call passed them the
    # other way round, which makes scikit-learn's undefined-metric warnings
    # refer to the wrong side (e.g. "no true samples" when the classifier
    # simply never predicted a label).
    f1_micr = f1_score(y_test_transformed, y_pred, average='micro')
    f1_macr = f1_score(y_test_transformed, y_pred, average='macro')
    print("%d F1_mi: %.3f and F1_ma: %.3f" % ( c, f1_micr, f1_macr) )

-1 F1_mi: 0.444 and F1_ma: 0.375
0 F1_mi: 0.778 and F1_ma: 0.438
1 F1_mi: 0.722 and F1_ma: 0.419


  'recall', 'true', average, warn_for)


In [5]:
# List the dataset directories ordered by their number of scores, smallest
# first. Note that the sort key loads every dataset in full (texts and scores)
# just to count its scores.
sorted(glob("../../../../Documentos/datasets/only_docs_scores/*"), key=lambda x: len(read_dataset(x)[1]) )

['../../../../Documentos/datasets/only_docs_scores/irony',
 '../../../../Documentos/datasets/only_docs_scores/sarcasm',
 '../../../../Documentos/datasets/only_docs_scores/aisopos_ntua',
 '../../../../Documentos/datasets/only_docs_scores/nikolaos_ted',
 '../../../../Documentos/datasets/only_docs_scores/sentistrength_bbc',
 '../../../../Documentos/datasets/only_docs_scores/sentistrength_myspace',
 '../../../../Documentos/datasets/only_docs_scores/sentistrength_rw',
 '../../../../Documentos/datasets/only_docs_scores/sentistrength_digg',
 '../../../../Documentos/datasets/only_docs_scores/debate',
 '../../../../Documentos/datasets/only_docs_scores/sentistrength_youtube',
 '../../../../Documentos/datasets/only_docs_scores/sanders',
 '../../../../Documentos/datasets/only_docs_scores/vader_amazon',
 '../../../../Documentos/datasets/only_docs_scores/english_dailabor',
 '../../../../Documentos/datasets/only_docs_scores/vader_twitter',
 '../../../../Documentos/datasets/only_docs_scores/sentistren

In [35]:
# Evaluate the per-class classifiers on every pre-computed fold of every
# dataset, visiting the datasets from smallest to largest (by number of
# scores). The sort key loads each dataset once just to measure it; the
# dataset is then loaded again inside the loop.
for datasetdir in sorted(glob("../../../../Documentos/datasets/only_docs_scores/*"), key=lambda x: len(read_dataset(x)[1]) ):
    texts,scores = read_dataset(datasetdir)
    train_test_splits = load_splits_ids(path.join(datasetdir, 'representations', 'split_5.csv' ))
    dname = path.basename(datasetdir)
    print(dname)
    for f, (train_ids, test_ids) in enumerate(train_test_splits):
        X_train = get_array(texts, train_ids)
        y_train = get_array(scores, train_ids)

        # The representation is fitted on the training fold only.
        bows = BoWS(min_df=2)
        X_train_transformed = bows.fit_transform(X_train, y_train)

        weak_clf = LogisticRegression(random_state=42)
        weak_params = {'penalty': ['l1', 'l2'], 'class_weight': ['balanced', None], 'solver': ['liblinear'], 'C': [1, 10, 0.1, 0.01]}
        #weak_params = {'penalty': ['l2'], 'class_weight': [None], 'solver': ['liblinear'], 'C': [0.01]}
        # Seeded like weak_clf: an unseeded decision tree may break ties
        # between equally good splits differently from run to run, so the
        # fold scores would not be reproducible.
        meta_clf = DecisionTreeClassifier(random_state=42)
        meta_params = { 'criterion': [ "gini", "entropy" ] }

        oal = OneVsAllGridClassifier( weak_params, weak_clf, meta_params, meta_clf )

        y_pred = oal.fit_predict(X_train_transformed, y_train)

        X_test = get_array(texts, test_ids)
        X_test_transformed = bows.transform(X_test)

        y_test = get_array(scores, test_ids)

        print("Fold %d" % f)
        # X_test_transformed is iterated and indexed by class label `c`.
        for c in X_test_transformed:
            y_pred = oal.clf_by_class[c].predict(X_test_transformed[c])
            y_test_transformed = oal.transform_y(y_test, c)
            # f1_score expects (y_true, y_pred). The original call passed
            # them swapped, which makes scikit-learn's undefined-metric
            # warnings refer to the wrong side.
            f1_micr = f1_score(y_test_transformed, y_pred, average='micro')
            f1_macr = f1_score(y_test_transformed, y_pred, average='macro')
            print("\t%d F1_mi: %.3f and F1_ma: %.3f" % ( c, f1_micr, f1_macr) )

        # Release the per-fold objects before the next (possibly larger) fold.
        gc.collect()
    

Building class representations: 100%|██████████| 63/63 [00:00<00:00, 2006.80it/s]
  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 3/3 [00:00<00:00, 436.19it/s]

irony



Building class transformation: 100%|██████████| 63/63 [00:00<00:00, 1072.64it/s]
Building class transformation: 100%|██████████| 63/63 [00:00<00:00, 1207.35it/s]
Building class transformation: 100%|██████████| 63/63 [00:00<00:00, 1159.99it/s]
Building class transformation: 100%|██████████| 18/18 [00:00<00:00, 755.24it/s]
Building class transformation: 100%|██████████| 18/18 [00:00<00:00, 955.30it/s]
Building class transformation: 100%|██████████| 18/18 [00:00<00:00, 2522.55it/s]
  'recall', 'true', average, warn_for)
Building class representations: 100%|██████████| 64/64 [00:00<00:00, 1938.99it/s]
  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 3/3 [00:00<00:00, 455.99it/s]

Fold 0
	-1 F1_mi: 0.611 and F1_ma: 0.579
	0 F1_mi: 0.778 and F1_ma: 0.438
	1 F1_mi: 0.722 and F1_ma: 0.419
irony



Building class transformation: 100%|██████████| 64/64 [00:00<00:00, 1870.34it/s]
Building class transformation: 100%|██████████| 64/64 [00:00<00:00, 1269.58it/s]
Building class transformation: 100%|██████████| 64/64 [00:00<00:00, 1161.81it/s]
Building class transformation: 100%|██████████| 17/17 [00:00<00:00, 1018.84it/s]
Building class transformation: 100%|██████████| 17/17 [00:00<00:00, 3440.11it/s]
Building class transformation: 100%|██████████| 17/17 [00:00<00:00, 931.67it/s]
  'recall', 'true', average, warn_for)
Building class representations: 100%|██████████| 65/65 [00:00<00:00, 2108.46it/s]
  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 3/3 [00:00<00:00, 369.97it/s]

Fold 1
	-1 F1_mi: 0.471 and F1_ma: 0.395
	0 F1_mi: 0.824 and F1_ma: 0.452
	1 F1_mi: 0.706 and F1_ma: 0.414
irony



Building class transformation: 100%|██████████| 65/65 [00:00<00:00, 1256.70it/s]
Building class transformation: 100%|██████████| 65/65 [00:00<00:00, 1297.38it/s]
Building class transformation: 100%|██████████| 65/65 [00:00<00:00, 1139.05it/s]
Building class transformation: 100%|██████████| 16/16 [00:00<00:00, 851.00it/s]
Building class transformation: 100%|██████████| 16/16 [00:00<00:00, 1992.48it/s]
Building class transformation: 100%|██████████| 16/16 [00:00<00:00, 2853.63it/s]
  'recall', 'true', average, warn_for)
Building class representations: 100%|██████████| 66/66 [00:00<00:00, 2016.73it/s]
  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 3/3 [00:00<00:00, 392.27it/s]

Fold 2
	-1 F1_mi: 0.562 and F1_ma: 0.515
	0 F1_mi: 0.812 and F1_ma: 0.448
	1 F1_mi: 0.750 and F1_ma: 0.429
irony



Building class transformation: 100%|██████████| 66/66 [00:00<00:00, 1524.22it/s]
Building class transformation: 100%|██████████| 66/66 [00:00<00:00, 1366.04it/s]
Building class transformation: 100%|██████████| 66/66 [00:00<00:00, 1312.90it/s]
Building class transformation: 100%|██████████| 15/15 [00:00<00:00, 2493.25it/s]
Building class transformation: 100%|██████████| 15/15 [00:00<00:00, 848.81it/s]
Building class transformation: 100%|██████████| 15/15 [00:00<00:00, 875.10it/s]
  'recall', 'true', average, warn_for)
Building class representations: 100%|██████████| 66/66 [00:00<00:00, 1922.47it/s]
  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 3/3 [00:00<00:00, 471.73it/s]

Fold 3
	-1 F1_mi: 0.667 and F1_ma: 0.641
	0 F1_mi: 0.800 and F1_ma: 0.444
	1 F1_mi: 0.733 and F1_ma: 0.423
irony



Building class transformation: 100%|██████████| 66/66 [00:00<00:00, 1227.05it/s]
Building class transformation: 100%|██████████| 66/66 [00:00<00:00, 1305.93it/s]
Building class transformation: 100%|██████████| 66/66 [00:00<00:00, 1222.96it/s]
Building class transformation: 100%|██████████| 15/15 [00:00<00:00, 1370.63it/s]
Building class transformation: 100%|██████████| 15/15 [00:00<00:00, 813.89it/s]
Building class transformation: 100%|██████████| 15/15 [00:00<00:00, 3216.98it/s]
  'recall', 'true', average, warn_for)
Building class representations: 100%|██████████| 75/75 [00:00<00:00, 1804.99it/s]
  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 3/3 [00:00<00:00, 336.56it/s]

Fold 4
	-1 F1_mi: 0.533 and F1_ma: 0.348
	0 F1_mi: 0.800 and F1_ma: 0.444
	1 F1_mi: 0.733 and F1_ma: 0.423
sarcasm



Building class transformation: 100%|██████████| 75/75 [00:00<00:00, 908.29it/s]
Building class transformation: 100%|██████████| 75/75 [00:00<00:00, 1020.13it/s]
Building class transformation: 100%|██████████| 75/75 [00:00<00:00, 1222.09it/s]
Building class transformation: 100%|██████████| 20/20 [00:00<00:00, 3517.97it/s]
Building class transformation: 100%|██████████| 20/20 [00:00<00:00, 1753.69it/s]
Building class transformation: 100%|██████████| 20/20 [00:00<00:00, 3836.02it/s]
  'recall', 'true', average, warn_for)
Building class representations: 100%|██████████| 75/75 [00:00<00:00, 1944.87it/s]
  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 3/3 [00:00<00:00, 481.64it/s]

Fold 0
	-1 F1_mi: 0.700 and F1_ma: 0.697
	0 F1_mi: 0.750 and F1_ma: 0.429
	1 F1_mi: 0.650 and F1_ma: 0.394
sarcasm



Building class transformation: 100%|██████████| 75/75 [00:00<00:00, 1169.22it/s]
Building class transformation: 100%|██████████| 75/75 [00:00<00:00, 1342.06it/s]
Building class transformation: 100%|██████████| 75/75 [00:00<00:00, 1101.16it/s]
Building class transformation: 100%|██████████| 20/20 [00:00<00:00, 1822.70it/s]
Building class transformation: 100%|██████████| 20/20 [00:00<00:00, 3329.21it/s]
Building class transformation: 100%|██████████| 20/20 [00:00<00:00, 2207.94it/s]
  'recall', 'true', average, warn_for)
Building class representations: 100%|██████████| 75/75 [00:00<00:00, 1897.45it/s]
  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 3/3 [00:00<00:00, 498.95it/s]

Fold 1
	-1 F1_mi: 0.400 and F1_ma: 0.394
	0 F1_mi: 0.750 and F1_ma: 0.429
	1 F1_mi: 0.650 and F1_ma: 0.394
sarcasm



Building class transformation: 100%|██████████| 75/75 [00:00<00:00, 1243.18it/s]
Building class transformation: 100%|██████████| 75/75 [00:00<00:00, 1174.39it/s]
Building class transformation: 100%|██████████| 75/75 [00:00<00:00, 1063.72it/s]
Building class transformation: 100%|██████████| 20/20 [00:00<00:00, 1647.31it/s]
Building class transformation: 100%|██████████| 20/20 [00:00<00:00, 1129.08it/s]
Building class transformation: 100%|██████████| 20/20 [00:00<00:00, 1242.68it/s]
  'recall', 'true', average, warn_for)
Building class representations: 100%|██████████| 77/77 [00:00<00:00, 1929.21it/s]
  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 3/3 [00:00<00:00, 450.13it/s]

Fold 2
	-1 F1_mi: 0.500 and F1_ma: 0.500
	0 F1_mi: 0.750 and F1_ma: 0.429
	1 F1_mi: 0.450 and F1_ma: 0.437
sarcasm



Building class transformation: 100%|██████████| 77/77 [00:00<00:00, 1202.29it/s]
Building class transformation: 100%|██████████| 77/77 [00:00<00:00, 1193.02it/s]
Building class transformation: 100%|██████████| 77/77 [00:00<00:00, 1105.69it/s]
Building class transformation: 100%|██████████| 18/18 [00:00<00:00, 1058.51it/s]
Building class transformation: 100%|██████████| 18/18 [00:00<00:00, 2260.20it/s]
Building class transformation: 100%|██████████| 18/18 [00:00<00:00, 1712.89it/s]
  'recall', 'true', average, warn_for)
Building class representations: 100%|██████████| 78/78 [00:00<00:00, 2004.16it/s]
  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 3/3 [00:00<00:00, 463.54it/s]

Fold 3
	-1 F1_mi: 0.389 and F1_ma: 0.387
	0 F1_mi: 0.722 and F1_ma: 0.419
	1 F1_mi: 0.500 and F1_ma: 0.458
sarcasm



Building class transformation: 100%|██████████| 78/78 [00:00<00:00, 814.03it/s]
Building class transformation: 100%|██████████| 78/78 [00:00<00:00, 1145.51it/s]
Building class transformation: 100%|██████████| 78/78 [00:00<00:00, 1153.91it/s]
Building class transformation: 100%|██████████| 17/17 [00:00<00:00, 914.35it/s]
Building class transformation: 100%|██████████| 17/17 [00:00<00:00, 904.87it/s]
Building class transformation: 100%|██████████| 17/17 [00:00<00:00, 360.54it/s]
  'recall', 'true', average, warn_for)
Building class representations:   0%|          | 0/286 [00:00<?, ?it/s]

Fold 4
	-1 F1_mi: 0.471 and F1_ma: 0.471
	0 F1_mi: 0.765 and F1_ma: 0.433
	1 F1_mi: 0.353 and F1_ma: 0.344
stanford_tweets


Building class representations: 100%|██████████| 286/286 [00:00<00:00, 1854.84it/s]
  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 2/2 [00:00<00:00, 277.63it/s]
Building class transformation: 100%|██████████| 286/286 [00:00<00:00, 896.87it/s]
Building class transformation: 100%|██████████| 286/286 [00:00<00:00, 847.67it/s]
Building class transformation: 100%|██████████| 73/73 [00:00<00:00, 957.94it/s]
Building class transformation: 100%|██████████| 73/73 [00:00<00:00, 938.69it/s]
Building class representations: 100%|██████████| 286/286 [00:00<00:00, 2065.28it/s]

Fold 0
	-1 F1_mi: 0.781 and F1_ma: 0.781
	1 F1_mi: 0.781 and F1_ma: 0.780
stanford_tweets



  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 2/2 [00:00<00:00, 297.82it/s]
Building class transformation: 100%|██████████| 286/286 [00:00<00:00, 1010.70it/s]
Building class transformation: 100%|██████████| 286/286 [00:00<00:00, 1015.96it/s]
Building class transformation: 100%|██████████| 73/73 [00:00<00:00, 937.60it/s]
Building class transformation: 100%|██████████| 73/73 [00:00<00:00, 972.82it/s]
Building class representations: 100%|██████████| 288/288 [00:00<00:00, 2073.22it/s]
  self._set_arrayXarray(i, j, x)
Building Models:   0%|          | 0/2 [00:00<?, ?it/s]

Fold 1
	-1 F1_mi: 0.740 and F1_ma: 0.740
	1 F1_mi: 0.808 and F1_ma: 0.807
stanford_tweets


Building Models: 100%|██████████| 2/2 [00:00<00:00, 241.48it/s]
Building class transformation: 100%|██████████| 288/288 [00:00<00:00, 919.35it/s]
Building class transformation: 100%|██████████| 288/288 [00:00<00:00, 1011.52it/s]
Building class transformation: 100%|██████████| 71/71 [00:00<00:00, 925.52it/s]
Building class transformation: 100%|██████████| 71/71 [00:00<00:00, 939.86it/s]
Building class representations: 100%|██████████| 288/288 [00:00<00:00, 2031.88it/s]

Fold 2
	-1 F1_mi: 0.803 and F1_ma: 0.803
	1 F1_mi: 0.817 and F1_ma: 0.816
stanford_tweets



  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 2/2 [00:00<00:00, 274.06it/s]
Building class transformation: 100%|██████████| 288/288 [00:00<00:00, 987.95it/s]
Building class transformation: 100%|██████████| 288/288 [00:00<00:00, 1043.50it/s]
Building class transformation: 100%|██████████| 71/71 [00:00<00:00, 752.86it/s]
Building class transformation: 100%|██████████| 71/71 [00:00<00:00, 935.03it/s]
Building class representations: 100%|██████████| 288/288 [00:00<00:00, 2041.84it/s]

Fold 3
	-1 F1_mi: 0.704 and F1_ma: 0.704
	1 F1_mi: 0.648 and F1_ma: 0.643
stanford_tweets



  self._set_arrayXarray(i, j, x)
Building Models: 100%|██████████| 2/2 [00:00<00:00, 288.81it/s]
Building class transformation: 100%|██████████| 288/288 [00:00<00:00, 998.56it/s]
Building class transformation: 100%|██████████| 288/288 [00:00<00:00, 959.85it/s]


KeyboardInterrupt: 

In [None]:
# For every dataset (in path order) and every pre-computed fold: print the
# training label distribution, then fit the representation and the ensemble.
# Nothing is scored here; the test fold is only transformed.
for datasetdir in sorted(glob("../../../../Documentos/datasets/only_docs_scores/*")):
    print(datasetdir)
    train_test_splits = load_splits_ids(path.join(datasetdir, 'representations', 'split_5.csv' ))
    texts,scores = read_dataset(datasetdir)
    for f, (train_ids, test_ids) in enumerate(train_test_splits):
        X_train = get_array(texts, train_ids)
        y_train = get_array(scores, train_ids)

        # Class balance of this training fold.
        print(Counter(y_train))

        bows = BoWS(min_df=2)
        X_train_transformed = bows.fit_transform(X_train, y_train)

        weak_clf = LogisticRegression(random_state=42)
        weak_params = {'penalty': ['l1', 'l2'], 'class_weight': ['balanced', None], 'solver': ['liblinear'], 'C': [1, 10, 0.1, 0.01]}
        #weak_params = {'penalty': ['l2'], 'class_weight': [None], 'solver': ['liblinear'], 'C': [0.01]}
        # Seeded like weak_clf so repeated runs build the same tree; an
        # unseeded decision tree may break ties between equally good splits
        # differently from run to run.
        meta_clf = DecisionTreeClassifier(random_state=42)
        meta_params = { 'criterion': [ "gini", "entropy" ] }

        oal = OneVsAllGridClassifier( weak_params, weak_clf, meta_params, meta_clf )

        y_pred = oal.fit_predict(X_train_transformed, y_train)

        X_test = get_array(texts, test_ids)
        X_test_transformed = bows.transform(X_test)

        y_test = get_array(scores, test_ids)
        # Release the per-fold objects before moving on.
        gc.collect()


In [None]:
# Inspect the most recently computed training representation; which fold it
# belongs to depends on which cell was executed last.
X_train_transformed