In [1]:
# Enable IPython's autoreload extension so that imported modules are reloaded
# every time before executing the Python code typed (mode 2 = all modules).
%load_ext autoreload
%autoreload 2

In [18]:
import os
import re
import json
from urllib.parse import urljoin
from pathlib import Path
from itertools import chain

from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import datetime

from sklearn.cluster import AgglomerativeClustering, KMeans
import scipy.cluster.hierarchy as shc
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import sklearn.model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.naive_bayes import ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
import sklearn.metrics

import cafeconmiel.data.metadata as metadata
import cafeconmiel.data.token_counts as token_counts
import cafeconmiel.data.manual_process as mprocess
import cafeconmiel.utils.paths as paths_utils
import cafeconmiel.utils.metrics as metrics
import cafeconmiel.visualization.visualize as viz

from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Build the project path helper once and keep short aliases for the two data
# folders referenced by the cells below.
paths = paths_utils.ProjectPaths()
interim_data, raw_data = paths.interim_data, paths.raw_data

In [4]:
# Load the corpus metadata file; its top-level keys are iterated below as the
# names of the per-corpus folders under `interim_data`.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default.
with open(paths.ext_data / 'corpora.json', encoding='utf-8') as f:
    corpus_metadata = json.load(f)

# Load pre-processed data

In [6]:
corpus_name = 'all'
list_df = []
# One folder per corpus key; every JSON file in it is one document record.
for c in corpus_metadata.keys(): # [k for k in corpus_metadata.keys() if k != 'postscriptum']:
    corpus_dir = interim_data / c
    records = []
    # Sorted so the row order does not depend on the filesystem's listing order.
    for path in sorted(corpus_dir.glob('*.json')):
        # Explicit encoding: do not rely on the platform's locale default.
        with open(path, encoding='utf-8') as f:
            d = json.load(f)
        # The file's modification time is stored as the retrieval timestamp.
        d['retrieved_at'] = datetime.datetime.fromtimestamp(path.stat().st_mtime)
        records.append(d)
    if not records:
        # Without any record the frame has no 'text' column and the filter
        # below would raise a KeyError; report and move on instead.
        print(f'no JSON documents found in {corpus_dir}, skipping')
        continue
    df = pd.DataFrame.from_records(records)
    # Keep only documents that actually have a text, indexed by their id.
    df = df.loc[df['text'].notnull()].set_index('meta_id')
    df = metadata.normalize(df)
    df['corpus'] = c
    list_df.append(df)
all_docs_df = pd.concat(list_df, axis=0) # , join='inner')

In [7]:
# remove duplicates between codea and charta
# `groupby('meta_id').first()` keeps, for each id, the first non-null value of
# every column (in concatenation order); `drop_duplicates()` then removes rows
# whose column values are all identical -- the index is not part of that
# comparison.
all_docs_df = all_docs_df.groupby('meta_id').first().drop_duplicates()

In [8]:
# Flag private letters from the free-text document type. `str.match` anchors
# at the start of the string; `na=False` makes documents without a doc_type
# explicitly non-private instead of propagating missing values through `|`.
all_docs_df['is_private'] = (
    all_docs_df['doc_type'].str.match('carta(s|) privada(s|)', na=False)
    | (all_docs_df['doc_type'] == 'epistolario')
    | all_docs_df['doc_type'].str.match('carta(s|) particular(es|)', na=False)
    | all_docs_df['doc_type'].str.match('carta(s|) personal(es|)', na=False)
    # English labels of the form "<kind> letter, ...": keep the listed kinds.
    | all_docs_df['doc_type'].str.extract('(.*) letter,')[0].isin(
        ['personal', 'family', 'friendship', 'love']
    )
)
# A document counts as BAL when its region ends with 'Baleares' or when it
# comes from the 'corpusmallorca' corpus. `na=False` replaces the former
# `.fillna(False)` on the object-dtype result.
all_docs_df['is_bal'] = (
    all_docs_df['region'].str.endswith('Baleares', na=False)
    | (all_docs_df['corpus'] == 'corpusmallorca')
)

# Distinguish BAL / PEN and legal / letters

In [9]:
# Documents dated from 1700 (inclusive) to 1800 (exclusive).
time_mask = all_docs_df['year'].ge(1700) & all_docs_df['year'].lt(1800)

# Private letters only (see the 'is_private' column).
# Earlier variant, kept for reference:
#   all_docs_df['doc_type'].str.match('carta(s|) privada(s|)')
#   | (all_docs_df['doc_type'] == 'epistolario')
doc_type_mask = all_docs_df['is_private']

# Regions excluded from the non-BAL side of the comparison.
biling_regions = (
    'Valencia', 'Castellón', 'Alicante', 'Tarragona', 'Gerona', 'Barcelona',
    'Ulldecona', 'Cardedeu', 'S.P.', 'La Coruña', 'Lugo', 'Pontevedra',
    'Orense', 'Vitoria', 'Álava', 'Vizcaya', 'Gipuzcoa', 'Navarra',
)
bal_mask = all_docs_df['corpus'] == 'corpusmallorca'
# Everything from corpusmallorca, plus any document outside the listed regions.
# (A further restriction `& (all_docs_df['country'] == 'España')` is disabled.)
not_biling_mask = ~all_docs_df['region'].isin(biling_regions)
region_mask = bal_mask | not_biling_mask

In [10]:
# Keep the rows satisfying all three masks; `.copy()` so that the column
# assignment below does not touch `all_docs_df`.
raw_docs_df = all_docs_df.loc[time_mask & doc_type_mask & region_mask].copy()
# NOTE(review): this overwrites the 'is_bal' column inherited from
# `all_docs_df` with the corpus-only `bal_mask`, so rows flagged only through
# their 'region' value are no longer marked -- confirm this is intended.
raw_docs_df['is_bal'] =  bal_mask.reindex(raw_docs_df.index)
raw_docs_df.shape

(1448, 23)

1. Integrate revised texts
2. Remove punctuation
3. Substitute abbreviations, case insensitive
4. Make global counts
5. Remove too frequent and too infrequent
6. Remove proper nouns from given list

## Revised texts

In [21]:
# Folders under interim_data/private_letters that hold the revised texts.
revised_dirs = [
    interim_data / 'private_letters' / folder_name
    for folder_name in ('PEN_revised', 'BAL_27-09-2023')
]
# Merge the revised texts into the selected documents.
docs_df = mprocess.integrate_revisions(raw_docs_df, revised_dirs)

integrated 498 revised texts


## Filter words

In [50]:
# Step 1: turn each punctuation mark into a space, then collapse runs of
# spaces into a single one. The pattern string is split on whitespace into one
# regex per punctuation mark.
docs_df['text'] = (
    docs_df['text']
    .replace(regex=r"\. , ; \( \) \? ! :".split(), value=' ')
    .replace(regex=r'[ ]{2,}', value=' ')
)

# Step 2: expand abbreviations listed in substitutions.csv ('from' -> 'to'),
# matching whole words regardless of case.
subs = pd.read_csv(paths.ext_data / 'substitutions.csv').set_index('from')['to'].to_dict()
# NOTE(review): the keys are inserted into the pattern unescaped, so any regex
# metacharacter in a 'from' entry is interpreted as regex -- confirm intended.
regex = [
    re.compile(r'\b' + abbreviation + r'\b', flags=re.IGNORECASE)
    for abbreviation in subs
]
docs_df['text'] = docs_df['text'].replace(regex=regex, value=list(subs.values()))

In [162]:
# Per-document word counts, then the total of the 'count' column per document.
words_count_by_doc = token_counts.count_by_doc(docs_df)
nr_tokens_by_doc = words_count_by_doc.groupby('meta_id')['count'].agg('sum')

In [142]:
# Disabled: random sub-sampling of documents up to a cumulative token budget
# (`max_nr_tokens`, not defined in this notebook section).
# rng = np.random.default_rng(1)
# shuffled_docs = rng.permutation(nr_tokens_by_doc.index)
# selected_codea_docs = shuffled_docs[nr_tokens_by_doc.loc[shuffled_docs].cumsum() < max_nr_tokens]
# With the sub-sampling disabled, every document present in `nr_tokens_by_doc`
# is selected.
selected_PEN_docs = nr_tokens_by_doc.index

In [143]:
# Ids of the documents flagged as BAL.
bal_idc = docs_df.index[docs_df['is_bal']]
# Restrict the corpus to the BAL documents plus the selected other documents.
kept_doc_ids = bal_idc.union(selected_PEN_docs)
docs_df = docs_df.loc[kept_doc_ids].copy()

In [163]:
# Corpus-wide word statistics aggregated from the per-document counts.
# A second table computed from a fresh `count_by_doc(docs_df)` used to be
# assigned to the same name and immediately overwritten; that dead computation
# is dropped, and the value that ends up in `global_counts` is unchanged.
global_counts = token_counts.doc_counts_to_global(words_count_by_doc)

In [None]:
# Export the words whose 'prop_upper' value reaches the threshold, one per
# line (presumably the share of capitalised occurrences -- confirm in
# `token_counts`).
prop_upper = 0.8
proper_noun_candidates = global_counts.index[global_counts['prop_upper'] >= prop_upper]
# The file name carries the complementary percentage (e.g. 0.8 -> "..._20.txt").
proper_nouns_path = paths.processed_data / f'public_proper_nouns_{100-100*prop_upper:.0f}.txt'
# Explicit UTF-8 so accented words are written identically on every platform.
proper_nouns_path.write_text('\n'.join(proper_noun_candidates), encoding='utf-8')

In [73]:
# Recompute the counts on the current `docs_df` and flag the words to keep.
words_count_by_doc = token_counts.count_by_doc(docs_df)
global_counts = token_counts.doc_counts_to_global(words_count_by_doc)
# global_counts = token_counts.word_mask(global_counts, min_df=5, max_df=0.5, upper_th=0.4)
# Whitespace-separated list of words to exclude. Read explicitly as UTF-8 so
# accented entries do not depend on the platform's locale default.
# NOTE(review): this list is read from `ext_data`, whereas the export cell
# writes its candidates to `processed_data` -- presumably a curated copy; confirm.
proper_nouns = (paths.ext_data / 'public_proper_nouns_20.txt').read_text(encoding='utf-8').split()
global_counts = token_counts.word_mask(global_counts, min_df=5, max_df=0.5, to_exclude=proper_nouns)
# TODO, change params
# global_counts = token_counts.word_mask(global_counts, min_df=0, max_df=1.0, upper_th=1.1)
# Number of words retained by the mask.
print(global_counts['word_mask'].sum())
# Keep only the masked-in words in the per-document counts.
normed_words_count_by_doc = token_counts.filter_doc_counts(
    words_count_by_doc, global_counts['word_mask']
)
normed_words_count_by_doc.head()

1956


Unnamed: 0_level_0,Unnamed: 1_level_0,count,word_mask
meta_id,word_lower,Unnamed: 2_level_1,Unnamed: 3_level_1
AMCV02_01,amado,1,True
AMCV02_01,bien,1,True
AMCV02_01,buena,1,True
AMCV02_01,bueno,1,True
AMCV02_01,casa,1,True


In [146]:
# Counts used to build the document/term matrix.
doc_counts = normed_words_count_by_doc
# doc_counts = ngram_doc_counts
# Convert the multi-indexed 'count' column into a sparse COO matrix; `to_coo`
# also returns the row labels (`docs`) and column labels (`tokens`), sorted
# because of `sort_labels=True`.
counts_mat, docs, tokens = doc_counts['count'].astype('Sparse').sparse.to_coo(sort_labels=True)
# Document metadata re-ordered to follow the matrix rows.
clustering_df = docs_df.loc[docs].copy()

## Classification

In [None]:
# Binary target: 1 where 'is_bal' is true, 0 otherwise.
labels = clustering_df['is_bal'].astype(int)
# Number of LSA components; 0 skips the dimensionality reduction below.
n_components = 0

# TF-IDF re-weighting of the raw count matrix (work on a copy).
X = counts_mat.copy()
transformer = TfidfTransformer(use_idf=True, smooth_idf=False)
X = transformer.fit_transform(X)

print("n_samples: {}, n_features: {}".format(*X.shape))
print()

if n_components:
    print("Performing dimensionality reduction using LSA")
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    # The default SVD solver is randomized: fix the seed (same value as the
    # cross-validation splitter) so the reduced space is reproducible.
    svd = TruncatedSVD(n_components, random_state=123)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)

    explained_variance = svd.explained_variance_ratio_.sum()
    print(
        "Explained variance of the SVD step: {}%".format(int(explained_variance * 100))
    )

# Names of the sklearn.metrics scoring functions reported further down; with
# the '_score' suffix removed they are also used as cross-validation scorers.
metric_names = [
    'accuracy_score', 'balanced_accuracy_score', 'f1_score', 'precision_score',
    'recall_score', 'jaccard_score'
]

### Unsupervised clustering

In [None]:
# best scores with 10 components
true_k = 2
# Seeded so that the cluster assignment (and which cluster gets label 0) is
# reproducible between runs.
km = KMeans(n_clusters=true_k, random_state=123)

print("Clustering sparse data with %s" % km)
km.fit(X)
print()

# The clustering scores live in `sklearn.metrics`; the bare name `metrics` in
# this notebook is bound to the project module `cafeconmiel.utils.metrics`.
print("Homogeneity: %0.3f" % sklearn.metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % sklearn.metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % sklearn.metrics.v_measure_score(labels, km.labels_))
print("Adjusted MI: %.3f" % sklearn.metrics.adjusted_mutual_info_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" % sklearn.metrics.adjusted_rand_score(labels, km.labels_))
print(
    "Silhouette Coefficient: %0.3f"
    % sklearn.metrics.silhouette_score(X, km.labels_)
)

# Express the centroids in the original word space so that they can be read
# per token (the SVD projection is undone when LSA was applied).
if n_components:
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
else:
    original_space_centroids = km.cluster_centers_

In [None]:
# Rows: true label (0 = notBAL, 1 = BAL); columns: cluster id from KMeans.
confusion = sklearn.metrics.confusion_matrix(labels.values, km.labels_)
# Cluster ids are arbitrary: if the anti-diagonal dominates, cluster 0
# corresponds to BAL and the cluster ids must be flipped to match the labels.
label_swapped = confusion[0, 0] + confusion[1, 1] < confusion[0, 1] + confusion[1, 0]
if label_swapped:
    print('label 0 means BAL')
    confusion = confusion[:, ::-1]
    pred_labels = 1 - km.labels_
else:
    # Only flip the predictions when the clusters are actually swapped;
    # flipping unconditionally would score inverted predictions here.
    pred_labels = km.labels_
print(f'True BAL: {confusion[1, 1]}, True notBAL: {confusion[0, 0]}, False BAL: {confusion[0, 1]}, False notBAL: {confusion[1, 0]}, ')
# Scoring functions are looked up in sklearn.metrics (the bare name `metrics`
# is the project module in this notebook).
for mn in metric_names:
    print(f"{mn}: {getattr(sklearn.metrics, mn)(labels.values, pred_labels):.3f}")

In [93]:
# Name the two centroid columns according to which cluster id stands for BAL.
cols = ['BAL', 'notBAL'] if label_swapped else ['notBAL', 'BAL']
# One row per token, one column per cluster centroid.
words_dist = pd.DataFrame(original_space_centroids.T, index=tokens, columns=cols)
# Squared gap between the two centroids for every token.
words_dist['dist'] = words_dist['BAL'].sub(words_dist['notBAL']).pow(2)
# Tokens weighted more heavily in the BAL centroid.
bal_word_mask = words_dist['BAL'] > words_dist['notBAL']

In [None]:
# The 20 BAL-leaning tokens with the largest centroid gap.
print('BAL words:')
words_dist[bal_word_mask].nlargest(20, 'dist')

### Supervised classification

In [148]:
# Stratified 5-fold split, shuffled with a fixed seed.
cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
# Classifiers compared below. The estimators that draw random numbers during
# fitting are seeded like the splitter so that scores and word rankings can be
# reproduced from a fresh kernel.
estimators = (
    # ComplementNB(alpha=0.1), MultinomialNB, more for counts
    RidgeClassifier(tol=1e-2, solver="auto"),
    Perceptron(penalty='l2'),
    Perceptron(penalty='l1'),
    PassiveAggressiveClassifier(max_iter=50, random_state=123),
    RandomForestClassifier(random_state=123),
)

In [149]:
# Dense document/word matrix in the original word space (the SVD projection is
# undone when LSA was applied).
words_mat = svd.inverse_transform(X) if n_components else X.toarray()
# Mean word vector of the BAL documents and of all the others.
bal_center, notbal_center = (
    words_mat[row_mask, :].mean(axis=0)
    for row_mask in (clustering_df['is_bal'], ~clustering_df['is_bal'])
)
words_dist = pd.DataFrame({'BAL': bal_center, 'notBAL': notbal_center}, index=tokens)
# Tokens whose mean weight is higher in BAL documents.
bal_word_mask = words_dist['BAL'] > words_dist['notBAL']

In [3]:
# NOTE(review): stand-alone expression whose value is not assigned or used by
# any other cell shown here -- looks like a leftover scratch cell.
'{a}'.format(a=1)

'1'

In [150]:
# with all of codea
for est in estimators:
    res = sklearn.model_selection.cross_validate(
        est,
        X,
        labels,
        cv=cv,
        scoring=[mn.replace('_score', '') for mn in metric_names],
        return_estimator=True,
    )
    print(est)
    display(pd.DataFrame.from_dict(res))
    words_dist['dist'] = 0
    for e in res['estimator']:
        coefs = getattr(e, 'feature_importances_', getattr(e, 'coef_', None)).flatten()
        if n_components:
            words_dist['dist'] += (coefs * svd.components_.T).sum(axis=1)
        else:
            words_dist['dist'] += coefs
    top_bal_words = words_dist.loc[bal_word_mask].nlargest(n=20, columns='dist').index
    top_notbal_words = words_dist.loc[~bal_word_mask].nlargest(n=20, columns='dist').index
    print('BAL: ', top_bal_words.to_list())
    print('not BAL: ', top_notbal_words.to_list())

RidgeClassifier(tol=0.01)


Unnamed: 0,fit_time,score_time,estimator,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_jaccard
0,0.015934,0.008281,RidgeClassifier(tol=0.01),0.93007,0.89899,0.878049,0.947368,0.818182,0.782609
1,0.00972,0.006734,RidgeClassifier(tol=0.01),0.943662,0.934137,0.909091,0.909091,0.909091,0.833333
2,0.008498,0.006711,RidgeClassifier(tol=0.01),0.915493,0.87616,0.85,0.944444,0.772727,0.73913
3,0.008458,0.006687,RidgeClassifier(tol=0.01),0.908451,0.864796,0.835443,0.942857,0.75,0.717391
4,0.008433,0.006537,RidgeClassifier(tol=0.01),0.950704,0.918605,0.911392,1.0,0.837209,0.837209


BAL:  ['á', 'mia', 'servir', 'suplico', 'ermano', 'mismo', 'satisfaccion', 'buena', 'cartas', 'servidora', 'dentro', 'mucho', 'tuviese', 'memorias', 'he', 'paraque', 'ha', 'escrito', 'mutxas', 'poco']
not BAL:  ['â', 'ninguna', 'dichos', 'aya', 'dan', 'oi', 'molino', 'asta', 'qual', 'algo', 'arroba', 'dezir', 'siento', 'deque', 'treinta', 'enbarazo', 'firmo', 'saben', 'tios', 'les']
Perceptron(penalty='l2')


Unnamed: 0,fit_time,score_time,estimator,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_jaccard
0,0.003165,0.006671,Perceptron(penalty='l2'),0.895105,0.886364,0.835165,0.808511,0.863636,0.716981
1,0.002738,0.006575,Perceptron(penalty='l2'),0.915493,0.907468,0.866667,0.847826,0.886364,0.764706
2,0.002725,0.006527,Perceptron(penalty='l2'),0.922535,0.881262,0.860759,0.971429,0.772727,0.755556
3,0.002663,0.006537,Perceptron(penalty='l2'),0.887324,0.84949,0.804878,0.868421,0.75,0.673469
4,0.002723,0.006539,Perceptron(penalty='l2'),0.929577,0.910031,0.880952,0.902439,0.860465,0.787234


BAL:  ['á', 'servir', 'mismo', 'memorias', 'cartas', 'suplico', 'buena', 'dentro', 'ermano', 'mia', 'poco', 'servidora', 'nuevo', 'essa', 'mucho', 'satisfaccion', 'informe', 'cuya', 'oraciones', 'mutxas']
not BAL:  ['deque', 'dan', 'echas', 'dezir', 'siento', 'março', 'ninguna', 'arroba', 'molino', 'aya', 'pase', 'estamos', 'algo', 'mejor', 'enbarazo', 'dichos', 'cosa', 'sean', 'grandes', 'està']
Perceptron(penalty='l1')


Unnamed: 0,fit_time,score_time,estimator,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_jaccard
0,0.008647,0.006655,Perceptron(penalty='l1'),0.888112,0.875,0.822222,0.804348,0.840909,0.698113
1,0.006596,0.006561,Perceptron(penalty='l1'),0.93662,0.94782,0.905263,0.843137,0.977273,0.826923
2,0.006451,0.006513,Perceptron(penalty='l1'),0.880282,0.825603,0.779221,0.909091,0.681818,0.638298
3,0.006275,0.006535,Perceptron(penalty='l1'),0.84507,0.837662,0.765957,0.72,0.818182,0.62069
4,0.006448,0.006562,Perceptron(penalty='l1'),0.901408,0.896406,0.844444,0.808511,0.883721,0.730769


BAL:  ['suplico', 'á', 'dentro', 'servir', 'buena', 'ermano', 'cartas', 'satisfaccion', 'tuviese', 'essa', 'informe', 'mucho', 'todo', 'sirva', 'quedar', 'etc', 'mismo', 'sabe', 'poco', 'escrito']
not BAL:  ['pase', 'algo', 'falta', 'deque', 'mejor', 'molino', 'aya', 'pro', 'vino', 'dan', 'avia', 'misa', 'março', 'echas', 'â', 'qual', 'menos', 'oi', 'estoi', 'dichos']
PassiveAggressiveClassifier(max_iter=50)


Unnamed: 0,fit_time,score_time,estimator,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_jaccard
0,0.003939,0.006621,PassiveAggressiveClassifier(max_iter=50),0.93007,0.911616,0.883721,0.904762,0.863636,0.791667
1,0.003722,0.007281,PassiveAggressiveClassifier(max_iter=50),0.943662,0.940399,0.911111,0.891304,0.931818,0.836735
2,0.003599,0.006543,PassiveAggressiveClassifier(max_iter=50),0.93662,0.916512,0.894118,0.926829,0.863636,0.808511
3,0.003534,0.006509,PassiveAggressiveClassifier(max_iter=50),0.922535,0.893785,0.86747,0.923077,0.818182,0.765957
4,0.003677,0.006592,PassiveAggressiveClassifier(max_iter=50),0.950704,0.938337,0.917647,0.928571,0.906977,0.847826


BAL:  ['á', 'servir', 'suplico', 'buena', 'mia', 'mismo', 'ermano', 'dentro', 'cartas', 'escrito', 'memorias', 'satisfaccion', 'poco', 'essa', 'paraque', 'informe', 'nuevo', 'estava', 'sirva', 'dia']
not BAL:  ['dichos', 'dan', 'dezir', 'tios', 'aya', 'siento', 'deque', 'algo', 'ninguna', 'oi', 'qual', 'firmo', 'podido', 'avia', 'entender', 'les', 'echas', 'cual', 'saben', 'echar']
RandomForestClassifier()


Unnamed: 0,fit_time,score_time,estimator,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_jaccard
0,0.388944,0.029161,"(DecisionTreeClassifier(max_features='sqrt', r...",0.923077,0.881313,0.860759,0.971429,0.772727,0.755556
1,0.387084,0.028135,"(DecisionTreeClassifier(max_features='sqrt', r...",0.887324,0.84949,0.804878,0.868421,0.75,0.673469
2,0.383642,0.02865,"(DecisionTreeClassifier(max_features='sqrt', r...",0.908451,0.852273,0.826667,1.0,0.704545,0.704545
3,0.38066,0.028772,"(DecisionTreeClassifier(max_features='sqrt', r...",0.922535,0.881262,0.860759,0.971429,0.772727,0.755556
4,0.383358,0.028572,"(DecisionTreeClassifier(max_features='sqrt', r...",0.894366,0.838736,0.8,0.9375,0.697674,0.666667


BAL:  ['á', 'mia', 'ha', 'he', 'suplico', 'servir', 'ruego', 'servidor', 'seguro', 'mayor', 'mandarme', 'servidora', 'preceptos', 'perdida', 'al', 'esse', 'espero', 'haver', 'conserve', 'buena']
not BAL:  ['mui', 'dela', 'mio', 'ala', 'ai', 'te', 'quien', 'vn', 'deseo', 'este', 'salud', 'ser', 'â', 'pido', 'asi', 'sea', 'dos', 'pues', 'demas', 'tiene']


In [100]:
# with subssample of codea
for est in estimators:
    res = sklearn.model_selection.cross_validate(
        est,
        X,
        labels,
        cv=cv,
        scoring=[mn.replace('_score', '') for mn in metric_names],
        return_estimator=True,
    )
    print(est)
    display(pd.DataFrame.from_dict(res))
    words_dist['dist'] = 0
    for e in res['estimator']:
        coefs = getattr(e, 'feature_importances_', getattr(e, 'coef_', None)).flatten()
        if n_components:
            words_dist['dist'] += (coefs * svd.components_.T).sum(axis=1)
        else:
            words_dist['dist'] += coefs
    top_bal_words = words_dist.loc[bal_word_mask].nlargest(n=20, columns='dist').index
    top_notbal_words = words_dist.loc[~bal_word_mask].nlargest(n=20, columns='dist').index
    print('BAL: ', top_bal_words.to_list())
    print('not BAL: ', top_notbal_words.to_list())

RidgeClassifier(tol=0.01)


Unnamed: 0,fit_time,score_time,estimator,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_jaccard
0,0.008242,0.006877,RidgeClassifier(tol=0.01),0.892473,0.889842,0.880952,0.925,0.840909,0.787234
1,0.00707,0.006523,RidgeClassifier(tol=0.01),0.913978,0.916048,0.913043,0.875,0.954545,0.84
2,0.00691,0.006522,RidgeClassifier(tol=0.01),0.924731,0.925093,0.921348,0.911111,0.931818,0.854167
3,0.007025,0.006538,RidgeClassifier(tol=0.01),0.923913,0.920455,0.91358,1.0,0.840909,0.840909
4,0.006877,0.006523,RidgeClassifier(tol=0.01),0.934783,0.934504,0.930233,0.930233,0.930233,0.869565


BAL:  ['á', 'mia', 'servir', 'suplico', 'servidora', 'consejo', 'mismo', 'al', 'buena', 'una', 'ermano', 'dependencia', 'corazon', 'dentro', 'cartas', 'un', 'he', 'â', 'quando', 'é']
not BAL:  ['qual', 'ay', 'ninguna', 'dichos', 'falta', 'porque', 'te', 'tus', 'tierras', 'era', 'amas', 'ya', 'algo', 'antes', 'nose', 'dichas', 'contra', 'oi', 'vino', 'deque']
Perceptron(penalty='l2')


Unnamed: 0,fit_time,score_time,estimator,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_jaccard
0,0.003039,0.007505,Perceptron(penalty='l2'),0.903226,0.903525,0.898876,0.888889,0.909091,0.816327
1,0.002344,0.006461,Perceptron(penalty='l2'),0.892473,0.894481,0.891304,0.854167,0.931818,0.803922
2,0.002313,0.006453,Perceptron(penalty='l2'),0.913978,0.913729,0.909091,0.909091,0.909091,0.833333
3,0.00234,0.006485,Perceptron(penalty='l2'),0.826087,0.825758,0.818182,0.818182,0.818182,0.692308
4,0.002203,0.006476,Perceptron(penalty='l2'),0.880435,0.880636,0.873563,0.863636,0.883721,0.77551


BAL:  ['á', 'mia', 'servir', 'suplico', 'buena', 'cartas', 'dentro', 'al', 'sirva', 'ermano', 'estava', 'servirte', 'servidora', 'consejo', 'escrito', 'quales', 'etc', 'mismo', 'dependencia', 'nuevo']
not BAL:  ['vino', 'ay', 'qual', 'era', 'octubre', 'falta', 'ninguna', 'tus', 'dichos', 'paciencia', 'cobrar', 'mucha', 'algo', 'fui', 'consta', 'esto', 'sean', 'ños', 'ter', 'cuando']
Perceptron(penalty='l1')


Unnamed: 0,fit_time,score_time,estimator,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_jaccard
0,0.004996,0.006566,Perceptron(penalty='l1'),0.88172,0.877319,0.864198,0.945946,0.795455,0.76087
1,0.005148,0.006523,Perceptron(penalty='l1'),0.83871,0.842301,0.842105,0.784314,0.909091,0.727273
2,0.004669,0.006516,Perceptron(penalty='l1'),0.860215,0.862709,0.860215,0.816327,0.909091,0.754717
3,0.005213,0.006459,Perceptron(penalty='l1'),0.847826,0.844697,0.829268,0.894737,0.772727,0.708333
4,0.005265,0.006471,Perceptron(penalty='l1'),0.880435,0.876364,0.864198,0.921053,0.813953,0.76087


BAL:  ['á', 'suplico', 'servir', 'consejo', 'mia', 'dentro', 'sirva', 'buena', 'quando', 'paraque', 'essa', 'estava', 'escrito', 'todas', 'ruego', 'parte', 'deve', 'quales', 'al', 'cartas']
not BAL:  ['dichos', 'vino', 'fuese', 'mal', 'les', 'qual', 'tu', 'pase', 'algo', 'ya', 'entender', 'te', 'fineza', 'estando', 'hijos', 'oi', 'asta', 'arroba', 'falta', 'deque']
PassiveAggressiveClassifier(max_iter=50)


Unnamed: 0,fit_time,score_time,estimator,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_jaccard
0,0.003586,0.006716,PassiveAggressiveClassifier(max_iter=50),0.892473,0.891002,0.883721,0.904762,0.863636,0.791667
1,0.002773,0.007287,PassiveAggressiveClassifier(max_iter=50),0.903226,0.905844,0.903226,0.857143,0.954545,0.823529
2,0.002673,0.006487,PassiveAggressiveClassifier(max_iter=50),0.892473,0.89564,0.893617,0.84,0.954545,0.807692
3,0.002671,0.006598,PassiveAggressiveClassifier(max_iter=50),0.913043,0.910985,0.904762,0.95,0.863636,0.826087
4,0.002817,0.006502,PassiveAggressiveClassifier(max_iter=50),0.923913,0.9243,0.91954,0.909091,0.930233,0.851064


BAL:  ['á', 'mia', 'servir', 'suplico', 'buena', 'dentro', 'al', 'ermano', 'consejo', 'sirva', 'corazon', 'estava', 'nuevo', 'servidora', 'mismo', 'memorias', 'cartas', 'escrito', 'quando', 'informe']
not BAL:  ['ay', 'qual', 'dichos', 'vino', 'falta', 'porque', 'era', 'paciencia', 'ninguna', 'tus', 'tierras', 'fuese', 'antes', 'cobrar', 'entender', 'octubre', 'deque', 'avia', 'oi', 'mal']
RandomForestClassifier()


Unnamed: 0,fit_time,score_time,estimator,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_jaccard
0,0.289724,0.025057,"(DecisionTreeClassifier(max_features='sqrt', r...",0.903226,0.901206,0.894118,0.926829,0.863636,0.808511
1,0.29092,0.026068,"(DecisionTreeClassifier(max_features='sqrt', r...",0.946237,0.94666,0.94382,0.933333,0.954545,0.893617
2,0.279116,0.025256,"(DecisionTreeClassifier(max_features='sqrt', r...",0.88172,0.879638,0.870588,0.902439,0.840909,0.770833
3,0.285804,0.02527,"(DecisionTreeClassifier(max_features='sqrt', r...",0.923913,0.921402,0.915663,0.974359,0.863636,0.844444
4,0.290058,0.025033,"(DecisionTreeClassifier(max_features='sqrt', r...",0.891304,0.887992,0.878049,0.923077,0.837209,0.782609


BAL:  ['á', 'mia', 'he', 'ruego', 'servidor', 'ha', 'servir', 'suplico', 'al', 'servidora', 'un', 'le', 'mismo', 'mayor', 'seguro', 'noticia', 'buena', 'perdida', 'corazon', 'parte']
not BAL:  ['dela', 'mui', 'mio', 'es', 'ai', 'ala', 'vn', 'ael', 'do', 'estan', 'delos', 'te', 'ami', 'sino', 'este', 'deseo', 'digo', 'enla', 'lugar', 'salud']
