In [1]:
from src.utils import *
from src.preprocess import clean_spacy_tokens

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import pandas as pd
import numpy as np
from functools import reduce

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pierr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
DOCS_PATH = 'data/en_core_web_lg.jsonl'

In [3]:
docs = read_jsonl(DOCS_PATH)

In [4]:
df = pd.DataFrame(docs, columns=['path', 'birthDate', 'birthPlace', 'deathDate'])
df.head()

Unnamed: 0,path,birthDate,birthPlace,deathDate
0,http://dbpedia.org/resource/Cab_Calloway,1907-12-25,"http://dbpedia.org/resource/Rochester,_New_York",1994-11-18
1,http://dbpedia.org/resource/Cabell_Breckinridge,1788-07-14,"http://dbpedia.org/resource/Albemarle_County,_...",1823-09-01
2,http://dbpedia.org/resource/Cabeção_(footballer),1930-08-23,http://dbpedia.org/resource/Brazil,2020-01-06
3,http://dbpedia.org/resource/Cabiria_Andreian_C...,1928-02-19,http://dbpedia.org/resource/Iași,2018-05-22
4,http://dbpedia.org/resource/Cabo_Almi,1962-12-17,http://dbpedia.org/resource/Jardim_Olinda,2021-05-24


In [5]:
cleaner = clean_spacy_tokens()
df['tokens'] = [cleaner(doc['abstract']) for doc in docs]
df.head()

Unnamed: 0,path,birthDate,birthPlace,deathDate,tokens
0,http://dbpedia.org/resource/Cab_Calloway,1907-12-25,"http://dbpedia.org/resource/Rochester,_New_York",1994-11-18,"[Cabell, Calloway, III, american, singer, song..."
1,http://dbpedia.org/resource/Cabell_Breckinridge,1788-07-14,"http://dbpedia.org/resource/Albemarle_County,_...",1823-09-01,"[Joseph, Cabell, Breckinridge, lawyer, soldier..."
2,http://dbpedia.org/resource/Cabeção_(footballer),1930-08-23,http://dbpedia.org/resource/Brazil,2020-01-06,"[Luís, Morais, know, Cabeção, brazilian, footb..."
3,http://dbpedia.org/resource/Cabiria_Andreian_C...,1928-02-19,http://dbpedia.org/resource/Iași,2018-05-22,"[Cabiria, Andreian, Cazacu, romanian, mathemat..."
4,http://dbpedia.org/resource/Cabo_Almi,1962-12-17,http://dbpedia.org/resource/Jardim_Olinda,2021-05-24,"[José, Almi, Pereira, Moura, full, name, Cabo,..."


In [6]:
df['astral_sign'] = df.birthDate.apply(date_to_astral)
df.head()

Unnamed: 0,path,birthDate,birthPlace,deathDate,tokens,astral_sign
0,http://dbpedia.org/resource/Cab_Calloway,1907-12-25,"http://dbpedia.org/resource/Rochester,_New_York",1994-11-18,"[Cabell, Calloway, III, american, singer, song...",Capricorn
1,http://dbpedia.org/resource/Cabell_Breckinridge,1788-07-14,"http://dbpedia.org/resource/Albemarle_County,_...",1823-09-01,"[Joseph, Cabell, Breckinridge, lawyer, soldier...",Cancer
2,http://dbpedia.org/resource/Cabeção_(footballer),1930-08-23,http://dbpedia.org/resource/Brazil,2020-01-06,"[Luís, Morais, know, Cabeção, brazilian, footb...",Virgo
3,http://dbpedia.org/resource/Cabiria_Andreian_C...,1928-02-19,http://dbpedia.org/resource/Iași,2018-05-22,"[Cabiria, Andreian, Cazacu, romanian, mathemat...",Aquarius
4,http://dbpedia.org/resource/Cabo_Almi,1962-12-17,http://dbpedia.org/resource/Jardim_Olinda,2021-05-24,"[José, Almi, Pereira, Moura, full, name, Cabo,...",Sagittarius


In [7]:
df.loc[1, 'birthPlace']

'http://dbpedia.org/resource/Albemarle_County,_Virginia'

## Binary Classification

In [7]:
df['astral_sign'].value_counts()

Capricorn      4183
Aquarius       4079
Scorpio        4018
Aries          3985
Virgo          3961
Pisces         3954
Gemini         3911
Leo            3902
Cancer         3776
Libra          3731
Taurus         3626
Sagittarius    3486
Name: astral_sign, dtype: int64

In [8]:
TARGET_LABEL = 'Capricorn'
df['target'] = df['astral_sign'] == TARGET_LABEL

In [9]:
train_indices, test_indices = train_test_split(list(df.index), test_size=0.3, random_state=42)

In [16]:
# One label
def preprocess(df, train_indices, test_indices):
    tokens = df['tokens'].apply(lambda x: ' '.join(x))
    vectorizer = TfidfVectorizer(lowercase=False, token_pattern=r'([^\s]+)').fit(tokens)

    # Delete this if u don't want equals size of labels in train set
    y_train = df.loc[train_indices, 'astral_sign'] == TARGET_LABEL
    train_indices = list(y_train[y_train == True].index) 
    train_indices += list(y_train[y_train == False].index)[:len(train_indices)]

    y_test = df.loc[test_indices, 'astral_sign'] == TARGET_LABEL
    test_indices = list(y_test[y_test == True].index) 
    test_indices += list(y_test[y_test == False].index)[:len(test_indices)]


    return (
        vectorizer.transform(tokens[train_indices]),
        (df.loc[train_indices, 'astral_sign'] == TARGET_LABEL).astype(int),
        vectorizer.transform(tokens[test_indices]),
        (df.loc[test_indices, 'astral_sign'] == TARGET_LABEL).astype(int),
    )

# Two Label
def preprocess(df, train_indices, test_indices, labels = ['Taurus', 'Sagittarius']):
    tokens = df['tokens'].apply(lambda x: ' '.join(x))
    vectorizer = TfidfVectorizer(lowercase=False, token_pattern=r'([^\s]+)').fit(tokens)

    train_set = df.iloc[train_indices]
    test_set = df.iloc[test_indices]

    train_indices = []
    test_indices = []
    for label in labels:
        train_indices += list(train_set[train_set['astral_sign'] == label].index)
        test_indices += list(test_set[test_set['astral_sign'] == label].index)

    return (
        vectorizer.transform(tokens[train_indices]),
        train_set.loc[train_indices, 'astral_sign'],
        vectorizer.transform(tokens[test_indices]),
        test_set.loc[test_indices, 'astral_sign'],
    )


In [17]:
X_train, y_train, X_test, y_test = preprocess(df, train_indices, test_indices)

In [18]:
y_train.value_counts() / len(y_train)

Taurus         0.507898
Sagittarius    0.492102
Name: astral_sign, dtype: float64

In [19]:
y_test.value_counts()  / len(y_test)

Taurus         0.514448
Sagittarius    0.485552
Name: astral_sign, dtype: float64

In [20]:
(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

((5001, 139538), (5001,), (2111, 139538), (2111,))

In [21]:
clf = LogisticRegression(class_weight='balanced', random_state=42, verbose=1, n_jobs=8).fit(X_train, y_train)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 out of   1 | elapsed:    2.5s finished


In [23]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[501 524]
 [530 556]]
              precision    recall  f1-score   support

 Sagittarius       0.49      0.49      0.49      1025
      Taurus       0.51      0.51      0.51      1086

    accuracy                           0.50      2111
   macro avg       0.50      0.50      0.50      2111
weighted avg       0.50      0.50      0.50      2111



In [24]:
rfc = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=8, verbose=1).fit(X_train, y_train)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    6.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   14.4s finished


In [25]:
y_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[474 551]
 [513 573]]
              precision    recall  f1-score   support

 Sagittarius       0.48      0.46      0.47      1025
      Taurus       0.51      0.53      0.52      1086

    accuracy                           0.50      2111
   macro avg       0.50      0.50      0.49      2111
weighted avg       0.50      0.50      0.50      2111



[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [26]:
X_train, y_train, X_test, y_test = preprocess(df, train_indices, test_indices, labels=ASTRAL_DATES.keys())

In [27]:
y_train.value_counts() / len(y_train)

Capricorn      0.089524
Aries          0.086919
Scorpio        0.086459
Aquarius       0.085816
Pisces         0.085387
Virgo          0.084713
Gemini         0.084314
Leo            0.082015
Cancer         0.081464
Libra          0.080115
Taurus         0.077847
Sagittarius    0.075426
Name: astral_sign, dtype: float64

In [28]:
y_test.value_counts() / len(y_test)

Aquarius       0.091462
Capricorn      0.090246
Leo            0.087672
Virgo          0.085598
Scorpio        0.085598
Pisces         0.083524
Gemini         0.082952
Aries          0.082165
Cancer         0.079949
Libra          0.079877
Taurus         0.077660
Sagittarius    0.073298
Name: astral_sign, dtype: float64

In [29]:
clf = LogisticRegression(class_weight='balanced', random_state=42, verbose=1, n_jobs=8).fit(X_train, y_train)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 out of   1 | elapsed:   38.1s finished


In [30]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[114  92  88 135 100 103 106 106 107 102 116 110]
 [102  97  84 110  92 102  78  93  88  89 124  90]
 [112  68  89  98  93  91 105  94  71  89  99 109]
 [121  81 104 133  97 113  99  94 102  88 114 116]
 [100  95  96 112  82  94 104 105 103  81  90  98]
 [ 97  88 118 121  97 107  99  92  99 103 111  94]
 [ 83  90 100  92  93  92  89  93  81 110  94 100]
 [ 93  95  89 134  95  95  95 111 104  90  87  80]
 [ 95  88  82  97  73  86  83  75  75  88  94  89]
 [115  77  97  97 109  98  95  88 100  97 113 111]
 [ 85  83  90 104  94  89 100  82  82  76 112  89]
 [ 99  88 107 126 104  94  91  93  92  96 100 107]]
              precision    recall  f1-score   support

    Aquarius       0.09      0.09      0.09      1279
       Aries       0.09      0.08      0.09      1149
      Cancer       0.08      0.08      0.08      1118
   Capricorn       0.10      0.11      0.10      1262
      Gemini       0.07      0.07      0.07      1160
         Leo       0.09      0.09      0.09      1226
       L

In [32]:
rfc = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=8, verbose=1).fit(X_train, y_train)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:  3.7min finished


In [33]:
y_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.6s finished


[[145 152 103 193 116  97  86  97  39  99  56  96]
 [118 140  80 160 114  73  69 106  42 107  64  76]
 [134 126  67 138 118  72  80  91  51 107  47  87]
 [154 147  86 184 110  91  84  88  49 110  65  94]
 [122 135  98 176  89  83  71 101  59 107  48  71]
 [116 144 102 182 117  83  68  97  58 120  50  89]
 [132 136  83 163  94  84  62  79  38 102  56  88]
 [139 135  84 160 106  77  71  95  56 106  50  89]
 [111 137  48 144  90  65  69  89  47  83  51  91]
 [153 128  70 166 116  82  63 119  36  99  61 104]
 [106 124  81 167  97  80  64  86  45 106  46  84]
 [124 153  99 161  91  76  71 104  60 113  62  83]]
              precision    recall  f1-score   support

    Aquarius       0.09      0.11      0.10      1279
       Aries       0.08      0.12      0.10      1149
      Cancer       0.07      0.06      0.06      1118
   Capricorn       0.09      0.15      0.11      1262
      Gemini       0.07      0.08      0.07      1160
         Leo       0.09      0.07      0.08      1226
       L

In [None]:
geonames