In [126]:
import pandas as pd
import numpy as np
import glob
import os
import json
import csv
import math
import scipy.sparse
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from bs4 import BeautifulSoup
import requests
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.estimator_checks import check_estimator
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV

In [4]:
pd.set_option('display.max_colwidth', -1)

Start by exploring academic training dataset.

In [5]:
all_words = pd.read_csv('data/train/academic/words.csv', encoding='ISO--8859-1', na_filter=False)
display(len(all_words))
all_words.head()

48964

Unnamed: 0.1,Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id,text_tag,genre
0,25070,25070,HER,she,DPS,,,1267,a6u-fragment02,a6u,academic
1,25071,25071,DRESS,dress,NN1,,,1267,a6u-fragment02,a6u,academic
2,25072,25072,HANGS,hang,VVZ,,,1267,a6u-fragment02,a6u,academic
3,25073,25073,HERE',here',NP0,,,1267,a6u-fragment02,a6u,academic
4,25074,25074,DE-FROCKING,de-frock,VVG,mrw,met,1267,a6u-fragment02,a6u,academic


## Keep only content words and filter stop verbs!

In [6]:
# Keeping content words: http://ucrel.lancs.ac.uk/bnc2/bnc2guide.htm#m2adv
# Does not include forms of 'be', 'do', 'have', or modal verbs,
# as suggested in https://github.com/EducationalTestingService/metaphor/tree/master/content-words
content_word_pos = set([
    'NN1', 'NN2', 'NN0',
    'NP0',
    'NN1-NP0', 'NN1-VVB', 'NN1-VVG', 'NN2-VVZ',
    'NP0-NN1',
    'VVB', 'VVD', 'VVG', 'VVI', 'VVN', 'VVZ',
    'VVB-NN1', 'VVD-VVN', 'VVD-AJ0', 'VVG-AJ0', 'VVG-NN1', 'VVZ-NN2',
    'AJ0', 'AJC', 'AJS',
    'AJ0-NN1', 'AJ0-VVG', 'AJ0-VVN',
    'AV0', 'AVQ', 'AVP',
    'AV0-AJ0'
])

In [7]:
content_words = all_words[all_words['word_type'].isin(content_word_pos)]
display(len(content_words))
content_words.head(10)

25360

Unnamed: 0.1,Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id,text_tag,genre
1,25071,25071,DRESS,dress,NN1,,,1267,a6u-fragment02,a6u,academic
2,25072,25072,HANGS,hang,VVZ,,,1267,a6u-fragment02,a6u,academic
3,25073,25073,HERE',here',NP0,,,1267,a6u-fragment02,a6u,academic
4,25074,25074,DE-FROCKING,de-frock,VVG,mrw,met,1267,a6u-fragment02,a6u,academic
6,25076,25076,KAHLO,kahlo,NP0,,,1267,a6u-fragment02,a6u,academic
7,25077,25077,CULT,cult,NN1,,,1267,a6u-fragment02,a6u,academic
8,25078,25078,Oriana,oriana,NP0-NN1,,,1268,a6u-fragment02,a6u,academic
9,25079,25079,Baddeley,baddeley,NP0,,,1268,a6u-fragment02,a6u,academic
13,25083,25083,witnessed,witness,VVN,mrw,met,1269,a6u-fragment02,a6u,academic
15,25085,25085,shift,shift,NN1-VVB,,,1269,a6u-fragment02,a6u,academic


## Sparsification

Create a vocabulary using content words. This will be used as the unigram features.

In [8]:
vocabulary = content_words[['word']].drop_duplicates().set_index('word')

display(len(vocabulary))
vocabulary.sample(10)

7327

reductions
grain-growing
PHILOSOPHERS
detached
factory
element
Robert
side
providing
dynasty


In [9]:
def sparse_featurize(words, vocabulary):
    print('Build iterable sentences.')
    iterable_sentences = words.groupby('sentence_id')['word'].apply(list)
    display(len(iterable_sentences))
    display(iterable_sentences[:5])

    indices = []
    data = []
    indptr = [0]
    for i, word, sentence_id in words[['word', 'sentence_id']].itertuples():
        sentence = iterable_sentences[sentence_id]

        # Used to generate a row of 0's and 1's for sparse matrix
        unique_unigram_indices = set()
        for context_word in sentence:
            if context_word == word:
                continue
            if context_word not in vocabulary.index:
                unique_unigram_indices.add(len(vocabulary))
            unique_unigram_indices.add(vocabulary.index.get_loc(context_word))

        # Data = 1 at column index of sparse matrix = row index of word in vocabulary.
        indices.extend(unique_unigram_indices)  # Indices of the data values in this row.
        data.extend(np.ones(len(unique_unigram_indices), dtype=int))  # Data of all one's.
        indptr.append(indptr[-1]+len(unique_unigram_indices))

        if not i % 2000:
            print('-', end='', flush=True)

    return (
        csr_matrix((data, indices, indptr), dtype=int, shape=(len(words), len(vocabulary)+1)),
        iterable_sentences
    )

Width of sparse matrix is len(vocabulary)+1 to process unknown words.

In [13]:
words = content_words
sparse_unigram_features, iterable_sentences = sparse_featurize(words, vocabulary)

scipy.sparse.save_npz('data/sparse_unigram_features.npz', sparse_unigram_features)
display(sparse_unigram_features)
sparse_unigram_features.shape

Build iterable sentences.


1961

sentence_id
1267    [DRESS, HANGS, HERE', DE-FROCKING, KAHLO, CULT]                                                                               
1268    [Oriana, Baddeley]                                                                                                            
1269    [witnessed, shift, art, establishment, attitudes, art, produced, traditional, parameters]                                     
1270    [work, previously, marginalised, artists, become, area, rich, speculation, art, dealers, priced, modern, masters, market]     
1271    [Almost, year, witnessed, discovery, new, artistic, terrain, graffiti, art, Soviet, art, Australian, art, art, Latin, America]
Name: word, dtype: object

-----------

<25360x7328 sparse matrix of type '<class 'numpy.int32'>'
	with 386398 stored elements in Compressed Sparse Row format>

(25360, 7328)

Inspect if sparse features are valid.

In [14]:
sparse_unigram_features = scipy.sparse.load_npz('data/sparse_unigram_features.npz')

In [51]:
select_word_i = 210

selected_word = words.iloc[select_word_i]
display(selected_word)
sentence_of_selected_word = sorted(list(set(iterable_sentences.loc[selected_word.sentence_id])))
display(sentence_of_selected_word)

unigram_select_indices = np.nonzero(sparse_unigram_features[select_word_i])[:50][1]  # Nonzero indices indicating word.
display(unigram_select_indices)
sparse_matrix_sentence_selections = sorted(list(vocabulary.iloc[unigram_select_indices].index))
display(sparse_matrix_sentence_selections)

assert len(sentence_of_selected_word) - 1 == len(sparse_matrix_sentence_selections)

Unnamed: 0     25449         
word_id        25449         
word           passionate    
lemma          passionate    
word_type      AJ0           
function                     
seg_type                     
sentence_id    1282          
text_id        a6u-fragment02
text_tag       a6u           
genre          academic      
Name: 379, dtype: object

['Diego',
 'Rivera',
 'appearance',
 'come',
 'dominate',
 'emotional',
 'flamboyant',
 'husband',
 'obsession',
 'pain',
 'passionate',
 'physical',
 'responses',
 'work']

array([ 70,  71, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,  16])

['Diego',
 'Rivera',
 'appearance',
 'come',
 'dominate',
 'emotional',
 'flamboyant',
 'husband',
 'obsession',
 'pain',
 'physical',
 'responses',
 'work']

Looks good!

Do you wanna build a model?

In [120]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.estimator_checks import check_estimator

def preprocess_data(words):
    def build_content_words(all_words):
        print('Building content words.')
        # Keeping content words: http://ucrel.lancs.ac.uk/bnc2/bnc2guide.htm#m2adv
        # Does not include forms of 'be', 'do', 'have', or modal verbs,
        # as suggested in https://github.com/EducationalTestingService/metaphor/tree/master/content-words
        content_word_pos = set([
            'NN1', 'NN2', 'NN0',
            'NP0',
            'NN1-NP0', 'NN1-VVB', 'NN1-VVG', 'NN2-VVZ',
            'NP0-NN1',
            'VVB', 'VVD', 'VVG', 'VVI', 'VVN', 'VVZ',
            'VVB-NN1', 'VVD-VVN', 'VVD-AJ0', 'VVG-AJ0', 'VVG-NN1', 'VVZ-NN2',
            'AJ0', 'AJC', 'AJS',
            'AJ0-NN1', 'AJ0-VVG', 'AJ0-VVN',
            'AV0', 'AVQ', 'AVP',
            'AV0-AJ0'
        ])
        
        content_words = all_words[all_words['word_type'].isin(content_word_pos)]
        
        return content_words
    
    def build_labels(words):
        print('Building labels.')
        labels = words['function'] == 'mrw'
        
        return labels
    
    content_words = build_content_words(words)
    labels = build_labels(content_words)
    
    return content_words, labels

class UnigramClassifier(BaseEstimator):
    def __init__(self, verbose=10, random_state=0, C=10, penalty='l1',
                 solver='liblinear', class_weight='balanced', max_iter=500):
        self.verbose = verbose
        self.random_state = random_state
        self.C = C
        self.penalty = penalty
        self.solver = solver
        self.class_weight = class_weight
        self.max_iter = max_iter
 
    
    def fit(self, X, y):
        # Check that X and y have correct shape
#         X, y = check_X_y(X, y, accept_sparse=True)
        
        self.build_features(X, y)
        
        self.clf_ = LogisticRegression(
            verbose=self.verbose,
            random_state=self.random_state,
            C=self.C,
            penalty=self.penalty,
            solver=self.solver,
            class_weight=self.class_weight,
            max_iter=self.max_iter
        )
        
        self.clf_.fit(self.X_, self.y_)
        
        return self
    
    def build_features(self, X, y):
        print('Building features.')
#         content_words = self.build_content_words(X)
        
        self.vocabulary_ = X[['word']].drop_duplicates().set_index('word')
        
#         words = content_words
#         labels = self.build_labels(words)
        
        sparse_unigram_features, iterable_sentences = self.sparse_featurize(X)
        
        self.X_, self.y_ = shuffle(sparse_unigram_features, y, random_state=0)

#     def build_content_words(self, all_words):
#         print('Building content words.')
#         # Keeping content words: http://ucrel.lancs.ac.uk/bnc2/bnc2guide.htm#m2adv
#         # Does not include forms of 'be', 'do', 'have', or modal verbs,
#         # as suggested in https://github.com/EducationalTestingService/metaphor/tree/master/content-words
#         content_word_pos = set([
#             'NN1', 'NN2', 'NN0',
#             'NP0',
#             'NN1-NP0', 'NN1-VVB', 'NN1-VVG', 'NN2-VVZ',
#             'NP0-NN1',
#             'VVB', 'VVD', 'VVG', 'VVI', 'VVN', 'VVZ',
#             'VVB-NN1', 'VVD-VVN', 'VVD-AJ0', 'VVG-AJ0', 'VVG-NN1', 'VVZ-NN2',
#             'AJ0', 'AJC', 'AJS',
#             'AJ0-NN1', 'AJ0-VVG', 'AJ0-VVN',
#             'AV0', 'AVQ', 'AVP',
#             'AV0-AJ0'
#         ])
        
#         content_words = all_words[all_words['word_type'].isin(content_word_pos)]
        
#         return content_words
    
#     def build_labels(self, words):
#         print('Building labels.')
#         labels = words['function'] == 'mrw'
        
#         return labels
    
    def sparse_featurize(self, words):
        print('Building iterable sentences.')
        iterable_sentences = words.groupby('sentence_id')['word'].apply(list)

        indices = []
        data = []
        indptr = [0]
        print('Building sparse matrix.')
        for i, word, sentence_id in words[['word', 'sentence_id']].itertuples():
            sentence = iterable_sentences[sentence_id]

            # Used to generate a row of 0's and 1's for sparse matrix
            unique_unigram_indices = set()
            for context_word in sentence:
                if context_word == word:
                    continue
                if context_word not in self.vocabulary_.index:
                    unique_unigram_indices.add(len(self.vocabulary_))
                else:
                    unique_unigram_indices.add(self.vocabulary_.index.get_loc(context_word))

            # Data = 1 at column index of sparse matrix = row index of word in vocabulary.
            indices.extend(unique_unigram_indices)  # Indices of the data values in this row.
            data.extend(np.ones(len(unique_unigram_indices), dtype=int))  # Data of all one's.
            indptr.append(indptr[-1]+len(unique_unigram_indices))

            if not i % 2000:
                print('-', end='', flush=True)
        print('\nSparse matrifixation complete.')
        

        return (
            csr_matrix((data, indices, indptr), dtype=int, shape=(len(words), len(self.vocabulary_)+1)),
            iterable_sentences
        )   
    
    
    def predict(self, X):
        # Check is fit had been called
#         check_is_fitted(self, ['X_', 'y_', 'clf_'])

        # Input validation
#         X = check_array(X, accept_sparse=True)
        featurized_X = self.sparse_featurize(X)[0]
        return self.clf_.predict(featurized_X)
    
# check_estimator(UnigramClassifier)

In [121]:
clf2 = UnigramClassifier(verbose=10, random_state=0, C=10, penalty='l1',
                         solver='liblinear', class_weight='balanced', max_iter=500)

X, y = preprocess_data(all_words)
clf2.fit(X, y)

Building content words.
Building labels.
Building features.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
[LibLinear]

UnigramClassifier(C=10, class_weight='balanced', max_iter=500, penalty='l1',
         random_state=0, solver='liblinear', verbose=10)

In [122]:
clf2.predict(all_words)

Building iterable sentences.
Building sparse matrix.
-------------------------
Sparse matrifixation complete.


array([False, False, False, ...,  True, False, False])

In [124]:
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(clf2, scoring=scoring, X=X, y=y, cv=10, verbose=10)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
Building features.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.

Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.

Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.

Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.

Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
[CV]  , accuracy=0.6005520504731862, precision=0.21481481481481482, recall=0.31

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.7s remaining:    0.0s


Building sparse matrix.
---------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[CV]  , accuracy=0.5950315457413249, precision=0.23695652173913043, recall=0.4014732965009208, f1=0.29801777170198224, total=   7.6s
[CV]  .............................................................

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   31.9s remaining:    0.0s


Building sparse matrix.
----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.


KeyboardInterrupt: 

In [None]:
C = np.logspace(-2,4,20)
penalty = ['l1', 'l2']
hyperparameters = {'C':C, 'penalty':penalty}

clf = GridSearchCV(clf2,
                   hyperparameters, verbose=10, cv=10, scoring='f1')

clf.fit(X, y)

Fitting 10 folds for each of 40 candidates, totalling 400 fits
[CV] C=0.01, penalty=l1 ..............................................
Building features.
Building iterable sentences.


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Building sparse matrix.
-----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.

Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
[CV] ...... C=0.01, penalty=l1, score=0.358257477243173, total=   2.0s
[CV] C=0.01, penalty=l1 ..............................................
Building features.
Building iterable sentences.
Building sparse matrix.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.7s remaining:    0.0s


---------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[CV] .... C=0.01, penalty=l1, score=0.35271191945436825, total=   1.9s
[CV] C=0.01, penalty=l1 ..............................................
Building features.
Building iterable sentences.


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.4s remaining:    0.0s


Building sparse matrix.
----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[CV] .... C=0.01, penalty=l1, score=0.38812937645881956, total=   1.9s
[CV] C=0.01, penalty=l1 ..............................................
Building features.
Building iterable sentences.


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   11.2s remaining:    0.0s


Building sparse matrix.
----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[CV] .... C=0.01, penalty=l1, score=0.13436385255648037, total=   2.0s
[CV] C=0.01, penalty=l1 ..............................................
Building features.
Building iterable sentences.


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   15.1s remaining:    0.0s


Building sparse matrix.
-----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.

Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
[CV] .................... C=0.01, penalty=l1, score=0.0, total=   1.9s
[CV] C=0.01, penalty=l1 ..............................................
Building features.
Building iterable sentences.


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   18.7s remaining:    0.0s


Building sparse matrix.
----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[CV] .. C=0.01, penalty=l1, score=0.0011514104778353484, total=   2.0s
[CV] C=0.01, penalty=l1 ..............................................
Building features.
Building iterable sentences.


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   22.8s remaining:    0.0s


Building sparse matrix.
----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[CV] ... C=0.01, penalty=l1, score=0.014799154334038056, total=   1.8s
[CV] C=0.01, penalty=l1 ..............................................
Building features.
Building iterable sentences.
Building sparse matrix.

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   26.5s remaining:    0.0s



----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[CV] .... C=0.01, penalty=l1, score=0.32400472999605834, total=   1.8s
[CV] C=0.01, penalty=l1 ..............................................
Building features.
Building iterable sentences.
Building sparse matrix.


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   30.1s remaining:    0.0s


---------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[CV] .... C=0.01, penalty=l1, score=0.33882663150955833, total=   1.8s
[CV] C=0.01, penalty=l1 ..............................................
Building features.
Building iterable sentences.


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   33.7s remaining:    0.0s


Building sparse matrix.
---------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[CV] ..... C=0.01, penalty=l1, score=0.3949367088607595, total=   1.9s
[CV] C=0.01, penalty=l2 ..............................................
Building features.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.

Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
[CV] .... C=0.01, penalty=l2, score=0.31799650553290626, total=   1.9s
[CV] C=0.01, penalty=l2 ..............................................
Building features.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.

-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[CV]  C=0.0206913808111479, penalty=l1, score=0.3128888888888889, total=   2.9s
[CV] C=0.0206913808111479, penalty=l1 ................................
Building features.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[CV]  C=0.0206913808111479, penalty=l1, score=0.3342836778332145, total=   6.0s
[CV] C=0.0206913808111479, penalty=l1 ................................
Building features.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-

-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[CV]  C=0.04281332398719394, penalty=l1, score=0.002081165452653486, total=   1.9s
[CV] C=0.04281332398719394, penalty=l1 ...............................
Building features.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[CV]  C=0.04281332398719394, penalty=l1, score=0.011059907834101382, total=   1.8s
[CV] C=0.04281332398719394, penalty=l1 ...............................
Building features.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse ma

-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[CV]  C=0.08858667904100823, penalty=l1, score=0.222509702457956, total=   1.9s
[CV] C=0.08858667904100823, penalty=l1 ...............................
Building features.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.

Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
[CV] ..... C=0.08858667904100823, penalty=l1, score=0.0, total=   1.8s
[CV] C=0.08858667904100823, penalty=l1 ...............................
Building features.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
--------

--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[CV]  C=0.18329807108324356, penalty=l1, score=0.3236009732360097, total=   1.9s
[CV] C=0.18329807108324356, penalty=l1 ...............................
Building features.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[CV]  C=0.18329807108324356, penalty=l1, score=0.35105315947843535, total=   1.9s
[CV] C=0.18329807108324356, penalty=l1 ...............................
Building features.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matri

--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[CV]  C=0.18329807108324356, penalty=l2, score=0.26636225266362257, total=   2.1s
[CV] C=0.37926901907322497, penalty=l1 ...............................
Building features.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.

Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
-----------
Sparse matrifixation complete.
[CV]  C=0.37926901907322497, penalty=l1, score=0.2787114845938375, total=   2.0s
[CV] C=0.37926901907322497, penalty=l1 ...............................
Building features.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matr

-
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
----------
Sparse matrifixation complete.
[CV]  C=0.37926901907322497, penalty=l2, score=0.16781609195402297, total=   4.3s
[CV] C=0.37926901907322497, penalty=l2 ...............................
Building features.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[CV]  C=0.37926901907322497, penalty=l2, score=0.22743055555555558, total=   4.0s
[CV] C=0.37926901907322497, penalty=l2 ...............................
Building features.
Building iterable sentences.
Building sparse matrix.
---------
Sparse matrifixation complete.
[LibLinear]Building iterable sentences.
Building sparse matrix.
--
Sparse matrifixation complete.
Building iterable sentences.
Building sparse matri

## Train

In [21]:
clf = LogisticRegression(verbose=10, random_state=0, C=10, penalty='l1',
                         solver='liblinear', class_weight='balanced', max_iter=500)

In [38]:
# hyperparameters = {'C':[1, 5, 10, 20, 100], 'penalty':['l1', 'l2']}
# from sklearn.model_selection import GridSearchCV
# clf = GridSearchCV(LogisticRegression(class_weight='balanced', verbose=10), hyperparameters,
#                                       verbose=10, cv=10, scoring='f1')
# clf.fit(X_train, y_train)

In [39]:
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(clf, scoring=scoring, X=X, y=y, cv=10, verbose=10)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[LibLinear][CV]  , accuracy=0.8695309420575483, precision=0.5328798185941043, recall=0.6527777777777778, f1=0.5867665418227216, total=   4.1s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.1s remaining:    0.0s


[CV]  , accuracy=0.8770201024832479, precision=0.5552995391705069, recall=0.6694444444444444, f1=0.6070528967254407, total=   3.3s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.6s remaining:    0.0s


[CV]  , accuracy=0.8829325975561687, precision=0.579746835443038, recall=0.6361111111111111, f1=0.6066225165562913, total=   3.8s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   11.6s remaining:    0.0s


[CV]  , accuracy=0.8651419558359621, precision=0.5214285714285715, recall=0.6083333333333333, f1=0.5615384615384615, total=   6.1s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   17.8s remaining:    0.0s


[CV]  , accuracy=0.8698738170347003, precision=0.5362318840579711, recall=0.6166666666666667, f1=0.5736434108527132, total=   3.6s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   21.5s remaining:    0.0s


[CV]  , accuracy=0.8753943217665615, precision=0.5531400966183575, recall=0.6361111111111111, f1=0.5917312661498708, total=   3.4s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   25.0s remaining:    0.0s


[CV]  , accuracy=0.8718454258675079, precision=0.5419664268585132, recall=0.6277777777777778, f1=0.5817245817245817, total=   4.2s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   29.3s remaining:    0.0s


[CV]  , accuracy=0.8852071005917159, precision=0.5867346938775511, recall=0.6406685236768802, f1=0.6125166444740348, total=   3.3s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   32.8s remaining:    0.0s


[CV]  , accuracy=0.8785009861932939, precision=0.5605700712589073, recall=0.6573816155988857, f1=0.6051282051282051, total=   3.3s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   36.2s remaining:    0.0s


[CV]  , accuracy=0.8785009861932939, precision=0.5583524027459954, recall=0.6796657381615598, f1=0.6130653266331657, total=   4.3s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   40.7s finished


In [40]:
clf.fit(X, y)

[LibLinear]

LogisticRegression(C=10, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=500,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=0,
          solver='liblinear', tol=0.0001, verbose=10, warm_start=False)

In [41]:
scores



{'fit_time': array([4.18380523, 3.41509652, 3.94245839, 6.19221807, 3.70330739,
        3.46014977, 4.27606821, 3.43008208, 3.36109948, 4.44207025]),
 'score_time': array([0.0050385 , 0.00496721, 0.00597739, 0.0039897 , 0.00398946,
        0.00404   , 0.00498724, 0.00402713, 0.00498772, 0.00503492]),
 'test_accuracy': array([0.86953094, 0.8770201 , 0.8829326 , 0.86514196, 0.86987382,
        0.87539432, 0.87184543, 0.8852071 , 0.87850099, 0.87850099]),
 'train_accuracy': array([0.97449941, 0.97524427, 0.97463085, 0.97432527, 0.97471959,
        0.97616544, 0.97458815, 0.97550931, 0.97489595, 0.97476451]),
 'test_precision': array([0.53287982, 0.55529954, 0.57974684, 0.52142857, 0.53623188,
        0.5531401 , 0.54196643, 0.58673469, 0.56057007, 0.5583524 ]),
 'train_precision': array([0.85109759, 0.85456476, 0.85195975, 0.85019815, 0.85185185,
        0.85763612, 0.85136207, 0.85596598, 0.85298013, 0.85267621]),
 'test_recall': array([0.65277778, 0.66944444, 0.63611111, 0.60833333, 0.6

In [42]:
sum(scores['test_f1']/len(scores['test_f1']))

0.5939789851605487

In [20]:
clf.predict(X)

from sklearn.metrics import classification_report
print(classification_report(y, clf.predict(X)))

              precision    recall  f1-score   support

       False       1.00      0.96      0.98     25886
        True       0.82      0.99      0.90      4659

   micro avg       0.97      0.97      0.97     30545
   macro avg       0.91      0.98      0.94     30545
weighted avg       0.97      0.97      0.97     30545

