### Задание

Попробуйте поработать с датасетом юридических текстов. В датасете всего две важных колонки признаков: заголовок дела и его текст, а целевая переменная - case_outcome (мультиклассовая классификация). 

В базовом варианте можно оставить только текст дела, если хотите поинтереснее - можно попробовать распарсить case_title, добыв оттуда дополнительные признаки. 

https://www.kaggle.com/datasets/amohankumar/legal-text-classification-dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import *

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn import metrics

from sklearn.metrics import classification_report

from nltk.tokenize import word_tokenize
from nltk import ngrams

from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('legal_text_classification.csv')
data.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24985 entries, 0 to 24984
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   case_id       24985 non-null  object
 1   case_outcome  24985 non-null  object
 2   case_title    24985 non-null  object
 3   case_text     24809 non-null  object
dtypes: object(4)
memory usage: 780.9+ KB


In [4]:
data = data.drop('case_id', axis=1) # сразу дропу id

In [5]:
data.shape

(24985, 3)

In [6]:
print(set(data.case_outcome))

{'related', 'discussed', 'referred to', 'considered', 'followed', 'affirmed', 'distinguished', 'approved', 'applied', 'cited'}


In [7]:
data.head()

Unnamed: 0,case_outcome,case_title,case_text
0,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [8]:
data = data.dropna()

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24809 entries, 0 to 24984
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   case_outcome  24809 non-null  object
 1   case_title    24809 non-null  object
 2   case_text     24809 non-null  object
dtypes: object(3)
memory usage: 775.3+ KB


### Baseline

In [10]:
x_train, x_test, y_train, y_test = train_test_split(data.case_text, data.case_outcome)

In [12]:
vec = CountVectorizer(ngram_range=(1, 1)) # составляем словарь униграмм
bow = vec.fit_transform(x_train) # слова превращаются в векторы 

In [13]:
list(vec.vocabulary_.items())[:10] # посмотрим, что получилось

[('recently', 32380),
 ('in', 21250),
 ('european', 16302),
 ('community', 11342),
 ('commissioner', 11304),
 ('of', 28535),
 ('patents', 29589),
 ('2006', 1962),
 ('fca', 17079),
 ('706', 4141)]

In [15]:
clf = LogisticRegression(solver='liblinear', random_state=42, max_iter=500)
clf.fit(bow, y_train)



In [16]:
pred = clf.predict(vec.transform(x_test)) # трансформация теста
print(classification_report(pred, y_test))

               precision    recall  f1-score   support

     affirmed       0.36      0.50      0.42        20
      applied       0.27      0.36      0.31       453
     approved       0.24      0.33      0.28        18
        cited       0.79      0.67      0.73      3569
   considered       0.28      0.37      0.32       320
    discussed       0.22      0.34      0.26       173
distinguished       0.31      0.50      0.38       101
     followed       0.36      0.42      0.39       460
  referred to       0.53      0.54      0.54      1075
      related       0.26      0.50      0.34        14

     accuracy                           0.58      6203
    macro avg       0.36      0.45      0.40      6203
 weighted avg       0.62      0.58      0.60      6203



In [11]:
# теперь то же самое, только ->
vec = CountVectorizer(ngram_range=(2, 3)) # составляем словарь би-, триграмм
bow = vec.fit_transform(x_train)

In [12]:
list(vec.vocabulary_.items())[:10] # посмотрим, что получилось

[('an important', 336221),
 ('important consideration', 1306687),
 ('consideration which', 786474),
 ('which informs', 2816685),
 ('informs the', 1376322),
 ('the exercise', 2482731),
 ('exercise of', 1039394),
 ('of the', 1810954),
 ('the discretion', 2477358),
 ('discretion on', 935547)]

In [14]:
clf = LogisticRegression(solver='liblinear', random_state=42, penalty='l1')
clf.fit(bow, y_train)




In [15]:
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

               precision    recall  f1-score   support

     affirmed       0.36      0.62      0.46        13
      applied       0.30      0.36      0.33       495
     approved       0.17      0.62      0.27         8
        cited       0.80      0.66      0.73      3640
   considered       0.23      0.35      0.28       286
    discussed       0.26      0.37      0.31       180
distinguished       0.28      0.56      0.38        81
     followed       0.37      0.47      0.42       462
  referred to       0.50      0.53      0.51      1026
      related       0.17      0.50      0.25        12

     accuracy                           0.58      6203
    macro avg       0.35      0.50      0.39      6203
 weighted avg       0.63      0.58      0.60      6203



Ничего не поменялось, и данные несбалансированные

In [16]:
# попробовала поэтому с class_weight='balanced
vec = CountVectorizer(ngram_range=(1, 1)) 
bow = vec.fit_transform(x_train)  
clf = LogisticRegression(solver='liblinear', random_state=42, max_iter=500, class_weight='balanced')
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test)) 
print(classification_report(pred, y_test))

               precision    recall  f1-score   support

     affirmed       0.50      0.33      0.40        33
      applied       0.34      0.34      0.34       602
     approved       0.17      0.15      0.16        33
        cited       0.70      0.70      0.70      2984
   considered       0.29      0.33      0.31       372
    discussed       0.30      0.29      0.30       257
distinguished       0.35      0.38      0.36       145
     followed       0.40      0.42      0.41       541
  referred to       0.56      0.50      0.53      1223
      related       0.22      0.62      0.33        13

     accuracy                           0.55      6203
    macro avg       0.38      0.41      0.38      6203
 weighted avg       0.56      0.55      0.55      6203



### TF-IDF

In [17]:
vec = TfidfVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(x_train)
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

               precision    recall  f1-score   support

     affirmed       0.05      0.33      0.08         3
      applied       0.06      0.39      0.10        84
     approved       0.00      0.00      0.00         0
        cited       0.96      0.53      0.68      5491
   considered       0.03      0.39      0.05        31
    discussed       0.01      0.33      0.02         9
distinguished       0.01      1.00      0.02         2
     followed       0.08      0.60      0.14        78
  referred to       0.25      0.55      0.35       505
      related       0.00      0.00      0.00         0

     accuracy                           0.53      6203
    macro avg       0.14      0.41      0.14      6203
 weighted avg       0.87      0.53      0.63      6203



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Попробуем почистить данные

In [18]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [19]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [20]:
# соберём шум
noise = stopwords.words('english') + list(punctuation)

In [21]:
# обучим без шума с униграммами
vec = CountVectorizer(ngram_range=(1, 1), tokenizer=word_tokenize, stop_words=noise)
bow = vec.fit_transform(x_train)
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))



               precision    recall  f1-score   support

     affirmed       0.41      0.45      0.43        20
      applied       0.24      0.38      0.30       376
     approved       0.14      0.33      0.20        12
        cited       0.82      0.65      0.72      3806
   considered       0.20      0.35      0.25       249
    discussed       0.24      0.34      0.28       179
distinguished       0.21      0.49      0.30        70
     followed       0.27      0.44      0.33       358
  referred to       0.48      0.47      0.48      1127
      related       0.11      0.67      0.19         6

     accuracy                           0.56      6203
    macro avg       0.31      0.46      0.35      6203
 weighted avg       0.64      0.56      0.59      6203



Несильно отличается

Теперь другие варианты.

In [2]:
# начну заново
data = pd.read_csv('legal_text_classification.csv')
data.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [3]:
data = data.drop('case_id', axis=1)

In [4]:
data = data.fillna('') # в этот раз не буду удалять ряды с пропусками

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24985 entries, 0 to 24984
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   case_outcome  24985 non-null  object
 1   case_title    24985 non-null  object
 2   case_text     24985 non-null  object
dtypes: object(3)
memory usage: 585.7+ KB


In [6]:
data.case_outcome.value_counts() # посмотрела ещё раз на баланс

case_outcome
cited            12219
referred to       4384
applied           2448
followed          2256
considered        1712
discussed         1024
distinguished      608
related            113
affirmed           113
approved           108
Name: count, dtype: int64

In [7]:
# объединю case text и case title
data['case_text_and_title'] = data['case_title'] + data['case_text']

In [8]:
data.head()

Unnamed: 0,case_outcome,case_title,case_text,case_text_and_title
0,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...
1,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...
2,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...
3,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...
4,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...,Dr Martens Australia Pty Ltd v Figgins Holding...


In [9]:
data = data.drop(['case_text', 'case_title'], axis=1)

In [10]:
# перевожу в числовые данные варианты в колонке с целевой переменной с помощью
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

data['case_outcome_numeric'] = le.fit_transform(data['case_outcome'])

In [11]:
data = data.drop('case_outcome', axis=1)

In [12]:
data.head()

Unnamed: 0,case_text_and_title,case_outcome_numeric
0,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,3
1,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,3
2,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,3
3,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,3
4,Dr Martens Australia Pty Ltd v Figgins Holding...,3


Теперь нужно поработать с текстом

In [13]:
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
import string

In [14]:
stopwords = set(stopwords.words('english'))

In [15]:
def process(text):
    '''Эта функция для лемматизации, удаления стоп-слов, ссылок, пунктуации и приведения к нижнему регистру'''
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # удаляем ссылки
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens) # обработанный текст

In [16]:
from sklearn.pipeline import make_pipeline

In [17]:
def feature_engineering(choice_transformer, choice_ngrams):

    # текстовые характеристики обрабатываем: либо tf-idf, либо мешок слов
    text_features = 'case_text_and_title'
    if choice_transformer == 'tfidf':
        text_transformer = make_pipeline(TfidfVectorizer(ngram_range=choice_ngrams), StandardScaler(with_mean=False))
    else:
        text_transformer = make_pipeline(CountVectorizer(ngram_range=choice_ngrams), StandardScaler(with_mean=False))
    return text_transformer

In [18]:
def modelfit(model):
    '''Функция для обучения и вывода score'''
    model.fit(Xtrain, ytrain)
    
    ypredtest = model.predict(Xtest)
    ypredtrain = model.predict(Xtrain)
    
    print(accuracy_score(ytest, ypredtest), accuracy_score(ytrain, ypredtrain))

In [19]:
data.head()

Unnamed: 0,case_text_and_title,case_outcome_numeric
0,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,3
1,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,3
2,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,3
3,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,3
4,Dr Martens Australia Pty Ltd v Figgins Holding...,3


In [20]:
data['processed_case_text_title'] = data['case_text_and_title'].apply(process)
data.head()

Unnamed: 0,case_text_and_title,case_outcome_numeric,processed_case_text_title
0,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,3,alpine hardwood aust pty ltd v hardy pty ltd 2...
1,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,3,black v lipovac 1998 fca 699 1998 217 alr 386t...
2,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,3,colgate palmolive co v cussons pty ltd 1993 47...
3,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,3,dais studio pty ltd v bullett creative pty ltd...
4,Dr Martens Australia Pty Ltd v Figgins Holding...,3,dr marten australia pty ltd v figgins holding ...


In [21]:
X = data['processed_case_text_title']  
y = data['case_outcome_numeric']  

In [22]:
# разделим
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
# обучим разные модели !

### TF-IDF unigrams

In [23]:
preprocessor = feature_engineering('tfidf', (1, 1))

clfLR = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(solver='liblinear'))]
)

clfLinSVC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearSVC())]
)

In [24]:
# вроде есть масштабирование данных, но почему-то у меня SVC всегда долго учится, не могу понять, в чём проблема
# поменяла на LinearSVC, потому что ничего не помогало ускориться
modelfit(clfLR)
modelfit(clfLinSVC) 

0.5405243145887533 0.9998999399639784




0.5259155493295977 0.9993996397838704


Очень сильное переобучение получилось :(

### TF-IDF bigrams

In [25]:
preprocessor = feature_engineering('tfidf', (2, 2))

clfLR = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

clfLinSVC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearSVC())]
)

In [26]:
modelfit(clfLR)
modelfit(clfLinSVC)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5997598559135481 0.9998999399639784




0.5939563738242946 0.9998999399639784


### BOW unigrams

In [27]:
preprocessor = feature_engineering('bow', (1, 1))

clfLR = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

clfLinSVC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearSVC())]
)

In [28]:
modelfit(clfLR)
modelfit(clfLinSVC)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5429257554532719 0.998949369621773




0.5173103862317391 0.999499699819892


### BOW bigrams

In [29]:
preprocessor = feature_engineering('bow', (2, 2))

clfLR = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

clfLinSVC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearSVC())]
)

In [30]:
modelfit(clfLR)
modelfit(clfLinSVC)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5997598559135481 0.9998999399639784




0.5885531318791275 0.9998999399639784


### BOW Bagging unigrams

In [31]:
preprocessor = feature_engineering('bow', (1, 1))

bagging = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", BaggingClassifier())]
)

In [32]:
modelfit(bagging)

0.5567340404242546 0.9762357414448669


### Random Forest

Везде получилось переобучение, к сожалению. Не поняла пока причину.

Ещё я везде, кроме TF-IDF unigrams, забыла поменять solver, но я думаю, на результаты это бы вряд ли повлияло. И может всё-таки стоило вместо того, чтобы менять SVC на LinearSVC, ограничить кол-во фич?


In [24]:
def feature_engineering(choice_transformer, choice_ngrams):

    # текстовые характеристики обрабатываем: либо tf-idf, либо мешок слов
    text_features = 'case_text_and_title'
    if choice_transformer == 'tfidf':
        text_transformer = make_pipeline(TfidfVectorizer(ngram_range=choice_ngrams, max_features=4242), StandardScaler(with_mean=False)) # попробую сократить размер
    else:
        text_transformer = make_pipeline(CountVectorizer(ngram_range=choice_ngrams, max_features=4242), StandardScaler(with_mean=False))
    return text_transformer

In [25]:
preprocessor = feature_engineering('bow', (2, 2))

clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier())]
)

In [26]:
modelfit(clf)

0.5905543325995597 0.9964478687212327


Опять то же самое, зато обучается теперь побыстрее (надо было раньше уменьшить...)

### Эмбеддинги Doc2Vec из gensim

In [30]:
from gensim.models import Doc2Vec
import gensim
from gensim.models.doc2vec import TaggedDocument
from sklearn import utils
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords

In [28]:
data.head()

Unnamed: 0,case_text_and_title,case_outcome_numeric,processed_case_text_title
0,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,3,alpine hardwood aust pty ltd v hardy pty ltd 2...
1,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,3,black v lipovac 1998 fca 699 1998 217 alr 386t...
2,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,3,colgate palmolive co v cussons pty ltd 1993 47...
3,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,3,dais studio pty ltd v bullett creative pty ltd...
4,Dr Martens Australia Pty Ltd v Figgins Holding...,3,dr marten australia pty ltd v figgins holding ...


In [35]:
def tokenize_text(text):
    
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2: # чтобы слишком короткие не попадали
                continue
            tokens.append(word.lower())
    return tokens

train, test = train_test_split(data[['processed_case_text_title', 'case_outcome_numeric']], test_size=0.3, random_state=42)

# соберем специальный объект класса TaggedDocument, чтобы D2V работал
train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['processed_case_text_title']), tags=[r.case_outcome_numeric]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['processed_case_text_title']), tags=[r.case_outcome_numeric]), axis=1)

In [36]:
import multiprocessing 
cores = multiprocessing.cpu_count() # все ядра в деле

In [38]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample=0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 17489/17489 [00:00<00:00, 2191705.24it/s]


In [39]:
for epoch in range(30): 
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 17489/17489 [00:00<00:00, 2502786.95it/s]
100%|██████████| 17489/17489 [00:00<00:00, 3507587.75it/s]
100%|██████████| 17489/17489 [00:00<00:00, 4383803.42it/s]
100%|██████████| 17489/17489 [00:00<00:00, 4385113.74it/s]
100%|██████████| 17489/17489 [00:00<00:00, 3507084.66it/s]
100%|██████████| 17489/17489 [00:00<00:00, 3507420.04it/s]
100%|██████████| 17489/17489 [00:00<00:00, 2922943.20it/s]
100%|██████████| 17489/17489 [00:00<00:00, 3507587.75it/s]
100%|██████████| 17489/17489 [00:00<00:00, 3507587.75it/s]
100%|██████████| 17489/17489 [00:00<00:00, 3507420.04it/s]
100%|██████████| 17489/17489 [00:00<00:00, 3507755.48it/s]
100%|██████████| 17489/17489 [00:00<00:00, 2922710.28it/s]
100%|██████████| 17489/17489 [00:00<00:00, 2505180.24it/s]
100%|██████████| 17489/17489 [00:00<00:00, 2923875.27it/s]
100%|██████████| 17489/17489 [00:00<00:00, 3507755.48it/s]
100%|██████████| 17489/17489 [00:00<00:00, 3507420.04it/s]
100%|██████████| 17489/17489 [00:00<00:00, 3506749.34it/

In [40]:
# transform
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words)) for doc in sents])
    return targets, regressors  

Logistic Regression

In [41]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)
logreg = LogisticRegression(solver='liblinear', n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.48519210245464245
Testing F1 score: 0.3259584838828355


### Выводы.

Лучше всего по качеству оказались самые первые модели. Разные классификаторы оказывают разное влияние на качество. У модели с эмбеддингами качество получилось ниже, чем я ожидала.