## Тема “Создание признакового пространства и NER - продолжение”

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()

PREP_DATA = '../data/prep_tweets.pkl'

#### Загрузим подготовленный датасет твиттов

In [4]:
df_prep = pd.read_pickle(PREP_DATA)
df_prep.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, cause, offer, wheel..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


In [5]:
# получим текст из очищенных твитов
clean_tweets_text = " ".join(df_prep['clean_tweet'])
clean_tweets_text[:200]

'when father is dysfunctional and is so selfish he drags his kids into his dysfunction run thanks for lyft credit cannot use cause they do not offer wheelchair vans in pdx disapointed getthanked bihday'

### 1. Spacy

In [6]:
if 0:
    !pip install -U spacy
    !python -m spacy download en_core_web_md
    !python -m spacy info
    
import spacy
from spacy import displacy
import en_core_web_md


In [7]:
spacy_text = clean_tweets_text

In [8]:
%%time
nlp = en_core_web_md.load()
article = nlp(spacy_text[:1000])
displacy.render(article, jupyter=True, style='ent')

CPU times: user 6.35 s, sys: 365 ms, total: 6.71 s
Wall time: 6.69 s


In [9]:
# дерево зависимости
displacy.render(article[:10], style='dep', jupyter=True)

In [10]:
# слишком большой текст для spacy
len(spacy_text)

3455050

In [11]:
%%time
# применим NER распозновалку к каждому твиту
df_prep['spacy_ents'] = df_prep['clean_tweet'].progress_apply(lambda x: nlp(x).ents)

100%|██████████| 49159/49159 [14:42<00:00, 55.69it/s]

CPU times: user 14min 44s, sys: 5.01 s, total: 14min 49s
Wall time: 14min 42s





In [12]:
# Создадим массив ner с типом и преобразуем в датафрейм
def get_ner_dataframe(in_spacy_ents: pd.Series)-> pd.DataFrame: 
    word_list = []

    def add_to_word_list(ents):
        for ent in ents:
            word_list.append((ent.text, ent.label_))
        
    _ = in_spacy_ents.progress_apply(add_to_word_list)
    
    return pd.DataFrame(word_list, columns=['ner', 'ner_type'])

In [13]:
df_all_NERS = get_ner_dataframe(df_prep['spacy_ents'])
df_train_NERS = get_ner_dataframe(df_prep.loc[df_prep.label < 1, 'spacy_ents'])
df_test_NERS = get_ner_dataframe(df_prep.loc[df_prep.label == 1, 'spacy_ents'])

100%|██████████| 49159/49159 [00:00<00:00, 98630.28it/s]
100%|██████████| 29720/29720 [00:00<00:00, 99807.93it/s]
100%|██████████| 2242/2242 [00:00<00:00, 86349.47it/s]


### TOP 20 NER

In [14]:
# NER топ 20 общих
df_all_NERS.ner.value_counts()[:20]

today       1397
tomorrow     597
friday       588
one          518
first        514
orlando      473
sunday       466
morning      432
tonight      420
bihday       406
summer       402
saturday     323
monday       242
america      218
night        198
two          183
days         179
weekend      172
london       169
thursday     165
Name: ner, dtype: int64

### TOP 20 NER TYPES

In [236]:
df_all_NERS.ner_type.value_counts()[:20]

PERSON         11562
DATE           10821
ORG             8108
GPE             5415
TIME            2302
NORP            1701
CARDINAL        1079
ORDINAL          585
FAC              371
PRODUCT          326
LOC              284
EVENT            220
WORK_OF_ART      106
QUANTITY          55
LANGUAGE          40
MONEY             20
LAW               11
PERCENT            4
Name: ner_type, dtype: int64

#### TOP 20 PERSON

In [15]:
# топ 20 трайн
df_train_NERS[df_train_NERS.ner_type == 'PERSON'].ner.value_counts()[:20]

bing bong bing bong              75
suppo                            32
melancholy melancholymusic       31
hillary                          29
sta                              26
bjp                              25
hu                               24
christina grimmie                22
obama                            20
tgif ff                          18
jo cox                           18
gamedev indiedev indiegamedev    17
lebron                           16
detoxdiet altwaystoheal          15
donald trump                     15
heabroken                        15
ali                              13
anne hathaway                    12
th bihday                        12
karen iqbal                      12
Name: ner, dtype: int64

In [16]:
# топ 20 тест
df_test_NERS[df_test_NERS.ner_type == 'PERSON'].ner.value_counts()[:20]

obama                                                      26
feminismiscancer feminismisterrorism feminismmuktbharat    20
sea shepherd suppoers                                      17
boricua                                                    14
carl paladino                                              11
michelle obama                                              8
hillary                                                     8
boricua miami                                               6
chick                                                       6
endof tedtalks                                              4
jeffsessions                                                4
putin                                                       4
tyler perry                                                 4
donkey zionazis                                             4
suppo                                                       4
putinspuppet                                                4
adam sal

#### TOP 20 TYPES

In [17]:
df_train_NERS.ner_type.value_counts()[:20]

DATE           7041
PERSON         6835
ORG            4617
GPE            3240
TIME           1488
NORP            708
CARDINAL        660
ORDINAL         365
FAC             225
PRODUCT         199
LOC             173
EVENT           127
WORK_OF_ART      62
QUANTITY         37
LANGUAGE         24
LAW               8
MONEY             6
PERCENT           3
Name: ner_type, dtype: int64

In [18]:
df_test_NERS.ner_type.value_counts()[:20]

ORG            645
PERSON         628
GPE            375
NORP           374
DATE           134
CARDINAL        48
TIME            17
LOC             16
EVENT           14
ORDINAL         13
PRODUCT          7
WORK_OF_ART      5
FAC              4
LANGUAGE         4
MONEY            3
Name: ner_type, dtype: int64

### 2. NLTK

In [20]:
import nltk
if 0:
    nltk.download('averaged_perceptron_tagger')
    nltk.download('maxent_ne_chunker')
    nltk.download('words')

In [21]:
def nltk_ner(text:str)-> object:
    
    if not text:
        return None
    # тк зависит от регистра - переведем все в верхний
    ntxt = text.upper()
    nltk_chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(ntxt)))
    return [(' '.join(c[0] for c in chunk), chunk.label()) for chunk in nltk_chunks if hasattr(chunk, 'label')]
    

In [22]:
%%time
# применим NER распозновалку к каждому твиту
df_prep['nltk_ents'] = df_prep['clean_tweet'].progress_apply(nltk_ner)

100%|██████████| 49159/49159 [06:37<00:00, 123.66it/s]

CPU times: user 6min 36s, sys: 4.76 s, total: 6min 41s
Wall time: 6min 37s





In [23]:
df_prep['nltk_ents'].head()

0       [(FATHER, ORGANIZATION), (INTO, ORGANIZATION)]
1    [(THANKS, ORGANIZATION), (FOR, ORGANIZATION), ...
2                                                   []
3    [(MODEL, ORGANIZATION), (LOVE, ORGANIZATION), ...
4    [(FACTSGUIDE, ORGANIZATION), (SOCIETY, ORGANIZ...
Name: nltk_ents, dtype: object

In [24]:
# Создадим массив ner с типом и преобразуем в датафрейм
def get_nltk_ner_dataframe(in_spacy_ents: pd.Series)-> pd.DataFrame: 
    word_list = []

    def add_to_word_list(ents):
        if not ents:
            return None
        
        for ent in ents:
            word_list.append((ent[0], ent[1]))
        
    _ = in_spacy_ents.progress_apply(add_to_word_list)
    
    return pd.DataFrame(word_list, columns=['ner', 'ner_type'])

In [26]:
df_nltk_ner = get_nltk_ner_dataframe(df_prep['nltk_ents'])
df_train_nltk_ner = get_nltk_ner_dataframe(df_prep.loc[df_prep.label < 1, 'nltk_ents'])
df_test_nltk_ner = get_nltk_ner_dataframe(df_prep.loc[df_prep.label == 1, 'nltk_ents'])

100%|██████████| 49159/49159 [00:00<00:00, 319814.29it/s]
100%|██████████| 29720/29720 [00:00<00:00, 320905.34it/s]
100%|██████████| 2242/2242 [00:00<00:00, 255971.62it/s]


### TOP 20 NER

In [28]:
df_nltk_ner.ner.value_counts()[:20]

THE          5833
YOU          1256
LOVE         1138
NOT          1090
TO           1050
WITH          963
HAPPY         798
TIME          753
FOR           665
FAMILY        650
NEW           585
ARE           545
HAVE          539
THIS          538
MODEL         504
HAPPINESS     495
YOUR          442
GOLD          434
GOOD          393
OF            385
Name: ner, dtype: int64

### TOP 20 NER TYPES

In [29]:
df_nltk_ner.ner_type.value_counts()[:20]

ORGANIZATION    103397
GPE                877
PERSON             410
GSP                198
FACILITY            61
Name: ner_type, dtype: int64

#### TOP 20 PERSON

In [30]:
df_train_nltk_ner[df_train_nltk_ner.ner_type == 'PERSON'].ner.value_counts()[:20]

IN                26
CLINTON           16
SORRY             13
CLICK TO WATCH     8
CLICK              7
CLICK RECIPE       6
STEPHCURRY         4
JOHN               4
STEAK              3
CURRY              3
ANTON              2
JOHN HUNTERS       2
STEPH              2
BILL               2
JOHN BURR          2
DE                 2
KIM                2
ALBUM ON           2
JOHN WOODEN        2
TO THE             2
Name: ner, dtype: int64

In [31]:
df_test_nltk_ner[df_test_nltk_ner.ner_type == 'PERSON'].ner.value_counts()[:20]

PAUL                  3
IN                    2
ARABS                 2
JIMMY CAER            2
OPKKK OPTRUMP         2
ELECT                 2
PERRY SAYS            2
DONALD TRUMP          1
CLINTON TRUMP         1
IN COLOGNE            1
SADDLES ON            1
JACKSON WAS           1
BARRY PROBABLY        1
ELECT TO DENOUNCE     1
CLINTON HATES         1
ARABIAN SPICED        1
JOHN                  1
WOODROWWILSON HELD    1
CLINTON BLACK         1
ARABIC                1
Name: ner, dtype: int64

#### TOP 20 TYPES

In [33]:
df_train_nltk_ner.ner_type.value_counts()[:20]

ORGANIZATION    61458
GPE               496
PERSON            236
GSP               124
FACILITY           16
Name: ner_type, dtype: int64

In [34]:
df_test_nltk_ner.ner_type.value_counts()[:20]

ORGANIZATION    5783
GPE               57
PERSON            30
FACILITY          15
GSP               11
Name: ner_type, dtype: int64

## Эксперимент 2 - NLTK сразу весь текст 

In [35]:
# тк зависит от регистра - переведем все в верхний
nltk_text = clean_tweets_text.upper()

In [36]:
%%time
nltk_tags = nltk.pos_tag(nltk.word_tokenize(nltk_text))

CPU times: user 1min 19s, sys: 102 ms, total: 1min 19s
Wall time: 1min 19s


In [37]:
%%time
nltk_chunks = nltk.ne_chunk(nltk_tags)

CPU times: user 4min 38s, sys: 50.4 ms, total: 4min 38s
Wall time: 4min 38s


In [38]:
word_list = [(' '.join(c[0] for c in chunk), chunk.label()) for chunk in nltk_chunks if hasattr(chunk, 'label')]

In [39]:
df_nltk_ner1 = pd.DataFrame(word_list, columns=['ner', 'ner_type'])

In [40]:
df_nltk_ner1.ner_type.value_counts()

ORGANIZATION    65795
PERSON            269
GPE                40
FACILITY            3
GSP                 2
Name: ner_type, dtype: int64

In [41]:
df_nltk_ner1[df_nltk_ner.ner_type == 'PERSON'].ner.value_counts()[:20]

THE          19
HAPPINESS     4
TO            4
WITH          3
WORLD         3
CITY          3
YOUR          2
ABOUT         2
NOT           2
FAMILY        2
TWEETS        2
THEY          2
AN            2
COMPLETED     2
THIS          2
BECAUSE       2
LOOKS         2
SO            2
HARD          2
IMPOANT       2
Name: ner, dtype: int64

### 3. Какая из библиотек по вашему лучше отработала? 

Сравните качество полученных most_common NER и количество распознаных NER.



Мне по качеству распознования больше понравилась Spacy. Но в целом обе библиотеки показали схожие результаты. 

Вопрос только по NLTK по типам NER - как то их очень мало получилось. Я что то не учел?