## Тема “Создание признакового пространства и NER - продолжение”

In [1]:
import warnings
# Suppress every warning category globally; note this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [185]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
from tqdm import tqdm

# Register tqdm with pandas; the `progress_apply` calls further down rely on it.
tqdm.pandas()

# Location of the pickled, pre-processed tweets dataset.
PREP_DATA = '../data/prep_tweets.pkl'

#### Загрузим подготовленный датасет твитов

In [3]:
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it was produced by a trusted preprocessing step.
df_prep = pd.read_pickle(PREP_DATA)
# Preview the first rows to check the available columns.
df_prep.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, cause, offer, wheel..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


In [5]:
# Build one long text by joining all cleaned tweets with single spaces
clean_tweets_text = " ".join(df_prep['clean_tweet'])
# Show only the first 200 characters as a sanity check.
clean_tweets_text[:200]

'when father is dysfunctional and is so selfish he drags his kids into his dysfunction run thanks for lyft credit cannot use cause they do not offer wheelchair vans in pdx disapointed getthanked bihday'

### 1. Spacy

In [12]:
# One-off environment setup, disabled by default: change the guard to a truthy
# value to install/upgrade spaCy, download the `en_core_web_md` model and
# print the spaCy installation info.
if 0:
    !pip install -U spacy
    !python -m spacy download en_core_web_md
    !python -m spacy info
    
import spacy
from spacy import displacy
import en_core_web_md


In [27]:
# Use the joined tweet text as the input for the spaCy experiments below.
spacy_text = clean_tweets_text

In [29]:
%%time
# Load the `en_core_web_md` pipeline; `nlp` is reused by later cells.
nlp = en_core_web_md.load()
# Process only the first 1000 characters of the joined text.
article = nlp(spacy_text[:1000])
# Render the recognised entities inline in the notebook.
displacy.render(article, jupyter=True, style='ent')

CPU times: user 6.58 s, sys: 342 ms, total: 6.92 s
Wall time: 6.9 s


In [30]:
# Dependency tree for the first 10 tokens of the processed sample
displacy.render(article[:10], style='dep', jupyter=True)

In [66]:
# The full joined text is too large to feed to spaCy in one call
len(spacy_text)

3455050

In [189]:
%%time
# применим NER распозновалку к каждому твиту
df_prep['spacy_ents'] = df_prep['clean_tweet'].progress_apply(lambda x: nlp(x).ents)

100%|██████████| 49159/49159 [14:32<00:00, 56.32it/s]

CPU times: user 14min 34s, sys: 5.25 s, total: 14min 39s
Wall time: 14min 32s





In [242]:
# Collect the recognised entities with their types and convert them to a DataFrame
def get_ner_dataframe(in_scasy_ents: pd.Series) -> pd.DataFrame:
    """Flatten per-tweet entity collections into a single table.

    Parameters
    ----------
    in_scasy_ents : pd.Series
        Each element is an iterable of entity objects exposing ``.text`` and
        ``.label_`` (e.g. the ``Doc.ents`` tuples stored in ``spacy_ents``).

    Returns
    -------
    pd.DataFrame
        One row per entity occurrence, in input order, with the columns
        ``ner`` (entity text) and ``ner_type`` (entity label). An empty input
        yields an empty frame that still has both columns.
    """
    # A plain comprehension is used instead of `progress_apply` with a
    # side-effecting callback, so the function does not depend on
    # `tqdm.pandas()` having been executed earlier in the session.
    word_list = [
        (ent.text, ent.label_)
        for ents in in_scasy_ents
        for ent in ents
    ]

    return pd.DataFrame(word_list, columns=['ner', 'ner_type'])

In [243]:
# Entities from every tweet, regardless of label.
df_all_NERS = get_ner_dataframe(df_prep['spacy_ents'])
# NOTE(review): despite the "train"/"test" names, these two frames split the
# data by label value, not by a train/test partition: `label < 1` vs
# `label == 1`. Rows with a missing label match neither filter.
df_train_NERS = get_ner_dataframe(df_prep.loc[df_prep.label < 1, 'spacy_ents'])
df_test_NERS = get_ner_dataframe(df_prep.loc[df_prep.label == 1, 'spacy_ents'])

100%|██████████| 49159/49159 [00:00<00:00, 100763.34it/s]
100%|██████████| 29720/29720 [00:00<00:00, 99390.61it/s]
100%|██████████| 2242/2242 [00:00<00:00, 88335.98it/s]


### TOP 20 NER

In [233]:
# Top 20 most frequent entities across all tweets
df_all_NERS['ner'].value_counts().head(20)

today       1397
tomorrow     597
friday       588
one          518
first        514
orlando      473
sunday       466
morning      432
tonight      420
bihday       406
summer       402
saturday     323
monday       242
america      218
night        198
two          183
days         179
weekend      172
london       169
thursday     165
Name: ner, dtype: int64

In [234]:
# Top 20 most frequent entities in the `df_train_NERS` subset
df_train_NERS['ner'].value_counts().head(20)

today       900
tomorrow    393
friday      390
first       323
one         322
orlando     319
sunday      303
morning     291
tonight     266
summer      263
bihday      261
saturday    211
monday      155
night       136
days        119
two         116
thursday    114
london      109
america     108
weekend     108
Name: ner, dtype: int64

In [235]:
# Top 20 most frequent entities in the `df_test_NERS` subset
df_test_NERS['ner'].value_counts().head(20)

sjw                                                        73
allahsoil                                                  52
america                                                    51
obama                                                      34
tampa                                                      32
miami                                                      30
hispanic                                                   29
calgary                                                    27
sikh                                                       27
wso                                                        26
one                                                        21
daily                                                      20
feminismiscancer feminismisterrorism feminismmuktbharat    20
americans                                                  17
trump                                                      17
sea shepherd suppoers                                      17
nazi    

PERSON         11562
DATE           10821
ORG             8108
GPE             5415
TIME            2302
NORP            1701
CARDINAL        1079
ORDINAL          585
FAC              371
PRODUCT          326
LOC              284
EVENT            220
WORK_OF_ART      106
QUANTITY          55
LANGUAGE          40
MONEY             20
LAW               11
PERCENT            4
Name: ner_type, dtype: int64

### TOP 20 NER TYPES

In [236]:
# Most frequent entity types across all tweets (at most 20 shown)
df_all_NERS['ner_type'].value_counts().head(20)

PERSON         11562
DATE           10821
ORG             8108
GPE             5415
TIME            2302
NORP            1701
CARDINAL        1079
ORDINAL          585
FAC              371
PRODUCT          326
LOC              284
EVENT            220
WORK_OF_ART      106
QUANTITY          55
LANGUAGE          40
MONEY             20
LAW               11
PERCENT            4
Name: ner_type, dtype: int64

In [238]:
# Most frequent entity types in the `df_train_NERS` subset (at most 20 shown)
df_train_NERS['ner_type'].value_counts().head(20)

DATE           7041
PERSON         6835
ORG            4617
GPE            3240
TIME           1488
NORP            708
CARDINAL        660
ORDINAL         365
FAC             225
PRODUCT         199
LOC             173
EVENT           127
WORK_OF_ART      62
QUANTITY         37
LANGUAGE         24
LAW               8
MONEY             6
PERCENT           3
Name: ner_type, dtype: int64

In [239]:
# Most frequent entity types in the `df_test_NERS` subset (at most 20 shown)
df_test_NERS['ner_type'].value_counts().head(20)

ORG            645
PERSON         628
GPE            375
NORP           374
DATE           134
CARDINAL        48
TIME            17
LOC             16
EVENT           14
ORDINAL         13
PRODUCT          7
WORK_OF_ART      5
LANGUAGE         4
FAC              4
MONEY            3
Name: ner_type, dtype: int64

### 2. NLTK

### 3. Какая из библиотек, по-вашему, лучше отработала?

Сравните качество полученных most_common NER и количество распознанных NER.

