## Тема “Создание признакового пространства”

In [74]:
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides DeprecationWarning/FutureWarning raised by the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [140]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np

PREP_DATA = '../data/prep_tweets.pkl'

#### Загрузим подготовленный датасет твитов

In [76]:
# Load the preprocessed tweets from the pickle at PREP_DATA and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df_prep = pd.read_pickle(PREP_DATA)
df_prep.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, cause, offer, wheel..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


### 1. Создайте мешок слов с помощью CountVectorizer

In [77]:
# Vectorizers expect one string per document, so glue every token list
# back into a single space-separated string.
join_tokens = ' '.join

stemmed_docs = df_prep['tweet_stemmed'].map(join_tokens)
lemmatized_docs = df_prep['tweet_lemmatized'].map(join_tokens)

In [78]:
def _make_bagbag_of_words(vectorizer: object, docs: list) -> pd.DataFrame:
    """Fit ``vectorizer`` on ``docs`` and return the document-term matrix as a DataFrame.

    Args:
        vectorizer: An object exposing ``fit_transform(docs)`` (whose result has
            ``toarray()``) and either ``get_feature_names_out()`` or the legacy
            ``get_feature_names()``, e.g. a scikit-learn text vectorizer.
        docs: Iterable of documents, one string per document.

    Returns:
        A dense DataFrame with one row per document and one column per
        vocabulary term, in the vectorizer's column order.
    """
    bag_of_words = vectorizer.fit_transform(docs)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only for old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Present the Bag-of-Words model as a DataFrame (toarray() already yields a
    # fresh dense array, so no extra copy is needed).
    return pd.DataFrame(bag_of_words.toarray(), columns=list(feature_names))


def make_bagbag_of_words_simple(docs: list, max_df: float = 0.9, max_features: int = 1000,
                                stop_words='english') -> object:
    """Build a raw term-count Bag-of-Words table for ``docs`` with ``CountVectorizer``.

    Args:
        docs: Iterable of documents, one string per document.
        max_df: Passed to ``CountVectorizer(max_df=...)``; defaults to 0.9.
        max_features: Passed to ``CountVectorizer(max_features=...)``; defaults to 1000.
        stop_words: Passed to ``CountVectorizer(stop_words=...)``; defaults to 'english'.

    Returns:
        The DataFrame produced by ``_make_bagbag_of_words`` for the fitted vectorizer.
    """
    count_vectorizer = CountVectorizer(max_df=max_df, max_features=max_features, stop_words=stop_words)
    return _make_bagbag_of_words(count_vectorizer, docs)


In [79]:
#stemmed
stemmed_bow = make_bagbag_of_words_simple(stemmed_docs)
stemmed_bow.head(3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
stemmed_bow.sum().sum()

216726

In [81]:
#lemmatized
lemmatized_bow = make_bagbag_of_words_simple(lemmatized_docs)
lemmatized_bow.head(3)

Unnamed: 0,able,absolutely,accept,account,act,action,actor,actually,adapt,add,...,yesterday,yo,yoga,york,young,youth,youtube,yr,yrs,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
lemmatized_bow.sum().sum()

204283

### 2. Создайте мешок слов с помощью TfidfVectorizer

In [88]:
def make_bagbag_of_words_tfidf(docs: list, max_df: float = 0.9, max_features: int = 1000,
                               stop_words='english') -> object:
    """Build a TF-IDF weighted Bag-of-Words table for ``docs`` with ``TfidfVectorizer``.

    Args:
        docs: Iterable of documents, one string per document.
        max_df: Passed to ``TfidfVectorizer(max_df=...)``; defaults to 0.9.
        max_features: Passed to ``TfidfVectorizer(max_features=...)``; defaults to 1000.
        stop_words: Passed to ``TfidfVectorizer(stop_words=...)``; defaults to 'english'.

    Returns:
        The DataFrame produced by ``_make_bagbag_of_words`` for the fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(max_df=max_df, max_features=max_features, stop_words=stop_words)
    return _make_bagbag_of_words(vectorizer, docs)


In [89]:
#stemmed
stemmed_tfidf = make_bagbag_of_words_tfidf(stemmed_docs)
stemmed_tfidf.head(3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
stemmed_tfidf.sum().sum()

93434.75975058634

In [91]:
#lemmatized
lemmatized_tfidf = make_bagbag_of_words_tfidf(lemmatized_docs)
lemmatized_tfidf.head(3)

Unnamed: 0,able,absolutely,accept,account,act,action,actor,actually,adapt,add,...,yesterday,yo,yoga,york,young,youth,youtube,yr,yrs,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
lemmatized_tfidf.sum().sum()

90534.85481794423

### 3. Натренируем gensim.models.Word2Vec модель на наших данных

In [95]:
# !pip install gensim

In [206]:
from gensim.models import Word2Vec

In [207]:
tokenized_docs = df_prep['tweet_token']
tokenized_docs.head()

0    [when, father, is, dysfunctional, and, is, so,...
1    [thanks, for, lyft, credit, can, not, use, cau...
2                              [bihday, your, majesty]
3    [model, love, you, take, with, you, all, the, ...
4               [factsguide, society, now, motivation]
Name: tweet_token, dtype: object

In [220]:
%%time
# Build a Word2Vec model on the tokenized tweets: skip-gram (sg=1), negative
# sampling instead of hierarchical softmax (hs=0, negative=10), context window
# of 5 tokens, tokens occurring fewer than 2 times are ignored (min_count=2).
# NOTE(review): `size` is the gensim 3.x keyword for the embedding dimension;
# gensim >= 4.0 renamed it to `vector_size`, so this call must be updated when
# gensim is upgraded.
# NOTE(review): with workers > 1 a fixed `seed` alone does not guarantee
# bit-for-bit reproducible vectors — confirm whether that matters here.
model_w2v = Word2Vec(tokenized_docs, 
              size=200, 
              window=5, 
              min_count=2, 
              sg = 1, 
              hs = 0, 
              negative = 10, 
              workers= 32, 
              seed = 34)

CPU times: user 1min 4s, sys: 164 ms, total: 1min 4s
Wall time: 23.3 s


In [231]:
%%time
# Run 20 training epochs over the same corpus; total_examples is the number of tweets.
# NOTE(review): the corpus was already passed to the Word2Vec constructor, which
# presumably trained the model once with the default epoch count — confirm that
# this additional training pass is intended.
model_w2v.train(tokenized_docs, total_examples=tokenized_docs.size, epochs=20)

CPU times: user 3min 34s, sys: 395 ms, total: 3min 34s
Wall time: 55.5 s


(9142127, 11726520)

### Потестируем нашу модель Word2Vec

In [232]:
# Print the word closest to 'dinner', then the top-3 neighbours.
# Similarity queries live on the KeyedVectors (`.wv`); the model-level
# shortcuts are deprecated in gensim 3.x and removed in gensim 4.
result = model_w2v.wv.most_similar(positive=['dinner'])
print("{}: {:.4f}".format(*result[0]))
print(model_w2v.wv.similar_by_word("dinner", topn=3))

bihdaydinner: 0.5136
[('bihdaydinner', 0.5135906934738159), ('bolognese', 0.5046185255050659), ('spaghetti', 0.4828481674194336)]


In [233]:
# Print the word closest to 'trump', then the top-3 neighbours.
# Similarity queries live on the KeyedVectors (`.wv`); the model-level
# shortcuts are deprecated in gensim 3.x and removed in gensim 4.
result = model_w2v.wv.most_similar(positive=['trump'])
print("{}: {:.4f}".format(*result[0]))
print(model_w2v.wv.similar_by_word("trump", topn=3))

donald: 0.5253
[('donald', 0.5252756476402283), ('dumptrump', 0.5097389221191406), ('impeachment', 0.5059808492660522)]


### Проверим векторное представление

In [235]:
# Look the embedding up through the KeyedVectors (`.wv`); indexing the model
# object directly is deprecated in gensim 3.x and removed in gensim 4.
model_w2v.wv['food']

array([-0.05453965,  0.43578646, -0.32321855,  0.12719397, -0.3001119 ,
        0.34627149,  0.6086251 , -0.28485277, -0.4190308 ,  0.1764305 ,
       -1.107064  , -0.13698637,  0.8977579 ,  0.091414  ,  0.32210517,
        0.4963126 ,  0.04173763,  0.18582356, -0.16054681, -0.05413775,
        0.1107304 , -0.2229657 , -0.5314729 , -0.54554015, -0.10198018,
        0.16537338, -0.10190313,  0.03768715,  0.14142726, -0.94008034,
       -0.80502886,  0.32982388,  0.01613015, -0.5596173 , -0.35032305,
        0.27877343, -0.18316564,  0.40505934,  0.09746268, -0.5181812 ,
       -0.23304904, -0.0119802 ,  0.4605858 , -0.10047836,  0.269629  ,
        0.14984307,  0.30088153, -0.92244226,  0.13457018,  0.05147889,
       -0.13373959,  0.77397066,  0.18748955, -0.16660704,  0.32296014,
        0.07679819,  0.04791665,  0.39140555,  0.44024032,  0.19118825,
        0.17425205,  0.20088373, -0.3227794 , -0.14851695,  0.3454507 ,
        0.0327611 ,  0.4996098 , -0.48416063, -0.1470884 , -0.60

In [236]:
# Count the tweets whose token list contains the literal token 'disapointed'
# (spelled exactly as in the membership test below).
df_prep['tweet_token'].apply(lambda x: 'disapointed' in x).sum()

1

In [138]:
# Embedding dimensionality, read via the KeyedVectors (`.wv`); indexing the
# model object directly is deprecated in gensim 3.x and removed in gensim 4.
model_w2v.wv['food'].size

200

### Необходимо создать вектор для каждого твита

In [352]:
def vectorize_tweet(words: list, model=None) -> np.ndarray:
    """Embed a tweet as the mean of the word vectors of its tokens.

    Args:
        words: Tokens of a single tweet.
        model: Optional trained Word2Vec model (anything with a ``.wv``
            attribute) or a KeyedVectors-like mapping. Defaults to the
            notebook-level ``model_w2v``.

    Returns:
        A 1-D array of length ``vector_size``. Tokens missing from the
        vocabulary are skipped; if no token is known (or ``words`` is empty)
        a zero vector is returned so that every tweet gets a vector of the
        same shape instead of a scalar NaN.
    """
    if model is None:
        model = model_w2v
    # Word lookups belong to the KeyedVectors; model[word] is deprecated in
    # gensim 3.x and removed in gensim 4. Accept KeyedVectors directly as well.
    keyed_vectors = getattr(model, 'wv', model)

    vectors = []
    for token in words:
        try:
            vectors.append(keyed_vectors[token])
        except KeyError:
            # Out-of-vocabulary token (e.g. filtered by min_count): ignore it.
            continue

    if not vectors:
        # float32 matches the dtype gensim uses for its word vectors.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)

    return np.average(vectors, axis=0)

In [358]:
%%time
# Apply vectorize_tweet to every tweet's token list; the result is a Series
# holding one value per tweet.
v2w_tweets = tokenized_docs.apply(vectorize_tweet)


CPU times: user 10.6 s, sys: 63.8 ms, total: 10.6 s
Wall time: 10.6 s
