## Тема “Создание признакового пространства”

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np

PREP_DATA = '../data/prep_tweets.pkl'

#### Загрузим подготовленный датасет твитов

In [3]:
# Load the prepared tweets dataset from the pickle file referenced by PREP_DATA.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df_prep = pd.read_pickle(PREP_DATA)
df_prep.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, cause, offer, wheel..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


### 1. Создайте мешок слов с помощью CountVectorizer

In [4]:
# Glue every token list back into a single space-separated string per tweet,
# once for the stemmed tokens and once for the lemmatized tokens.
stemmed_docs = df_prep['tweet_stemmed'].map(' '.join)
lemmatized_docs = df_prep['tweet_lemmatized'].map(' '.join)

In [5]:
def _make_bagbag_of_words(vectorizer: object, docs: list) -> pd.DataFrame:
    """Fit ``vectorizer`` on ``docs`` and return the document-term matrix as a DataFrame.

    Parameters
    ----------
    vectorizer
        Object exposing ``fit_transform(docs)`` plus ``get_feature_names_out()``
        (or the legacy ``get_feature_names()``), e.g. a scikit-learn
        ``CountVectorizer`` or ``TfidfVectorizer``.
    docs
        Collection of documents passed straight to ``vectorizer.fit_transform``.

    Returns
    -------
    pd.DataFrame
        Dense matrix with one row per document and one column per feature name
        reported by the vectorizer.
    """
    bag_of_words = vectorizer.fit_transform(docs)

    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new accessor and fall back for old releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # toarray() already materialises a fresh dense array, so no extra copy is needed.
    return pd.DataFrame(bag_of_words.toarray(), columns=feature_names)


def make_bagbag_of_words_simple(docs: list) -> object:
    """Return a term-count Bag-of-Words DataFrame for ``docs``.

    The counts come from a ``CountVectorizer`` configured with ``max_df=0.9``,
    ``max_features=1000`` and the built-in English stop-word list.
    """
    settings = dict(max_df=0.9, max_features=1000, stop_words='english')
    return _make_bagbag_of_words(CountVectorizer(**settings), docs)


In [6]:
# Term-count Bag-of-Words built from the stemmed tweets; show the first three rows.
stemmed_bow = make_bagbag_of_words_simple(stemmed_docs)
stemmed_bow.head(3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
stemmed_bow.sum().sum()

216726

In [8]:
# Term-count Bag-of-Words built from the lemmatized tweets; show the first three rows.
lemmatized_bow = make_bagbag_of_words_simple(lemmatized_docs)
lemmatized_bow.head(3)

Unnamed: 0,able,absolutely,accept,account,act,action,actor,actually,adapt,add,...,yesterday,yo,yoga,york,young,youth,youtube,yr,yrs,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
lemmatized_bow.sum().sum()

204283

### 2. Создайте мешок слов с помощью TfidfVectorizer

In [10]:
def make_bagbag_of_words_tfidf(docs: list) -> object:
    """Return a TF-IDF weighted Bag-of-Words DataFrame for ``docs``.

    The weights come from a ``TfidfVectorizer`` configured with ``max_df=0.9``,
    ``max_features=1000`` and the built-in English stop-word list.
    """
    settings = dict(max_df=0.9, max_features=1000, stop_words='english')
    return _make_bagbag_of_words(TfidfVectorizer(**settings), docs)


In [11]:
# TF-IDF features built from the stemmed tweets; show the first three rows.
stemmed_tfidf = make_bagbag_of_words_tfidf(stemmed_docs)
stemmed_tfidf.head(3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
stemmed_tfidf.sum().sum()

93434.75975058634

In [13]:
# TF-IDF features built from the lemmatized tweets; show the first three rows.
lemmatized_tfidf = make_bagbag_of_words_tfidf(lemmatized_docs)
lemmatized_tfidf.head(3)

Unnamed: 0,able,absolutely,accept,account,act,action,actor,actually,adapt,add,...,yesterday,yo,yoga,york,young,youth,youtube,yr,yrs,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
lemmatized_tfidf.sum().sum()

90534.85481794423

### 3. Натренируем gensim.models.Word2Vec модель на наших данных

In [15]:
# !pip install gensim

In [16]:
from gensim.models import Word2Vec

In [17]:
# Word2Vec is fed the 'tweet_token' column: one list of tokens per tweet.
tokenized_docs = df_prep['tweet_token']
tokenized_docs.head()

0    [when, father, is, dysfunctional, and, is, so,...
1    [thanks, for, lyft, credit, can, not, use, cau...
2                              [bihday, your, majesty]
3    [model, love, you, take, with, you, all, the, ...
4               [factsguide, society, now, motivation]
Name: tweet_token, dtype: object

In [18]:
%%time
# Create the Word2Vec model from the tokenized tweets:
# 200-dimensional vectors, context window of 5, words seen fewer than 2 times are dropped,
# skip-gram architecture (sg=1) with negative sampling (hs=0, negative=10).
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size` — confirm the installed version before re-running this cell.
# NOTE(review): per the gensim docs, `seed` only yields reproducible vectors with
# workers=1 and a fixed PYTHONHASHSEED; with workers=32 results may differ between runs.
model_w2v = Word2Vec(tokenized_docs, 
              size=200, 
              window=5, 
              min_count=2, 
              sg = 1, 
              hs = 0, 
              negative = 10, 
              workers= 32, 
              seed = 34)

CPU times: user 1min 10s, sys: 92.6 ms, total: 1min 10s
Wall time: 25.7 s


In [None]:
%%time
# Run 20 training epochs over the same corpus; `tokenized_docs.size` is the
# number of tweets in the Series and is passed as the example count.
# NOTE(review): presumably this continues training on top of what the constructor
# already did when it was given the corpus — confirm that the extra pass is intended.
model_w2v.train(tokenized_docs, total_examples=tokenized_docs.size, epochs=20)

### Потестируем нашу модель Word2Vec

In [None]:
# Print the word closest to 'dinner', then its three nearest neighbours.
# Similarity queries are made on the KeyedVectors object (`model_w2v.wv`): calling
# them on the Word2Vec model itself was deprecated and is gone in gensim 4.0.
result = model_w2v.wv.most_similar(positive=['dinner'])
print("{}: {:.4f}".format(*result[0]))
print(model_w2v.wv.similar_by_word("dinner", topn=3))

In [None]:
# Print the word closest to 'trump', then its three nearest neighbours.
# Similarity queries are made on the KeyedVectors object (`model_w2v.wv`): calling
# them on the Word2Vec model itself was deprecated and is gone in gensim 4.0.
result = model_w2v.wv.most_similar(positive=['trump'])
print("{}: {:.4f}".format(*result[0]))
print(model_w2v.wv.similar_by_word("trump", topn=3))

### Проверим векторное представление

In [None]:
# Word vectors are read from the KeyedVectors object; indexing the Word2Vec model
# directly is not supported in gensim 4.0+.
model_w2v.wv['food']

In [None]:
# Count the tweets whose token list contains the literal token 'disapointed'.
# NOTE(review): the token is spelled with a single 'p' — confirm this is the intended query.
df_prep['tweet_token'].apply(lambda x: 'disapointed' in x).sum()

In [None]:
# Number of components in a single word vector (read through `.wv`, which works on
# both gensim 3.x and 4.x).
model_w2v.wv['food'].size

### Необходимо создать вектор для каждого твита

In [None]:
# Look up the vector of every known word in a tweet and average them.
def vectorize_tweet(words: list, model=None) -> "np.ndarray | None":
    """Return the mean word vector of ``words``, or ``None`` if no word is known.

    Parameters
    ----------
    words
        Tokens of a single tweet.
    model
        Optional source of word vectors: either an object with a ``wv``
        attribute (such as a trained ``Word2Vec`` model) or any mapping-like
        object that raises ``KeyError`` for unknown words. Defaults to the
        notebook-level ``model_w2v``.

    Returns
    -------
    np.ndarray or None
        Element-wise average of the vectors of all in-vocabulary tokens;
        ``None`` when ``words`` is empty or none of its tokens has a vector.
    """
    if model is None:
        model = model_w2v
    # Vectors are stored on `.wv`; indexing the Word2Vec model itself no longer
    # works in gensim 4.0+. Objects without `.wv` are used as the lookup directly.
    keyed_vectors = getattr(model, "wv", model)

    vectors = []
    for word in words:
        try:
            vectors.append(keyed_vectors[word])
        except KeyError:
            # Out-of-vocabulary token: it simply does not contribute to the mean.
            continue

    if not vectors:
        return None
    return np.mean(vectors, axis=0)

In [None]:
%%time
# One entry per tweet: the averaged word vector returned by vectorize_tweet,
# or None when none of the tweet's tokens has a vector.
v2w_tweets = tokenized_docs.apply(vectorize_tweet)


In [None]:
v2w_tweets.shape

In [None]:
# Count the tweets for which no vector could be built
# (none of their words are in the model vocabulary) ...
print(v2w_tweets.isna().sum())
# ... then keep only the tweets that do have a vector.
v2w_tweets = v2w_tweets.dropna()
v2w_tweets.shape

In [None]:
# Stack the per-tweet vectors into a 2-D table: one row per tweet, one column per
# vector component. The Series index is carried over so that rows can still be
# matched to df_prep (e.g. to the labels) after tweets without a vector were dropped;
# from_records on the Series would have replaced it with a fresh 0..n-1 index.
wordvec_df = pd.DataFrame(np.vstack(v2w_tweets.tolist()), index=v2w_tweets.index)
wordvec_df.head(3)

In [None]:
wordvec_df.shape

In [None]:
wordvec_df.info()