### Import

In [1]:
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from gensim.models import Word2Vec

### Task_1

Создайте мешок слов с помощью sklearn.feature_extraction.text.CountVectorizer.fit_transform(). Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
*	Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.
*	Ограничим количество слов, попадающих в мешок, с помощью max_features = 1000.
*	Исключим стоп-слова с помощью stop_words='english'.
*	Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью CountVectorizer.get_feature_names().

In [2]:
# Load the pre-processed tweets produced by an earlier step.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# only open 'preprocessed_tweets.pickle' when it comes from a trusted source.
with open('preprocessed_tweets.pickle', 'rb') as f:
    preprocessed_tweets = pickle.load(f)

In [3]:
# Show the dimensions of the loaded data as a quick sanity check.
preprocessed_tweets.shape

(49159, 8)

In [4]:
# Preview the first three rows to see which token columns are available.
preprocessed_tweets.head(n=3)

Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_source,tweet_lemmatized
0,1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...",when father is dysfunctional and is so selfish...,"[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...",thanks for lyft credit cannot use cause they d...,"[thanks, lyft, credit, use, cause, offer, whee..."
2,3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]",bihday your majesty,"[bihday, majesty]"


In [5]:
# Work on the first 10,000 rows only; .copy() detaches the subset from the
# full frame. Every later cell therefore operates on this subset.
preprocessed_tweets = preprocessed_tweets.iloc[:10000].copy()
()

()

In [6]:
# Count-based bag-of-words settings: word unigrams, English stop words
# removed, terms with a document frequency above 0.9 ignored, and the
# vocabulary capped at 1000 features.
vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_df=0.9,
    max_features=1000,
)

In [7]:
def get_bag_of_words_by_column_name(column, vectorizer, data=None):
    """Fit ``vectorizer`` on one token column and return the document-term matrix.

    Each row of ``data[column]`` is expected to hold the list of string tokens
    of a single tweet. The tokens are joined back into one space-separated
    document per tweet, so the resulting matrix has exactly one row per tweet
    (flattening all tokens into one list would instead treat every single
    token as its own document).

    Parameters
    ----------
    column : str
        Name of the column that holds the per-tweet token lists.
    vectorizer : object
        A scikit-learn style vectorizer exposing ``fit_transform`` and either
        ``get_feature_names_out`` or the legacy ``get_feature_names``.
        It is (re)fitted by this call.
    data : pandas.DataFrame, optional
        Frame to read ``column`` from. Defaults to the notebook-level
        ``preprocessed_tweets``.

    Returns
    -------
    tuple
        ``(bag_of_words, feature_names)`` where ``bag_of_words`` is whatever
        ``vectorizer.fit_transform`` returns and ``feature_names`` is a list
        of the vocabulary terms in column order.
    """
    if data is None:
        data = preprocessed_tweets

    # One document per tweet: rebuild a whitespace-separated string from tokens.
    documents = [' '.join(tokens) for tokens in data[column].values]

    bag_of_words = vectorizer.fit_transform(documents)

    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = list(vectorizer.get_feature_names())

    return bag_of_words, feature_names

# Vectorize the stemmed and the lemmatized token columns in turn. The same
# vectorizer instance is refitted by each call, so the feature names are
# captured right after each fit.
bag_of_words_stemmed, feature_names_stemmed = get_bag_of_words_by_column_name(
    column='tweet_stemmed', vectorizer=vectorizer)
bag_of_words_lemmatized, feature_names_lemmatized = get_bag_of_words_by_column_name(
    column='tweet_lemmatized', vectorizer=vectorizer)

In [8]:
# Densify the stemmed count matrix and label its columns with the vocabulary.
# NOTE: this rebinds bag_of_words_stemmed from a matrix to a DataFrame, so the
# vectorizing cell must be re-run before this cell can be executed again.
bag_of_words_stemmed = pd.DataFrame(
    data=bag_of_words_stemmed.toarray(),
    columns=feature_names_stemmed,
)

bag_of_words_stemmed.head()

Unnamed: 0,aap,abl,absolut,accept,account,act,action,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,young,youth,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Densify the lemmatized count matrix and label its columns with the vocabulary.
# NOTE: this rebinds bag_of_words_lemmatized from a matrix to a DataFrame, so
# the vectorizing cell must be re-run before this cell can be executed again.
bag_of_words_lemmatized = pd.DataFrame(
    data=bag_of_words_lemmatized.toarray(),
    columns=feature_names_lemmatized,
)

bag_of_words_lemmatized.head()

Unnamed: 0,aap,able,absolutely,account,act,action,actor,actually,adapt,add,...,yeah,year,yes,yesterday,yo,yoga,young,youtube,yr,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Task_2

Создайте мешок слов с помощью sklearn.feature_extraction.text.TfidfVectorizer.fit_transform(). Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
*	Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.
*	Ограничим количество слов, попадающих в мешок, с помощью max_features = 1000.
*	Исключим стоп-слова с помощью stop_words='english'.
*	Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью TfidfVectorizer.get_feature_names().

In [10]:
# TF-IDF counterpart of the bag-of-words settings: word unigrams, English
# stop words removed, terms with a document frequency above 0.9 ignored, and
# the vocabulary capped at 1000 features.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_df=0.9,
    max_features=1000,
)

In [11]:
# Repeat the vectorization of both token columns, this time with the TF-IDF
# vectorizer; the earlier count-based results are overwritten.
bag_of_words_stemmed, feature_names_stemmed = get_bag_of_words_by_column_name(
    column='tweet_stemmed', vectorizer=vectorizer)
bag_of_words_lemmatized, feature_names_lemmatized = get_bag_of_words_by_column_name(
    column='tweet_lemmatized', vectorizer=vectorizer)

In [12]:
# Densify the stemmed TF-IDF matrix and label its columns with the vocabulary.
# NOTE: this rebinds bag_of_words_stemmed from a matrix to a DataFrame, so the
# vectorizing cell must be re-run before this cell can be executed again.
bag_of_words_stemmed = pd.DataFrame(
    data=bag_of_words_stemmed.toarray(),
    columns=feature_names_stemmed,
)

bag_of_words_stemmed.head()

Unnamed: 0,aap,abl,absolut,accept,account,act,action,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,young,youth,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Densify the lemmatized TF-IDF matrix and label its columns with the vocabulary.
# NOTE: this rebinds bag_of_words_lemmatized from a matrix to a DataFrame, so
# the vectorizing cell must be re-run before this cell can be executed again.
bag_of_words_lemmatized = pd.DataFrame(
    data=bag_of_words_lemmatized.toarray(),
    columns=feature_names_lemmatized,
)

bag_of_words_lemmatized.head()

Unnamed: 0,aap,able,absolutely,account,act,action,actor,actually,adapt,add,...,yeah,year,yes,yesterday,yo,yoga,young,youtube,yr,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Task_3

Натренируем gensim.models.Word2Vec модель на наших данных.
*	Тренировать будем на токенизированных твитах combine_df['tweet_token']
*	Установим следующие параметры: size=200, window=5, min_count=2, sg = 1, hs = 0, negative = 10, workers= 32, seed = 34.
*	Используем функцию train() с параметром total_examples равным длине combine_df['tweet_token'], количество epochs установим 20.


In [14]:
modelW2V = Word2Vec(sentences=preprocessed_tweets['tweet_token'],
                    size=200, 
                    window=5,
                    min_count=2,
                    sg = 1,
                    hs = 0,
                    negative = 10,
                    workers= 32,
                    seed = 34)

%time modelW2V.train(sentences=preprocessed_tweets['tweet_token'], total_examples=modelW2V.corpus_count, epochs=20)

Wall time: 20.7 s


(1748789, 2401300)

### Task_4

Давайте немного потестируем нашу модель Word2Vec и посмотрим, как она работает. Мы зададим слово positive = "dinner", и модель вытащит из корпуса наиболее похожие слова c помощью функции most_similar. То же самое попробуем со словом "trump".

In [15]:
# Ten nearest neighbours of "dinner" by cosine similarity (topn=10 is the
# library default, spelled out here).
modelW2V.wv.most_similar(positive=['dinner'], negative=[], topn=10)

[('yum', 0.7131447792053223),
 ('cuddles', 0.703732430934906),
 ('celebrations', 0.7027802467346191),
 ('burgers', 0.6971790194511414),
 ('adventures', 0.6968140602111816),
 ('spaghetti', 0.692824125289917),
 ('delicious', 0.6906303763389587),
 ('indonesia', 0.6883400082588196),
 ('goodfriends', 0.6876944303512573),
 ('sizzle', 0.6870667934417725)]

In [16]:
# Ten nearest neighbours of "trump" by cosine similarity (topn=10 is the
# library default, spelled out here).
modelW2V.wv.most_similar(positive=['trump'], negative=[], topn=10)

[('paladino', 0.6132089495658875),
 ('sentence', 0.6101824045181274),
 ('fuhered', 0.598522424697876),
 ('makeamericagreatagain', 0.5952212810516357),
 ('michelle', 0.5921273827552795),
 ('ally', 0.5892149209976196),
 ('carl', 0.5861276388168335),
 ('republican', 0.5853856801986694),
 ('bigot', 0.5835742950439453),
 ('obama', 0.5809466242790222)]

### Task_5

Из приведенных выше примеров мы видим, что наша модель word2vec хорошо справляется с поиском наиболее похожих слов для данного слова. Но как она это делает? Она изучила векторы для каждого уникального слова наших данных и использует косинусное сходство, чтобы найти наиболее похожие векторы (слова).
Давайте проверим векторное представление любого слова из нашего корпуса, например "food".


In [17]:
# Look up the learned embedding vector for the word "food".
modelW2V.wv['food']

array([-0.14039822, -0.27341405, -0.04340313,  0.12114415,  0.1608433 ,
       -0.40633652, -0.166633  ,  0.45642924, -0.11788867,  0.32769233,
       -0.0973983 ,  0.01997017, -0.02217687, -0.2689961 , -0.11332266,
        0.2684986 ,  0.23946719,  0.27677667,  0.096652  , -0.03087369,
       -0.04402805,  0.20057318, -0.255175  ,  0.08254629,  0.16387029,
        0.759173  , -0.10752366,  0.50375724,  0.2783297 , -0.28180802,
       -0.18812907, -0.36689538, -0.2009986 , -0.17646085, -0.47613335,
        0.49207067,  0.08936849, -0.5008136 ,  0.0801122 , -0.44623265,
       -0.39515212, -0.11351427, -0.20132364, -0.35673624, -0.03771339,
       -0.18257959, -0.13106164,  0.396421  , -0.23522   , -0.48553345,
        0.31243217, -0.11469059, -0.01180932, -0.11669469, -0.17518467,
       -0.09934748,  0.32758036,  0.7459076 ,  0.38597414,  0.45526016,
       -0.08103848, -0.04867301,  0.29034093,  0.25527358,  0.4563406 ,
       -0.29469383,  0.4956431 , -0.23957852, -0.06377281,  0.06

### Task_6

Поскольку наши данные содержат твиты, а не только слова, нам придется придумать способ использовать векторы слов из модели word2vec для создания векторного представления всего твита. Существует простое решение этой проблемы: мы можем просто взять среднее значение всех векторов слов, присутствующих в твите. Длина результирующего вектора будет одинаковой, то есть 200. Мы повторим тот же процесс для всех твитов в наших данных и получим их векторы. Теперь у нас есть 200 признаков word2vec для наших данных.
Необходимо создать вектор для каждого твита, взяв среднее значение векторов слов, присутствующих в твите. В цикле сделать:  vec += model_w2v[word].reshape((1, size))
и поделить финальный вектор на количество слов в твите.
На выходе должен получиться wordvec_df.shape = (49159, 200).


In [19]:
def get_vec_by_tweet(tweet, model=None):
    """Return the mean word vector of a tweet as a plain list of floats.

    Parameters
    ----------
    tweet : iterable of str
        Tokens of a single tweet.
    model : object, optional
        Trained embedding model exposing ``model.wv[word]`` and
        ``model.wv.vector_size``. Defaults to the notebook-level ``modelW2V``.

    Returns
    -------
    list of float
        Element-wise average of the vectors of all tokens found in the model
        vocabulary. If no token is in the vocabulary (or the tweet is empty),
        a zero vector is returned instead of dividing by zero and producing NaN.
    """
    if model is None:
        model = modelW2V

    keyed_vectors = model.wv
    # Take the dimensionality from the model rather than hard-coding 200.
    vector = np.zeros(keyed_vectors.vector_size)
    words_counter = 0

    for word in tweet:
        try:
            vector += keyed_vectors[word]
        except KeyError:
            # Token is not in the model vocabulary; leave it out of the mean.
            continue
        words_counter += 1

    if words_counter:
        vector /= words_counter
    return vector.tolist()

# One row per tweet, one column per embedding dimension.
# NOTE(review): modelW2V was trained on 'tweet_token', while the lookups here
# use the tokens of 'tweet_lemmatized'; tokens missing from the model
# vocabulary are left out of the average.
wordvec_df = pd.DataFrame(
    [get_vec_by_tweet(tweet) for tweet in preprocessed_tweets['tweet_lemmatized']]
)
wordvec_df.head()

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.083529,0.186651,-0.151521,0.179136,0.145685,-0.05851,-0.213681,0.349438,-0.0595,0.06158,...,0.334167,-0.077468,0.033498,0.050663,0.061478,0.193505,-0.07719,0.024321,-0.089858,-0.133916
1,-0.066976,0.085508,-0.118966,0.117939,0.056686,0.039291,-0.202806,0.227319,0.11731,0.089611,...,0.342863,-0.022509,-0.033464,-0.080543,0.134408,0.306458,-0.130963,-0.033288,-0.124975,0.036728
2,0.051177,0.338907,-0.148833,-0.245586,0.384201,-0.377272,-0.11414,0.261264,-0.135387,0.006695,...,0.450665,-0.071401,0.275925,0.293299,-0.124057,0.190717,-0.096149,0.16237,0.236093,-0.055047
3,0.295878,0.346695,-0.348219,0.037336,-0.3517,0.146056,-0.168593,0.409233,0.343677,-0.399432,...,0.145061,-0.099427,0.467728,0.217028,0.08886,0.536089,-0.140193,-0.247336,-0.188619,-0.203294
4,0.055379,-0.223584,-0.239282,-0.001742,-0.094719,0.050812,-0.09793,0.218651,0.092532,0.323498,...,0.411779,-0.14149,-0.023522,-0.137062,0.345956,0.175669,0.076918,0.023394,0.065139,-0.002678
