## Тема “Создание признакового пространства”

In [74]:
import warnings
warnings.filterwarnings("ignore")

In [75]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

PREP_DATA = '../data/prep_tweets.pkl'

#### Загрузим подготовленный датасет твиттов

In [76]:
df_prep = pd.read_pickle(PREP_DATA)
df_prep.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, cause, offer, wheel..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


### 1. Создайте мешок слов с помощью CountVectorizer

In [77]:
stemmed_docs = df_prep['tweet_stemmed'].apply(' '.join)

lemmatized_docs = df_prep['tweet_lemmatized'].apply(' '.join)

In [78]:
def _make_bagbag_of_words(vectorizer: object, docs: list) -> object:
    
    bag_of_words = vectorizer.fit_transform(docs)
    
    # Отобразим Bag-of-Words модель как DataFrame
    feature_names = vectorizer.get_feature_names()
    return pd.DataFrame(bag_of_words.toarray(), columns = feature_names).copy()


def make_bagbag_of_words_simple(docs: list) -> object:
   
    count_vectorizer = CountVectorizer(max_df=0.9, max_features = 1000, stop_words='english')
    return _make_bagbag_of_words(count_vectorizer, docs)


In [79]:
#stemmed
stemmed_bow = make_bagbag_of_words_simple(stemmed_docs)
stemmed_bow.head(3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
stemmed_bow.sum().sum()

216726

In [81]:
#lemmatized
lemmatized_bow = make_bagbag_of_words_simple(lemmatized_docs)
lemmatized_bow.head(3)

Unnamed: 0,able,absolutely,accept,account,act,action,actor,actually,adapt,add,...,yesterday,yo,yoga,york,young,youth,youtube,yr,yrs,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
lemmatized_bow.sum().sum()

204283

### 2. Создайте мешок слов с помощью TfidfVectorizer

In [83]:
def make_bagbag_of_words_tfidf(docs: list) -> object:
   
    vectorizer = TfidfVectorizer(max_df=0.9, max_features = 1000, stop_words='english')
    return _make_bagbag_of_words(count_vectorizer, docs)


In [84]:
#stemmed
stemmed_tfidf = make_bagbag_of_words_tfidf(stemmed_docs)
stemmed_tfidf.head(3)

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,year,yesterday,yet,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
stemmed_tfidf.sum().sum()

237191

In [86]:
#lemmatized
lemmatized_tfidf = make_bagbag_of_words_tfidf(lemmatized_docs)
lemmatized_tfidf.head(3)

Unnamed: 0,able,absolutely,accept,account,act,action,actor,actually,adapt,add,...,yesterday,yet,yo,yoga,york,young,youtube,yr,yrs,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
lemmatized_tfidf.sum().sum()

230961

### 3. Натренируем gensim.models.Word2Vec модель на наших данных