## Тема “Создание признакового пространства”

#### Продолжим обработку данных с Твиттера. 

1. Создайте мешок слов с помощью sklearn.feature_extraction.text.CountVectorizer.fit_transform(). Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.
Ограничим количество слов, попадающий в мешок, с помощью max_features = 1000.
Исключим стоп-слова с помощью stop_words='english'. 
Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью CountVectorizer.get_feature_names().
 
2. Создайте мешок слов с помощью sklearn.feature_extraction.text.TfidfVectorizer.fit_transform(). Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.
Ограничим количество слов, попадающий в мешок, с помощью max_features = 1000.
Исключим стоп-слова с помощью stop_words='english'.
Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью TfidfVectorizer.get_feature_names().
 
3. Проверьте ваши векторайзеры на корпусе который использовали на вебинаре, составьте таблицу метод векторизации и скор который вы получили (в методах векторизации по изменяйте параметры что бы добиться лучшего скора) обратите внимание как падает/растёт скор при уменьшении количества фичей, и изменении параметров, так же попробуйте применить к векторайзерам PCA для сокращения размерности посмотрите на качество сделайте выводы


In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
import os

In [2]:
df = pd.read_pickle("../lesson1/data/combine_df.pkl")


Все колонки не нужны, оставим tweet_stemmed и tweet_lemmatized, преобразовав данные из списков слов в строки (так их ждет векторайзер)

In [3]:
df['str_stemmed'] = df['tweet_stemmed'].apply(lambda x: ' '.join(x))

In [4]:
df['str_stemmed'] = df['tweet_stemmed'].apply(lambda x: ' '.join(x))
df['str_lemmatized'] = df['tweet_lemmatized'].apply(lambda x: ' '.join(x))
df = df[['id', 'label', 'str_stemmed', 'str_lemmatized']]
df.head()

Unnamed: 0,id,label,str_stemmed,str_lemmatized
0,1,0.0,father dysfunct selfish drag kid dysfunct run,father dysfunct selfish drag kid dysfunct run
1,2,0.0,thank lyft credit use caus offer wheelchair va...,thank lyft credit use caus offer wheelchair va...
2,3,0.0,bihday majesti,bihday majesti
3,4,0.0,model love take time ur,model love take time ur
4,5,0.0,factsguid societi motiv,factsguid societi motiv


In [5]:
print(np.isnan(df['label']).sum()) 



17197


### Много неразмеченных данных. Заполним нулями.

In [6]:
df['label'] = df['label'].apply(lambda x: 0 if np.isnan(x) else x) 
print(np.isnan(df['label']).sum()) 

0


## 1 CountVectorizer 

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(ngram_range=(1, 1), 
                                   analyzer='word', 
                                   binary=False, 
                                   max_df=0.9, 
                                   max_features=1000,
                                   stop_words='english')

In [8]:
# Создаем the Bag-of-Words модель
bag_of_words_stemmed = count_vectorizer.fit_transform(df['str_stemmed'])

# Отобразим Bag-of-Words модель как DataFrame
feature_names = count_vectorizer.get_feature_names()
stemmed_count_vectorizer_matrix = pd.DataFrame(bag_of_words_stemmed.toarray(), columns = feature_names)
stemmed_count_vectorizer_matrix.head()

Unnamed: 0,00,06,10,100,11,12,13,14,15,16,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Создаем the Bag-of-Words модель
bag_of_words_lemmatized = count_vectorizer.fit_transform(df['str_lemmatized'])

# Отобразим Bag-of-Words модель как DataFrame
feature_names = count_vectorizer.get_feature_names()
lemmatized_count_vectorizer_matrix = pd.DataFrame(bag_of_words_lemmatized.toarray(), columns = feature_names)
lemmatized_count_vectorizer_matrix.head()

Unnamed: 0,00,06,10,100,11,12,13,14,15,16,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2 TfidfVectorizer

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), 
                                   analyzer='word', 
                                   binary=False, 
                                   max_df=0.9, 
                                   max_features=1000,
                                   stop_words='english')

In [11]:
# Создаем the Bag-of-Words модель
bag_of_words_stemmed = tfidf_vectorizer.fit_transform(df['str_stemmed'])

# Отобразим Bag-of-Words модель как DataFrame
feature_names = tfidf_vectorizer.get_feature_names()
stemmed_tfidf_vectorizer_matrix = pd.DataFrame(bag_of_words_stemmed.toarray(), columns = feature_names)
stemmed_tfidf_vectorizer_matrix.head()

Unnamed: 0,00,06,10,100,11,12,13,14,15,16,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Создаем the Bag-of-Words модель
bag_of_words_lemmatized = tfidf_vectorizer.fit_transform(df['str_lemmatized'])

# Отобразим Bag-of-Words модель как DataFrame
feature_names = tfidf_vectorizer.get_feature_names()
lemmatized_tfidf_vectorizer_matrix = pd.DataFrame(bag_of_words_lemmatized.toarray(), columns = feature_names)
lemmatized_tfidf_vectorizer_matrix.head()

Unnamed: 0,00,06,10,100,11,12,13,14,15,16,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# feature_names

## 3. Проверка

In [14]:
from sklearn import model_selection, preprocessing, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score

X_stemmed_count = stemmed_count_vectorizer_matrix.astype('float32')
X_lemmatized_count = lemmatized_count_vectorizer_matrix.astype('float32')
X_lemmatized_tfidf = lemmatized_tfidf_vectorizer_matrix.astype('float32')
X_stemmed_tfidf = stemmed_tfidf_vectorizer_matrix.astype('float32')

y = df['label'].astype('float32')

res_list = []



In [15]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(X_stemmed_count, y)

lin_reg = LinearRegression()
lin_reg.fit(train_x, train_y)
pred_y = lin_reg.predict(valid_x)

res_list.append(['---', 'stemmer', 'count', mean_squared_error(valid_y, pred_y), mean_absolute_error(valid_y, pred_y),
                median_absolute_error(valid_y, pred_y), r2_score(valid_y, pred_y)])

In [16]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(X_lemmatized_count, y)

lin_reg = LinearRegression()
lin_reg.fit(train_x, train_y)
pred_y = lin_reg.predict(valid_x)

res_list.append(['---', 'lemmatizer', 'count', mean_squared_error(valid_y, pred_y), mean_absolute_error(valid_y, pred_y),
                median_absolute_error(valid_y, pred_y), r2_score(valid_y, pred_y)])

In [17]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(X_stemmed_tfidf, y)

lin_reg = LinearRegression()
lin_reg.fit(train_x, train_y)
pred_y = lin_reg.predict(valid_x)

res_list.append(['---', 'stemmer', 'tfidf', mean_squared_error(valid_y, pred_y), mean_absolute_error(valid_y, pred_y),
                median_absolute_error(valid_y, pred_y), r2_score(valid_y, pred_y)])

In [18]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(X_lemmatized_tfidf, y)

lin_reg = LinearRegression()
lin_reg.fit(train_x, train_y)
pred_y = lin_reg.predict(valid_x)

res_list.append(['---', 'lemmatizer', 'tfidf', mean_squared_error(valid_y, pred_y), mean_absolute_error(valid_y, pred_y),
                median_absolute_error(valid_y, pred_y), r2_score(valid_y, pred_y)])

In [19]:
# result = pd.DataFrame(res_list, columns=['note', 'vectorizer', 'stemmer', 'MSE:', 'MAE:', 'MedAE:', 'R2:'])

# result

## Увеличим поличество векторов до 2000, уменьшим до 500

In [20]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), 
                                   analyzer='word', 
                                   binary=False, 
                                   max_df=0.9, 
                                   max_features=500,
                                   stop_words='english')

In [21]:
X_lemmatized_tfidf_500 = lemmatized_tfidf_vectorizer_matrix.astype('float32')

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(X_lemmatized_tfidf_500, y)

lin_reg = LinearRegression()
lin_reg.fit(train_x, train_y)
pred_y = lin_reg.predict(valid_x)

res_list.append(['500 features', 'lemmatizer', 'tfidf', mean_squared_error(valid_y, pred_y), mean_absolute_error(valid_y, pred_y),
                median_absolute_error(valid_y, pred_y), r2_score(valid_y, pred_y)])

In [22]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), 
                                   analyzer='word', 
                                   binary=False, 
                                   max_df=0.9, 
                                   max_features=2000,
                                   stop_words='english')

In [23]:
X_lemmatized_tfidf_2000 = lemmatized_tfidf_vectorizer_matrix.astype('float32')

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(X_lemmatized_tfidf_2000, y)

lin_reg = LinearRegression()
lin_reg.fit(train_x, train_y)
pred_y = lin_reg.predict(valid_x)

res_list.append(['500 features', 'lemmatizer', 'tfidf', mean_squared_error(valid_y, pred_y), mean_absolute_error(valid_y, pred_y),
                median_absolute_error(valid_y, pred_y), r2_score(valid_y, pred_y)])

## Применим PCA

In [24]:
from sklearn import decomposition 
    
pca = decomposition.PCA(n_components=20)
pca.fit(X_lemmatized_tfidf)
X_lemmatized_tfidf_pca = pca.transform(X_lemmatized_tfidf)

In [25]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(X_lemmatized_tfidf_pca, y)

lin_reg = LinearRegression()
lin_reg.fit(train_x, train_y)
pred_y = lin_reg.predict(valid_x)

res_list.append(['PCA 20 features', 'lemmatizer_pair', 'tfidf', mean_squared_error(valid_y, pred_y), mean_absolute_error(valid_y, pred_y),
                median_absolute_error(valid_y, pred_y), r2_score(valid_y, pred_y)])

In [26]:
from sklearn import decomposition 
    
pca = decomposition.PCA(n_components=100)
pca.fit(X_lemmatized_tfidf)
X_lemmatized_tfidf_pca = pca.transform(X_lemmatized_tfidf)

In [27]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(X_lemmatized_tfidf_pca, y)

lin_reg = LinearRegression()
lin_reg.fit(train_x, train_y)
pred_y = lin_reg.predict(valid_x)

res_list.append(['PCA 100 features', 'lemmatizer_pair', 'tfidf', mean_squared_error(valid_y, pred_y), mean_absolute_error(valid_y, pred_y),
                median_absolute_error(valid_y, pred_y), r2_score(valid_y, pred_y)])

In [28]:
result = pd.DataFrame(res_list, columns=['note', 'vectorizer', 'stemmer', 'MSE:', 'MAE:', 'MedAE:', 'R2:'])

result

Unnamed: 0,note,vectorizer,stemmer,MSE:,MAE:,MedAE:,R2:
0,---,stemmer,count,0.034527,0.081554,0.034284,0.203372
1,---,lemmatizer,count,0.035253,0.082778,0.034289,0.217275
2,---,stemmer,tfidf,0.034499,0.081186,0.028601,0.205377
3,---,lemmatizer,tfidf,0.03386,0.081428,0.029609,0.201021
4,500 features,lemmatizer,tfidf,0.035187,0.083253,0.029817,0.228829
5,500 features,lemmatizer,tfidf,0.032586,0.080663,0.030012,0.240378
6,PCA 20 features,lemmatizer_pair,tfidf,0.037643,0.085177,0.054672,0.024284
7,PCA 100 features,lemmatizer_pair,tfidf,0.038566,0.08869,0.043983,0.132357
