<a href="https://colab.research.google.com/github/WhiteAndBlackFox/nlp/blob/lessons2/Lessons_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [186]:
import pandas as pd
import numpy as np

from sklearn import model_selection, preprocessing, linear_model
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

import pickle

from google.colab import drive 
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Урок 2. Создание признакового пространства

In [168]:
vectors = {
    'train': { 'count': None, 'tf': None },
    'test': { 'count': None, 'tf': None }
}
result_check = {
    "vector": [],
    "score": []
}

### Созданные функции

In [1]:
#@title Загружаем ранее подготовленные данные
def load_df(path_to_df):
  load_df = None
  with open(path_to_df, 'rb') as dump_df:
    load_df = pickle.load(dump_df)
  return load_df


In [9]:
def print_data(comment, data):
  print('-' * 40)
  print(comment)
  print('-' * 40)
  print(data)
  print('-' * 40)

In [144]:
#@title Создание мешка
def create_bag(df, type_bag=None, count_print=5):
  for col in ['clean_tweet_token_stemmed', 'clean_tweet_token_lemmed']:
    df[col + "_str"] = df[col].apply(lambda txt: ' '.join(txt))

  if (type_bag == 'CountVectorized'):
    vector_stemmed = CountVectorizer(
        stop_words='english',
        max_df=0.9,
        max_features=1000
    )

    vector_lemmed = CountVectorizer(
        stop_words='english',
        max_df=0.9,
        max_features=1000
    )
  elif (type_bag == 'tf-idf'):
    vector_stemmed = TfidfVectorizer(
        stop_words='english',
        max_df=0.9,
        max_features=1000
    )

    vector_lemmed = TfidfVectorizer(
        stop_words='english',
        max_df=0.9,
        max_features=1000
    )
  else:
    return None

  print("Selected type bag: " + type_bag)

  # stemmed

  bag_stemmed = vector_stemmed.fit_transform(df['clean_tweet_token_stemmed_str'])
  names_bag_stemmed = vector_stemmed.get_feature_names_out()

  bag_stemmed_df = pd.DataFrame(bag_stemmed.toarray(), columns=names_bag_stemmed)
  print_data("Bag-of-Words from data stemmed", bag_stemmed_df.head(count_print))

  # lemmed

  bag_lemmed = vector_lemmed.fit_transform(df['clean_tweet_token_lemmed_str'])
  names_bag_lemmed = vector_lemmed.get_feature_names_out()

  bag_lemmed_df = pd.DataFrame(bag_lemmed.toarray(), columns=names_bag_lemmed)
  print_data("Bag-of-Words from data lemmed", bag_lemmed_df.head(count_print))
  
  return [vector_stemmed, vector_lemmed]

In [82]:
#@title Сбор мешков
def run_lessons_2(df):
  count_vector = create_bag(df, 'CountVectorized')
  tf_vector = create_bag(df, 'tf-idf')
  return [count_vector, tf_vector]

In [100]:
#@title Проверка векторов
def check_vector(vector, df, trains):
  vector.fit(df['text'])
  train_transform = vector.transform(trains['train_x'])
  valid_transform = vector.transform(trains['valid_x'])

  classifier = linear_model.LogisticRegression(max_iter=10000)
  classifier.fit(train_transform, trains['train_y'])
  predict = classifier.predict(valid_transform)
  return accuracy_score(trains['valid_y'], predict)

### Загрузка данных и выполнение

In [83]:
#@title Загружаем тренеровочные данные
train_df = load_df('gdrive/MyDrive/Colab Notebooks/nlp/Lessons 2 - Feature Space Creation/train_df.pkl')

In [86]:
#@title Загружаем тестовые данные
test_df = load_df('gdrive/MyDrive/Colab Notebooks/nlp/Lessons 2 - Feature Space Creation/test_df.pkl')

In [169]:
#@title Запускаем на тренеровочных данных
[ vectors['train']['count'], vectors['train']['tf'] ] = run_lessons_2(train_df)

Selected type bag: CountVectorized
----------------------------------------
Bag-of-Words from data stemmed
----------------------------------------
   abl  absolut  accept  account  act  ...  youth  youtub  yoyou  yoyoy  yr
0    0        0       0        0    0  ...      0       0      0      0   0
1    0        0       0        0    0  ...      0       0      0      0   0
2    0        0       0        0    0  ...      0       0      0      0   0
3    0        0       0        0    0  ...      0       0      0      0   0
4    0        0       0        0    0  ...      0       0      0      0   0

[5 rows x 1000 columns]
----------------------------------------
----------------------------------------
Bag-of-Words from data lemmed
----------------------------------------
   able  absolutely  accept  account  act  ...  youtube  yoyou  yoyoyou  yr  yrs
0     0           0       0        0    0  ...        0      0        0   0    0
1     0           0       0        0    0  ...        0 

In [170]:
#@title Запускаем на тестовых данных
[ vectors['test']['count'], vectors['test']['tf'] ] = run_lessons_2(test_df)

Selected type bag: CountVectorized
----------------------------------------
Bag-of-Words from data stemmed
----------------------------------------
   abl  absolut  abt  abus  accept  ...  young  youtub  yoyou  yoyoy  yr
0    0        0    0     0       0  ...      0       0      0      0   0
1    0        0    0     0       0  ...      0       0      0      0   0
2    0        0    0     0       0  ...      0       0      0      0   0
3    0        0    0     0       0  ...      0       0      0      0   0
4    0        0    0     0       0  ...      0       0      0      0   0

[5 rows x 1000 columns]
----------------------------------------
----------------------------------------
Bag-of-Words from data lemmed
----------------------------------------
   able  absolutely  abt  abuse  accept  ...  youtube  yoyou  yoyoyou  yr  yrs
0     0           0    0      0       0  ...        0      0        0   0    0
1     0           0    0      0       0  ...        0      0        0   0    0

### Проверка векторайзера

In [78]:
#@title Загружаем данные и создаем DataFrame
data = open('gdrive/MyDrive/Colab Notebooks/nlp/Lessons 2 - Feature Space Creation/corpus.txt').read()
labels, text = [], []
for idx, line in enumerate(data.split('\n')):
  cont = line.split()
  labels.append(cont[0])
  text.append(' '.join(cont[1:]))

corp_df = pd.DataFrame()
corp_df['text'] = text
corp_df['labels'] = labels
corp_df.head(10)

Unnamed: 0,text,labels
0,Stuning even for the non-gamer: This sound tra...,__label__2
1,The best soundtrack ever to anything.: I'm rea...,__label__2
2,Amazing!: This soundtrack is my favorite music...,__label__2
3,Excellent Soundtrack: I truly like this soundt...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After He...",__label__2
5,an absolute masterpiece: I am quite sure any o...,__label__2
6,"Buyer beware: This is a self-published book, a...",__label__1
7,Glorious story: I loved Whisper of the wicked ...,__label__2
8,A FIVE STAR BOOK: I just finished reading Whis...,__label__2
9,Whispers of the Wicked Saints: This was a easy...,__label__2


In [93]:
#@title Разбиваем данные и подчищаем label
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(corp_df['text'], corp_df['labels'])

encode = preprocessing.LabelEncoder()
train_y = encode.fit_transform(train_y)
valid_y = encode.fit_transform(valid_y)

In [171]:
#@title Используем уже ранее созданные вектор (CountVectorizer) на тренеровочных данных (stemmed)
score = check_vector(vectors['train']['count'][0], corp_df, {'train_x': train_x, 'valid_x': valid_x, 'train_y': train_y, 'valid_y': valid_y})
result_check['vector'].append(vectors['train']['count'][0])
result_check['score'].append(score)

In [172]:
#@title Используем уже ранее созданные вектор (CountVectorizer) на тренеровочных данных (lemmed)
score = check_vector(vectors['train']['count'][1], corp_df, {'train_x': train_x, 'valid_x': valid_x, 'train_y': train_y, 'valid_y': valid_y})
result_check['vector'].append(vectors['train']['count'][1])
result_check['score'].append(score)

In [173]:
#@title Используем уже ранее созданные вектор (TfidfVectorizer) на тестовых данных (stemmed)
score = check_vector(vectors['train']['tf'][0], corp_df, {'train_x': train_x, 'valid_x': valid_x, 'train_y': train_y, 'valid_y': valid_y})
result_check['vector'].append(vectors['train']['tf'][0])
result_check['score'].append(score)

In [174]:
#@title Используем уже ранее созданные вектор (TfidfVectorizer) на тестовых данных (lemmed)
score = check_vector(vectors['train']['tf'][1], corp_df, {'train_x': train_x, 'valid_x': valid_x, 'train_y': train_y, 'valid_y': valid_y})
result_check['vector'].append(vectors['train']['tf'][1])
result_check['score'].append(score)

In [175]:
#@title Используем уже ранее созданные вектор (CountVectorizer) на тестовых данных (stemmed)
score = check_vector(vectors['test']['count'][0], corp_df, {'train_x': train_x, 'valid_x': valid_x, 'train_y': train_y, 'valid_y': valid_y})
result_check['vector'].append(vectors['train']['count'][0])
result_check['score'].append(score)

In [176]:
#@title Используем уже ранее созданные вектор (CountVectorizer) на тестовых данных (lemmed)
score = check_vector(vectors['test']['count'][1], corp_df, {'train_x': train_x, 'valid_x': valid_x, 'train_y': train_y, 'valid_y': valid_y})
result_check['vector'].append(vectors['train']['count'][1])
result_check['score'].append(score)

In [177]:
#@title Используем уже ранее созданные вектор (TfidfVectorizer) на тестовых данных (stemmed)
score = check_vector(vectors['test']['tf'][0], corp_df, {'train_x': train_x, 'valid_x': valid_x, 'train_y': train_y, 'valid_y': valid_y})
result_check['vector'].append(vectors['train']['tf'][0])
result_check['score'].append(score)

In [178]:
#@title Используем уже ранее созданные вектор (TfidfVectorizer) на тестовых данных (lemmed)
score = check_vector(vectors['test']['tf'][1], corp_df, {'train_x': train_x, 'valid_x': valid_x, 'train_y': train_y, 'valid_y': valid_y})
result_check['vector'].append(vectors['train']['tf'][1])
result_check['score'].append(score)

In [179]:
#@title Создадим вектор с другими параметрами CountVectorizer
vector_corp_count = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
score = check_vector(vector_corp_count, corp_df, {'train_x': train_x, 'valid_x': valid_x, 'train_y': train_y, 'valid_y': valid_y})
result_check['vector'].append(vector_corp_count)
result_check['score'].append(score)

In [180]:
#@title Создадим вектор с другими параметрами TfidfVectorizer
vector_corp_tf = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
score = check_vector(vector_corp_tf, corp_df, {'train_x': train_x, 'valid_x': valid_x, 'train_y': train_y, 'valid_y': valid_y})
result_check['vector'].append(vector_corp_tf)
result_check['score'].append(score)

In [183]:
#@title Создадим вектор с другими параметрами CountVectorizer
vector_corp_count_mod = CountVectorizer(max_df=0.9, 
                                      max_features = 1000, 
                                      stop_words='english', 
                                      analyzer='word', 
                                      token_pattern=r'\w{1,}')
score = check_vector(vector_corp_count_mod, corp_df, {'train_x': train_x, 'valid_x': valid_x, 'train_y': train_y, 'valid_y': valid_y})
result_check['vector'].append(vector_corp_count_mod)
result_check['score'].append(score)

In [184]:
#@title Создадим вектор с другими параметрами TfidfVectorizer 
vector_corp_tf_mod = TfidfVectorizer(max_df=0.9, 
                                      max_features = 1000, 
                                      stop_words='english', 
                                      analyzer='word', 
                                      token_pattern=r'\w{1,}')
score = check_vector(vector_corp_tf_mod, corp_df, {'train_x': train_x, 'valid_x': valid_x, 'train_y': train_y, 'valid_y': valid_y})
result_check['vector'].append(vector_corp_tf_mod)
result_check['score'].append(score)

In [185]:
check_df = pd.DataFrame(data=result_check)
check_df.set_index('vector', inplace=False)
check_df

Unnamed: 0,vector,score
0,"CountVectorizer(max_df=0.9, max_features=1000,...",0.875613
1,"CountVectorizer(max_df=0.9, max_features=1000,...",0.875613
2,"TfidfVectorizer(max_df=0.9, max_features=1000,...",0.874858
3,"TfidfVectorizer(max_df=0.9, max_features=1000,...",0.874858
4,"CountVectorizer(max_df=0.9, max_features=1000,...",0.875613
5,"CountVectorizer(max_df=0.9, max_features=1000,...",0.875613
6,"TfidfVectorizer(max_df=0.9, max_features=1000,...",0.874858
7,"TfidfVectorizer(max_df=0.9, max_features=1000,...",0.874858
8,"CountVectorizer(token_pattern='\\w{1,}')",0.97886
9,"TfidfVectorizer(token_pattern='\\w{1,}')",0.939977


#### Вывод проверки: 

*   Лучшие значения у вектора TfidfVectorizer как у stemmed, так и у lemmed. Так же данные одинаковые как у тренеровочной сети, так и у тестовой сети. 
*   Худшие рещультаты у сетей, где использовался только один параметр token_pattern.



### PCA

In [189]:
pca_df = pd.DataFrame()

for vector in [vector_corp_count, vector_corp_tf]:
  for n in [100, 300, 500, 1000, 3000, 5000]:
    svd = TruncatedSVD(n_components=n)

    train_transform = vector.transform(train_x)
    valid_transform = vector.transform(valid_x)

      train_transform_pca = svd.fit_transform(train_transform)
    valid_transform_pca = svd.transform(valid_transform)

    classifier = linear_model.LogisticRegression(max_iter=10000)
    classifier.fit(train_transform_pca, train_y)

    predict = classifier.predict(valid_transform_pca)

    pca_df = pca_df.append(
        pd.DataFrame([[str(n), accuracy_score(valid_y, predict)]], columns=['# Components', 'Score'])
    )

In [190]:
pca_df.sort_values('Score', ascending=False, inplace=True)
pca_df.head(10)

Unnamed: 0,# Components,Score
0,5000,0.97735
0,3000,0.970555
0,5000,0.936391
0,3000,0.924689
0,1000,0.912609
0,1000,0.89883
0,500,0.886938
0,500,0.880143
0,300,0.878256
0,300,0.864288
