# Чтение и грубая очистка данных

In [127]:
import pandas as pd
import numpy as np
import string, re
from collections import Counter

from src.data.read_raw_data import read_from_gsheet, drop_unwanted_data, split_types

In [128]:
# Read the data from the Google Sheet, drop the columns we are not interested in,
# then split the rows by task type.
raw_data = read_from_gsheet()
data = drop_unwanted_data(raw_data)
# Only the second part of the split (the e-mail tasks) is used further on; the first is discarded.
_, email_data = split_types(data)

  raw_data = read_from_gsheet()


In [129]:
 # From here on we work with the e-mail data only
email_data.head()

Unnamed: 0,Type,Question id,Question,Text,Solving a communicative task,Text structure,Use of English (for emails)
2,Email,,…I am so happy that summer has come and we are...,"Dear Ronny, I was glad to hear from you again....",1.0,2.0,2.0
3,Email,,…I am learning to cook from my mother now. But...,"Dear Mary, Thanks for your message. It was gre...",1.0,2.0,2.0
4,Email,,…I am so happy that summer has come and we are...,Moscow\n15 october\nHi!\nThanks you for you re...,1.0,0.0,0.0
5,Email,,…I’ve recently been involved in a school surve...,"Hey, Mike.\r\nHow's it going? As for me, i'm p...",0.0,0.0,0.0
6,Email,,…All of my friends think camping is a perfect ...,"Hi Emily,\r\nThank you for the e-mail. I'm so ...",2.0,1.0,0.0


In [130]:
# Split the columns into target columns, purely informational columns and feature columns.
target = ['Solving a communicative task', 'Text structure', 'Use of English (for emails)']
# The frame's column is 'Type' (see the head() output); the previous ' Type' with a leading
# space matched nothing, so 'Type' was silently left among the features.
info_columns = ['Type', 'Question id']
# Everything that is neither a target nor an informational column is treated as a feature.
features = email_data.columns.difference(target).difference(info_columns)

In [131]:
# Drop the rows whose answer is missing or blank.
# NaN must be filled first: `.str.strip()` propagates NaN and `astype(bool)` maps NaN to True,
# so rows without any answer would otherwise pass the filter.
has_answer = email_data['Text'].fillna('').str.strip().astype(bool)
# `.copy()` yields an independent frame, so the in-place edits in the following cells
# do not operate on a slice of `data`.
email_data = email_data[has_answer].copy()

In [132]:
# Unfortunately, for now we also have to drop the rows whose task is only briefly described
# in Russian or is missing altogether.
# `na=True` flags a missing (NaN) question for removal too; without it the NaN entries in the
# mask make the boolean indexing below raise a ValueError.
russian_or_missing = email_data['Question'].str.contains('[А-Яа-я]+', na=True)
email_data.drop(email_data[russian_or_missing].index, inplace=True)

# Очистка текста

In [133]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources needed by the imports above (a no-op when they are already
# up to date, as the cell output shows): 'punkt' is the tokenizer model behind
# word_tokenize, 'stopwords' is the corpus behind nltk.corpus.stopwords.
nltk.download('punkt')
nltk.download('stopwords')

import contractions

from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\max_a\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\max_a\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [134]:
# Characters replaced by a space first: full stops, line breaks and tabs.
_BREAKS_RE = re.compile(r'[.\n\r\t]')
# Punctuation to strip. The hyphen is escaped on purpose: left bare between `~` and `’` it
# formed the range U+007E..U+2019, which also wiped every accented and Cyrillic letter.
# The typographic quotes and dashes that this accidental range used to catch are listed explicitly.
_PUNCT_RE = re.compile(r'[\]\[!"$%&\'()*+,./:;=#@?\\^_`{|}~‘’–—\-]+')


def _clean_text(text):
    """Return *text* lower-cased, with contractions expanded and punctuation replaced by spaces."""
    # Lower-case first, then expand contractions, e.g. "i'll" -> "i will".
    text = contractions.fix(text.lower())
    text = _BREAKS_RE.sub(' ', text)
    return _PUNCT_RE.sub(' ', text)


# One pass over the column instead of four separate ones.
email_data['Text'] = email_data['Text'].apply(_clean_text)

In [136]:
# Add a column holding the token list of every text (each text is tokenized on its own,
# which seemed more correct than tokenizing everything at once).
email_data['Tokenized'] = email_data['Text'].map(word_tokenize)

In [None]:
# Возможно, НЕ стоит очищать текст от stopwords, так как при этом, например, мы теряем конструкции со вспомогательным глаголом
# filtered_tokens = [word for word in tokens if not word in stopwords.words('english')]

In [138]:
# Count the most frequent word trigrams over all e-mails.
# The n-grams are built per e-mail: building them over the concatenated token list also
# produced trigrams that straddle the end of one e-mail and the start of the next.
# NOTE: `bigrams` and `cnt_bi` keep their historical names although n = 3.
tokens = [token for tokens_list in email_data['Tokenized'] for token in tokens_list]
bigrams = (
    gram
    for tokens_list in email_data['Tokenized']
    for gram in ngrams(tokens_list, 3)
)
cnt_bi = Counter(bigrams)

print(*cnt_bi.most_common(200))

(('by', 'the', 'way'), 300) (('you', 'asked', 'me'), 271) (('asked', 'me', 'about'), 255) (('the', 'way', 'tell'), 240) (('way', 'tell', 'me'), 239) (('tell', 'me', 'more'), 228) (('me', 'more', 'about'), 228) (('for', 'your', 'recent'), 225) (('from', 'you', 'in'), 221) (('you', 'in', 'your'), 218) (('i', 'am', 'always'), 206) (('messages', 'from', 'you'), 192) (('more', 'about', 'your'), 190) (('get', 'messages', 'from'), 186) (('to', 'get', 'messages'), 185) (('in', 'your', 'email'), 181) (('thanks', 'for', 'your'), 179) (('your', 'email', 'you'), 176) (('drop', 'me', 'a'), 173) (('me', 'a', 'line'), 173) (('all', 'for', 'now'), 170) (('that', 'is', 'all'), 162) (('is', 'all', 'for'), 160) (('your', 'recent', 'email'), 157) (('email', 'you', 'asked'), 156) (('soon', 'best', 'wishes'), 146) (('now', 'drop', 'me'), 144) (('email', 'i', 'am'), 142) (('a', 'line', 'best'), 137) (('line', 'best', 'wishes'), 137) (('recent', 'email', 'i'), 136) (('thank', 'you', 'for'), 135) (('you', 'for