# Дополнительная постобработка. Удаление стоп-слов и знаков препинания. Лемматизация

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import pandas as pd
import nltk
import string

# Fetch the NLTK resources this notebook relies on:
#   wordnet   - data behind WordNetLemmatizer
#   stopwords - the English stop-word list
#   punkt     - tokenizer models loaded by word_tokenize on older NLTK releases
#   punkt_tab - tokenizer tables loaded by word_tokenize on NLTK 3.9 and later
# Downloading both tokenizer resources keeps the notebook runnable on either
# side of that change; packages that are already present are skipped by NLTK.
for resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Михаил\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Михаил\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Михаил\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the encoded issue dataset
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks
# like an index saved by an earlier step - confirm whether it is needed.
source_csv = "data/preprocessed/issue_data_encoded_small.csv"
initial_data = pd.read_csv(source_csv)
initial_data.head()

Unnamed: 0,labels,text
0,3,"Encoding issue. Great project, I am testing ou..."
1,2,Update with feedback from Connor. All the comm...
2,2,ProductInventory. Description Describe what d...
3,1,Docs for `IConstructorSelector` optimization. ...
4,3,New user options table does not support (error...


In [3]:
# Lemmatizer and English stop-word set used by the preprocessing function
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words("english")}

In [4]:
# Number of distinct English stop words loaded
len(stop_words)

179

In [5]:
# Extra text preprocessing: lower-casing, punctuation and stop-word removal, lemmatization

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call of preprocess_extra.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_extra(text):
    """Normalize a single text for downstream modelling.

    The text is lower-cased, stripped of ASCII punctuation, tokenized,
    filtered against the module-level ``stop_words`` set and lemmatized
    with the module-level ``lemmatizer``.

    Punctuation is deleted rather than replaced by a space, so the parts of
    a token such as "user-name" are joined into "username".

    Args:
        text: The raw text. Any non-string value (for example the NaN that
            pandas uses for an empty CSV cell) is returned unchanged so that
            a later ``dropna()`` can remove it, instead of failing on
            ``.lower()``.

    Returns:
        The remaining lemmas joined by single spaces; an empty string if
        nothing is left after filtering. Non-string input is returned as is.

    Example:
        preprocess_extra("This is Testing case!") returns 'test case'.
    """
    # Missing values are passed through untouched.
    if not isinstance(text, str):
        return text

    # Lower-case, then delete punctuation
    res = text.lower().translate(_PUNCTUATION_TABLE)

    # Stop-word removal
    word_tokens = word_tokenize(res)
    filtered_text = [word for word in word_tokens if word not in stop_words]

    # Lemmatization; every token is lemmatized as a verb (pos='v')
    lemmas = [lemmatizer.lemmatize(word, pos='v') for word in filtered_text]

    return " ".join(lemmas)

In [6]:
# Example of the function in action
preprocess_extra("This is Testing case!")

'test case'

In [7]:
# Run the preprocessing over the text column; the result is a new frame and
# initial_data itself is left unmodified
cleaned_text = initial_data['text'].apply(preprocess_extra)
data_preprocessed_extra = initial_data.assign(text=cleaned_text)

In [8]:
# Drop incomplete rows. A text consisting only of stop words and punctuation
# is reduced to "" by preprocess_extra; such a row passes dropna() but is
# written to CSV as an empty field, which pd.read_csv turns into NaN by
# default. Those rows are therefore removed here as well.
has_text = data_preprocessed_extra['text'].ne("")
data_preprocessed_extra = data_preprocessed_extra[has_text].dropna()

In [9]:
# First rows of the dataset after the preprocessing function has been applied
data_preprocessed_extra.head()

Unnamed: 0,labels,text
0,3,encode issue great project test whether could ...
1,2,update feedback connor comment connor need upd...
2,2,productinventory description describe document...
3,1,docs iconstructorselector optimization problem...
4,3,new user options table support errors append r...


In [10]:
# Write the processed dataset to disk (comma-separated, index column omitted)
output_csv = "data/preprocessed/issue_data_encoded_small_extra_prep.csv"
data_preprocessed_extra.to_csv(output_csv, index=False)