In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk import pos_tag
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import defaultdict
import re
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Импорт данных перед предварительной обработкой

In [3]:
df = pd.read_csv('/content/drive/MyDrive/NLP/Handin/Data/df_reviews_final_binary.csv')

In [4]:
df.shape

(239031, 13)

In [5]:
df['review_comments'].isna().any()

False

In [6]:
df['review_rating'].value_counts()

review_rating
positive    237589
negative      1442
Name: count, dtype: int64

In [7]:
data = df['review_comments']

### Предварительная обработка данных

In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
tokenized_texts = [word_tokenize(text) for text in tqdm(data)]

100%|██████████| 239031/239031 [01:07<00:00, 3565.46it/s]


In [10]:
tokenized_texts[0]

['everything', 'was', 'perfect', 'nice', 'and', 'clean']

##### Лемматизация слов

In [11]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [12]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [13]:
lemmatizer = WordNetLemmatizer()

In [14]:
lemmatized_texts = []
for texts in tqdm(tokenized_texts):
    tokens = []
    for token in texts:
        tag = pos_tag([token])[0][1][0].lower()
        if tag == 'j':
            tag = 'a'
        if tag in ('n','v','r','a'):
            tokens.append(lemmatizer.lemmatize(token, tag))
        else:
            tokens.append(token)
    lemmatized_texts.append(tokens)

100%|██████████| 239031/239031 [24:19<00:00, 163.72it/s]


In [15]:
lemmatized_texts[0]

['everything', 'be', 'perfect', 'nice', 'and', 'clean']

##### Стемминг

In [16]:
stemmer = PorterStemmer()

In [17]:
stemmed_texts = [[stemmer.stem(token) for token in token_list] for token_list in tqdm(lemmatized_texts)]

100%|██████████| 239031/239031 [03:24<00:00, 1166.45it/s]


##### Удаление стоп-слов

In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
stopwords = stopwords.words('english')

In [20]:
print(stopwords[:5])

['i', 'me', 'my', 'myself', 'we']


In [21]:
negative_words = ['not','no','can','don','won','shouldn','couldn','wouldn','doesn','isn','aren','wasn','weren','but','nor']

In [22]:
final_tokens = [[token for token in token_list if (token not in stopwords or token in negative_words)] for token_list in tqdm(stemmed_texts)]

100%|██████████| 239031/239031 [00:30<00:00, 7908.46it/s] 


In [23]:
final_tokens[0]

['everyth', 'perfect', 'nice', 'clean']

In [24]:
word_counts = defaultdict(int)
for tokens in final_tokens:
    for token in tokens:
        word_counts[token] += 1

In [25]:
len(word_counts.keys())

28478

In [26]:
def remove_infrequent_words(tokenized_documents, min_frequency=10):
    word_counts = defaultdict(int)
    for tokens in tokenized_documents:
        for token in tokens:
            word_counts[token] += 1
    frequent_words = set()
    for word, count in word_counts.items():
        if count >= min_frequency:
            frequent_words.add(word)
    filtered_documents = []
    for tokens in tokenized_documents:
        filtered_tokens = [token for token in tokens if token in frequent_words]
        filtered_documents.append(filtered_tokens)
    return filtered_documents

In [27]:
filtered = remove_infrequent_words(final_tokens, 10)

In [28]:
filtered[0]

['everyth', 'perfect', 'nice', 'clean']

In [29]:
word_counts = defaultdict(int)
for tokens in filtered:
    for token in tokens:
        word_counts[token] += 1

In [30]:
len(word_counts.keys())

6682

##### Извлекаем нграммы

In [31]:
def extract_ngrams(tokens, n=1):
    return ['_'.join(gram) for gram in ngrams(tokens, n)]
ngrams_list = []
for text in tqdm(filtered):
    text_ngrams = []
    text_ngrams += extract_ngrams(text, n=1)
    text_ngrams += extract_ngrams(text, n=2)
    text_ngrams += extract_ngrams(text, n=3)
    ngrams_list.append(text_ngrams)

100%|██████████| 239031/239031 [00:10<00:00, 23698.11it/s]


In [32]:
ngrams_list[0]

['everyth',
 'perfect',
 'nice',
 'clean',
 'everyth_perfect',
 'perfect_nice',
 'nice_clean',
 'everyth_perfect_nice',
 'perfect_nice_clean']

In [33]:
word_counts = defaultdict(int)
for tokens in ngrams_list:
    for token in tokens:
        word_counts[token] += 1

In [34]:
len(word_counts.keys())

3638085

In [35]:
final = remove_infrequent_words(ngrams_list, 10)

In [36]:
final[0]

['everyth',
 'perfect',
 'nice',
 'clean',
 'everyth_perfect',
 'perfect_nice',
 'nice_clean',
 'everyth_perfect_nice']

In [37]:
word_counts = defaultdict(int)
for tokens in final:
    for token in tokens:
        word_counts[token] += 1

In [38]:
len(word_counts.keys())

117479

In [39]:
df = pd.DataFrame(
    {'tokens': final,
     'sentiment': df['review_rating']
    })

In [40]:
df.head()

Unnamed: 0,tokens,sentiment
0,"[everyth, perfect, nice, clean, everyth_perfec...",positive
1,"[appart, beautiful, veri, friendli, help, us, ...",positive
2,"[spent, excel, night, thi, apart, give, us, lo...",positive
3,"[great, host, super, respons, make, stay, veri...",positive
4,"[properti, great, locat, base, look, see, citi...",positive


### Сохранение результатов

In [44]:
df.to_csv('/content/drive/MyDrive/NLP/Handin/Data/df_tokenized.csv', index=False)