## Модуль В. Обработка и анализ текстовых данных (инвариант)

#### Сформировать структуру набора данных.  Провести предварительную обработку данных. Произвести кластеризацию и дать описание кластерам. Произвести классификацию на основе: результатов кластеризации или в соответствии с бизнес-задачей.

In [None]:
import pandas as pd
import re
import string
import nltk
import pymorphy2

In [61]:
def remove_punctuation(text):
    return ''.join([i for i in text if i not in string.punctuation])

In [60]:
def remove_num(text):
    return ''.join([i if not i.isdigit() else '' for i in text])

In [None]:
def tokenize(text):
    txt = word_tokenize(text)
    tokenize_text = ''.join([i for i in txt if i not in all_stopword])
    return tokenize_text

df['token'] = df['token'].apply(tokenize)

In [35]:
import pandas as pd
import string
import re
import nltk
import pymorphy2

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [36]:
df = pd.read_csv('IMDB-Dataset.csv')
df = df.iloc[:10000]

In [37]:
df.shape

(10000, 2)

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [39]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


Перевод строковых данных в нижний регистр

In [40]:
df['review'] = df['review'].str.lower()
df['sentiment'] = df['sentiment'].str.lower()

In [41]:
# df['rating'] = df['rating'].fillna(0)
# df['city'] = df['city'].fillna('Не указано')

In [42]:
# df.to_excel('nlp_justpractic_1.xlsx')

Переименовали названия столбцов для удобства

In [43]:
df.rename(columns={'review': 'текст_отзыва', 'sentiment': 'тон'}, inplace=True)

In [44]:
df

Unnamed: 0,текст_отзыва,тон
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
9995,"fun, entertaining movie about wwii german spy ...",positive
9996,give me a break. how can anyone say that this ...,negative
9997,this movie is a bad movie. but after watching ...,negative
9998,this is a movie that was probably made to ente...,negative


In [45]:
df.isnull().sum()

текст_отзыва    0
тон             0
dtype: int64

## Обработка естественного языка.


In [46]:
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

Удаление пунктуации

In [54]:
def remove_punctuaction(text):
    return ''.join([i for i in text if i not in string.punctuation])

In [52]:
df['prep_text'] = [remove_punctuation(text.lower()) for text in df['текст_отзыва']]

Очистка от специальных символов и лишних пробелов

In [55]:
def remove_numbers(text): 
    return ''.join([i if not i.isdigit() else ' ' for i in text])

def remove_multiple_spaces(text): 
    return re.sub(r'\s+', ' ', text, flags=re.I)

st = '><\xa0—«»/&%^()$#@!~№;:?*_-+=|.,)""'
def remove_othersymbol(text):
    return ''.join([ch if ch not in st else ' ' for ch in text])

In [74]:
df['prep_text'] = [remove_numbers(text.lower()) for text in df['prep_text']]

In [75]:
df['prep_text'] = [remove_othersymbol(text.lower()) for text in df['prep_text']]

Удаление стоп-слов

In [76]:
nltk.download('punkt')
nltk.download('word_tokenize')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Регина\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading word_tokenize: Package 'word_tokenize' not
[nltk_data]     found in index
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Регина\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [77]:
stopword_ru = stopwords.words('russian')
stopword_eng = stopwords.words('english')

all_stopword = stopword_ru + stopword_eng

In [78]:
all_stopword.append("br")

Токенизация

In [79]:
# сегментация, разделение предлож на слова компоненты
def tokenize(text):
    t = word_tokenize(text)
    tokens = [token for token in t if token not in all_stopword]
    tokenize_text = ' '.join(tokens)
    return tokenize_text

In [80]:
df['text_tokenize'] = [tokenize(text.lower()) for text in df['prep_text']]

Лемматизация (приводим к именительному падежу и ед числу)

In [81]:
from natasha import MorphVocab, Doc, Segmenter

morph_vocab = MorphVocab()

def lemmatize(word_list):
    doc = Doc(word_list)
    segmenter = Segmenter()
    doc.segment(segmenter)
    lemmatized_text = ' '.join([morph_vocab.parse(token.text)[0].normal for token in doc.tokens])
    return lemmatized_text

In [82]:
df['lemm'] = df['text_tokenize'].apply(lemmatize)

In [83]:
df

Unnamed: 0,текст_отзыва,тон,prep_text,text_tokenize,lemm
0,one of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,one reviewers mentioned watching oz episode yo...,one reviewers mentioned watching oz episode yo...
1,a wonderful little production. <br /><br />the...,positive,a wonderful little production br br the filmin...,wonderful little production filming technique ...,wonderful little production filming technique ...
2,i thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...,thought wonderful way spend time hot summer we...
3,basically there's a family where a little boy ...,negative,basically theres a family where a little boy j...,basically theres family little boy jake thinks...,basically theres family little boy jake thinks...
4,"petter mattei's ""love in the time of money"" is...",positive,petter matteis love in the time of money is a ...,petter matteis love time money visually stunni...,petter matteis love time money visually stunni...
...,...,...,...,...,...
9995,"fun, entertaining movie about wwii german spy ...",positive,fun entertaining movie about wwii german spy j...,fun entertaining movie wwii german spy julie a...,fun entertaining movie wwii german spy julie a...
9996,give me a break. how can anyone say that this ...,negative,give me a break how can anyone say that this i...,give break anyone say good hockey movie know m...,give break anyone say good hockey movie know m...
9997,this movie is a bad movie. but after watching ...,negative,this movie is a bad movie but after watching a...,movie bad movie watching endless series bad ho...,movie bad movie watching endless series bad ho...
9998,this is a movie that was probably made to ente...,negative,this is a movie that was probably made to ente...,movie probably made entertain middle school ea...,movie probably made entertain middle school ea...


Векторизация текстовых данных

In [84]:
# Подключение библиотек.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [85]:
vectorizer = CountVectorizer(analyzer='word', stop_words=stopword_eng, ngram_range=(1, 3), min_df=2)
count_matrix = vectorizer.fit_transform(df['lemm'])

In [86]:
count_matrix.shape

(10000, 158618)

Количество строк в count_matrix не соотвествует количеству строк в df.

In [87]:
vectorizer.get_feature_names_out()

array(['aag', 'aaliyah', 'aaliyahs', ..., 'élan', 'élan unique',
       'élan unique personal'], dtype=object)

## Кластеризация.

In [88]:
# Подключение библиотек.
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn import metrics
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Для лучшего нахождения кластеров выполним скалирование данных.

In [89]:
# Создание DataFrame с данными tfidf_matrix.
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(count_matrix.toarray(), columns=feature_names)

# Скалирование данных.
scaler = StandardScaler(with_mean=False)
scaler.fit(df)
X_scale = scaler.transform(df)
scaled_data = pd.DataFrame(X_scale, columns=df.columns)

# Уменьшение размерности DataFrame до 2-х измерений
pca = PCA(n_components=2)
pca.fit(scaled_data)

x_pca = pca.transform(scaled_data)
np.shape(x_pca)

MemoryError: Unable to allocate 11.8 GiB for an array with shape (10000, 158618) and data type int64

In [66]:
wgcc = []
for i in range(1,11):
    kmean = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmean.fit(x_pca)
    wgcc.append(kmean.inertia_)
plt.plot(range(1,11), wgcc)

NameError: name 'x_pca' is not defined

In [54]:
kmeans = KMeans(n_clusters=3)
kmeans.fit_predict(tfidf_matrix)

  super()._check_params_vs_input(X, default_n_init=10)


array([2, 2, 2, ..., 1, 1, 2])

In [63]:
from sklearn import metrics

kmeans.fit(X)
for num_clusters in np.arange(2, 10):

    kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
    kmeans.fit(X)
    score = metrics.silhouette_score(X, kmeans.labels_, 
                metric='euclidean', sample_size=len(X))

    print("\nNumber of clusters =", num_clusters)
    print("Silhouette score =", score)
                    
    scores.append(score)

NameError: name 'kmeans' is not defined

In [50]:
idx = kmeans.fit_predict(tfidf_matrix)
clusters_kmeans = kmeans.labels_.tolist()

In [51]:
out = { 'skills': df['тон'], 'cluster': clusters_kmeans }
frame_kmeans = pd.DataFrame(out, columns = ['skills', 'cluster'])

In [52]:
frame_kmeans['cluster'].value_counts()

cluster
0    4477
1    2924
2    2599
Name: count, dtype: int64

In [None]:
# Код для нахождения 10 распространенных слов для каждого кластера, чтобы потом дать им названия.

top_words = 10
cluster_centers_kmeans = kmeans.cluster_centers_
cluster_keywords_kmeans = {}

for i in range(len(cluster_centers_kmeans)):
    top_words_idx = cluster_centers_kmeans[i].argsort()[:-top_words-1:-1]
    keywords = [feature_names[idx] for idx in top_words_idx]
    cluster_keywords_kmeans[f'Кластер {i}'] = keywords

for cluster, keywords in cluster_keywords_kmeans.items():
    print(f'{cluster}: {", ".join(keywords)}')

In [None]:
# Исходная TF-IDF матрица.
kmeans_clusters = kmeans.fit_predict(tfidf_matrix)
kmeans_score = silhouette_score(tfidf_matrix, kmeans_clusters)
print(f'Силуэт для Kmeans: {kmeans_score}')

print()

# TF-IDF матрица, в которой было произведено скалирование данных и уменьшение размерности до 2-х измерений.
kmeans_ = KMeans(n_clusters=5)
kmeans_clusters_ = kmeans.fit_predict(x_pca)
kmeans_score_ = silhouette_score(x_pca, kmeans_clusters_)
print(f'Силуэт для Kmeans: {kmeans_score_}')

In [None]:
df['cluster'] = clusters_kmeans

In [None]:
df

In [None]:
df['prep_text'][0]