<a href="https://colab.research.google.com/github/NikolayKuraga/ML_lab_2/blob/master/2_nn_text_NEW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
!pip install sentence-transformers -q

In [40]:
!echo 'WOW! I can execute shell commands from Jupyter Notebook!'
!echo 'Interesting, what shell is used under Windows...'
!echo "Current shell is: \"${SHELL}\"."
!pip install fast_langdetect iso-639 nltk pandas >/dev/null


import enum
import string
import typing as t


import fast_langdetect
import iso639
import nltk.corpus
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import gensim
from sentence_transformers import SentenceTransformer

# Fetch the NLTK stop-word lists; the cleaning step reads them via nltk.corpus.stopwords.
nltk.download('stopwords')
print('Done')

WOW! I can execute shell commands from Jupyter Notebook!
Interesting, what shell is used under Windows...
Current shell is: "/bin/bash".
Done


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
# Both splits are read from the same folder of the lab's GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/NikolayKuraga/ML_lab_2/refs/heads/master'

df_trn = pd.read_csv(f'{DATA_URL}/train.csv')
df_tst = pd.read_csv(f'{DATA_URL}/test.csv')

print('Head of train dataset:')
print(df_trn.head())
print()
print('Info of train dataset')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df_trn.info()

Head of train dataset:
   Class Index                                              Title  \
0            3  Wall St. Bears Claw Back Into the Black (Reuters)   
1            3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2            3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3            3  Iraq Halts Oil Exports from Main Southern Pipe...   
4            3  Oil prices soar to all-time record, posing new...   

                                         Description  
0  Reuters - Short-sellers, Wall Street's dwindli...  
1  Reuters - Private investment firm Carlyle Grou...  
2  Reuters - Soaring crude prices plus worries\ab...  
3  Reuters - Authorities have halted oil export\f...  
4  AFP - Tearaway world oil prices, toppling reco...  

Info of train dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Class Inde

### 1. Try some text cleaning techniques from the list below (at least 2). Did any of them improve your model quality? Try to explain why.

In [42]:
# 1. Try some text cleaning techniques from the list below (at least 2). Did any of them
# improve your model quality? Try to explain why.
#     a. stop words removing
#     b. punctuation removing
#     c. Trash (Extra spaces / special symbols (like @#~<> etc)) removing
#     d. digits removing


# First of all, I tried to detect which languages are used.
# Found two ways:
#     1) langdetect -- "traditional" module, really slow (several seconds for 1000 entries).
#     2) fast_langdetect -- fast "zoomer" module based on a machine-learning language model.
# I will use fast_langdetect 'cause I'm a zoomer.

def detectLanguage(text: str) -> str:
    """Return the detected language of `text` as a lower-case name without spaces.

    The code reported by fast_langdetect is lower-cased and looked up as an
    ISO 639 alpha-2 code; the whitespace inside the resulting language name is
    removed (e.g. "Western Frisian" becomes "westernfrisian").
    """
    alpha2 = fast_langdetect.detect_language(text).lower()
    language = iso639.languages.get(alpha2=alpha2)
    return ''.join(language.name.split()).lower()


# Detect the language of every article from its title and description joined by a space.
for _frame in (df_trn, df_tst):
    _full_text = _frame['Title'] + ' ' + _frame['Description']
    _frame['Lang'] = _full_text.apply(detectLanguage)

# Number of articles per detected language, train split first, then test split.
print(
    'Train dataset',
    df_trn['Lang'].value_counts(),
    '',
    'Test dataset',
    df_tst['Lang'].value_counts(),
    sep='\n',
)

Train dataset
Lang
english           119944
french                21
spanish               10
polish                 9
german                 7
italian                5
westernfrisian         3
ukrainian              1
Name: count, dtype: int64

Test dataset
Lang
english    7596
polish        3
french        1
Name: count, dtype: int64


In [43]:
# Collect the NLTK stop words of every language detected in the train set.
stopwords = set()

# Ask the corpus which languages it ships instead of wrapping the lookup in a
# bare `except:`. A bare except also hides unrelated failures (for example a
# corpus that was never downloaded) behind the "no language" message.
available_stopword_langs = set(nltk.corpus.stopwords.fileids())

for lang in df_trn['Lang'].unique():
    if lang in available_stopword_langs:
        stopwords.update(nltk.corpus.stopwords.words(lang))
    else:
        print(f'no "{lang}" language in stopwords database?')

print()
print(f'Got so many ({len(stopwords)}) stopwords from different languages!')

def cleanTextFromStopwords8Punc(txt: str, stop_words: t.Optional[t.Collection[str]] = None) -> str:
    """Strip ASCII punctuation from `txt`, lower-case it and drop stop words.

    Args:
        txt: Raw text.
        stop_words: Words to drop; they are compared against the lower-cased
            words of `txt`. Defaults to the notebook-level `stopwords` set, so
            existing one-argument calls behave as before.

    Returns:
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords
    # A translation table removes all punctuation in one pass instead of a
    # per-character Python-level membership test.
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(wrd for wrd in txt.lower().split() if wrd not in stop_words)

def cleanTextFromStopwords8Punc8Dig(txt: str, stop_words: t.Optional[t.Collection[str]] = None) -> str:
    """Strip digits and ASCII punctuation from `txt`, lower-case it and drop stop words.

    Args:
        txt: Raw text.
        stop_words: Words to drop; they are compared against the lower-cased
            words of `txt`. Defaults to the notebook-level `stopwords` set, so
            existing one-argument calls behave as before.

    Returns:
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords
    # str.isdigit() also matches non-ASCII digit characters, so keep it rather
    # than a fixed "0123456789" table.
    txt = ''.join(sym for sym in txt if not sym.isdigit())
    # A translation table removes all punctuation in one pass.
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(wrd for wrd in txt.lower().split() if wrd not in stop_words)

# Add the cleaned variants of both text columns to each split:
#   "<column> no stopwords8punc"     -- stop words and punctuation removed
#   "<column> no stopwords8punc8dig" -- digits removed as well
for _frame in (df_trn, df_tst):
    for _source in ('Title', 'Description'):
        _frame[f'{_source} no stopwords8punc'] = _frame[_source].apply(cleanTextFromStopwords8Punc)
        _frame[f'{_source} no stopwords8punc8dig'] = _frame[_source].apply(cleanTextFromStopwords8Punc8Dig)

# Show a few raw training titles next to their two cleaned versions.
for _column in ('Title', 'Title no stopwords8punc', 'Title no stopwords8punc8dig'):
    print()
    print(df_trn[_column].iloc[35:40])

no "polish" language in stopwords database?
no "ukrainian" language in stopwords database?
no "westernfrisian" language in stopwords database?

Got so many (1089) stopwords from different languages!

35                                  Steady as they go
36         Google IPO: Type in 'confusing,' 'secrecy'
37                        A bargain hunter's paradise
38     Researchers seek to untangle the e-mail thread
39    Microsoft Corp. 2.0: a kinder corporate culture
Name: Title, dtype: object

35                                     steady go
36             google ipo type confusing secrecy
37                      bargain hunters paradise
38        researchers seek untangle email thread
39    microsoft corp 20 kinder corporate culture
Name: Title no stopwords8punc, dtype: object

35                                  steady go
36          google ipo type confusing secrecy
37                   bargain hunters paradise
38     researchers seek untangle email thread
39    microsoft corp kinder

### 2. Try to apply stemming and lemmatization.

In [44]:
# Model inputs are the fully cleaned title and description columns
# (stop words, punctuation and digits removed); the target is the class index.
_feature_columns = ['Title no stopwords8punc8dig', 'Description no stopwords8punc8dig']

X_train = df_trn[_feature_columns]
y_train = df_trn['Class Index']

X_test = df_tst[_feature_columns]

In [45]:
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
def lemmatzation(lst):
    """Lemmatize the words of one text.

    Args:
        lst: Either an iterable of tokens or a whole text. A text is split on
            whitespace first, so that individual words (not the entire string
            or its characters) are handed to the lemmatizer.

    Returns:
        A list with the lemmatized tokens.
    """
    tokens = lst.split() if isinstance(lst, str) else lst
    return [lemmatizer.lemmatize(token) for token in tokens]

# DataFrame.apply passes whole columns to the function, which made the
# lemmatizer look up complete texts instead of words. Map the function over
# the individual cells of every column instead.
train_x_lem = X_train.apply(lambda column: column.map(lemmatzation))
test_x_lem = X_test.apply(lambda column: column.map(lemmatzation))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [46]:
def lemmatize_text(txt):
    """Split `txt` on whitespace and return the list of lemmatized tokens."""
    return list(map(lemmatizer.lemmatize, txt.split()))


# Lemmatize the cleaned title and description of the train and the test split.
train_x_lem = pd.DataFrame({
    'title': X_train['Title no stopwords8punc8dig'].apply(lemmatize_text),
    'description': X_train['Description no stopwords8punc8dig'].apply(lemmatize_text),
})

test_x_lem = pd.DataFrame({
    'title': X_test['Title no stopwords8punc8dig'].apply(lemmatize_text),
    'description': X_test['Description no stopwords8punc8dig'].apply(lemmatize_text),
})

In [47]:
stemmer = SnowballStemmer("english")


def stemming(txt):
    """Split `txt` on whitespace and return the list of stemmed tokens."""
    return list(map(stemmer.stem, txt.split()))

# Stem the cleaned title and description of the train and the test split.
train_x_stem = pd.DataFrame({
    'title': X_train['Title no stopwords8punc8dig'].apply(stemming),
    'description': X_train['Description no stopwords8punc8dig'].apply(stemming),
})

test_x_stem = pd.DataFrame({
    'title': X_test['Title no stopwords8punc8dig'].apply(stemming),
    'description': X_test['Description no stopwords8punc8dig'].apply(stemming),
})

In [48]:
# That leaves two variants: (train_x_lem, test_x_lem) and (train_x_stem, test_x_stem).
# Which of them improves model quality more can only be found out by training a model,
# so both datasets are used from here on.

In [49]:
# Preview the first three rows of the lemmatized and then the stemmed training data.
for _preview in (train_x_lem, train_x_stem):
    print(_preview.head(3))

                                               title  \
0       [wall, st, bear, claw, back, black, reuters]   
1  [carlyle, look, toward, commercial, aerospace,...   
2     [oil, economy, cloud, stock, outlook, reuters]   

                                         description  
0  [reuters, shortsellers, wall, street, dwindlin...  
1  [reuters, private, investment, firm, carlyle, ...  
2  [reuters, soaring, crude, price, plus, worries...  
                                               title  \
0        [wall, st, bear, claw, back, black, reuter]   
1  [carlyl, look, toward, commerci, aerospac, reu...   
2      [oil, economi, cloud, stock, outlook, reuter]   

                                         description  
0  [reuter, shortsel, wall, street, dwindlingband...  
1  [reuter, privat, invest, firm, carlyl, groupwh...  
2  [reuter, soar, crude, price, plus, worriesabou...  


### 3. Apply vectorization

In [None]:
# Drop training descriptions that contain at most one token and keep the labels
# aligned with the surviving rows. y_train is expected to hold one label per
# row of train_x_lem['description'].

# (index, tokens) pairs of the descriptions that pass the length filter.
filtered_data = [
    (position, tokens)
    for position, tokens in enumerate(train_x_lem['description'])
    if len(tokens) > 1
]

# Split the pairs into the kept token lists and the indices they came from.
filtered_train_x_lem_for_vect = [tokens for _, tokens in filtered_data]
filtered_indices = [position for position, _ in filtered_data]

# Labels of the kept rows, looked up in y_train by those indices.
filtered_y_train = [y_train[position] for position in filtered_indices]

# Names used by the vectorization and training cells.
train_x_lem_for_vect = filtered_train_x_lem_for_vect
y_train_lem_for_vect = filtered_y_train

In [50]:
# Keep only the lemmatized test descriptions that contain more than one token.
test_x_lem_for_vect = [tokens for tokens in test_x_lem['description'] if len(tokens) > 1]

In [51]:
# Keep only stemmed descriptions with more than one token. For the train split
# the labels are filtered together with the documents; otherwise
# train_x_stem_for_vect and y_train would no longer line up row by row.
_stem_train_pairs = [
    (tokens, label)
    for tokens, label in zip(train_x_stem['description'], y_train)
    if len(tokens) > 1
]
train_x_stem_for_vect = [tokens for tokens, _ in _stem_train_pairs]
y_train_stem_for_vect = [label for _, label in _stem_train_pairs]

test_x_stem_for_vect = [tokens for tokens in test_x_stem['description'] if len(tokens) > 1]

In [52]:
# TF-IDF vectorizer with default settings; the same instance is used for both
# the lemmatized and the stemmed descriptions in the following cells.
tfidf_vectorizer = TfidfVectorizer()

In [53]:
# TfidfVectorizer expects one string per document and has to be fitted once on
# the whole training corpus. Calling fit_transform inside a loop re-learned the
# vocabulary from every single description (treating each token as a document),
# so the test transform only knew the words of the last training description.
# Both results are now sparse matrices with one row per description.
desc_tfidf_lem = tfidf_vectorizer.fit_transform(
    [' '.join(tokens) for tokens in train_x_lem_for_vect]
)
desc_test_tfidf_lem = tfidf_vectorizer.transform(
    [' '.join(tokens) for tokens in test_x_lem_for_vect]
)


In [54]:
# TfidfVectorizer expects one string per document and has to be fitted once on
# the whole training corpus. Calling fit_transform inside a loop re-learned the
# vocabulary from every single description (treating each token as a document),
# so the test transform only knew the words of the last training description.
# Both results are now sparse matrices with one row per description.
# Note: this re-fits the shared tfidf_vectorizer on the stemmed corpus.
desc_tfidf_stem = tfidf_vectorizer.fit_transform(
    [' '.join(tokens) for tokens in train_x_stem_for_vect]
)
desc_test_tfidf_stem = tfidf_vectorizer.transform(
    [' '.join(tokens) for tokens in test_x_stem_for_vect]
)

In [55]:
# Word2Vec hyper-parameters shared by the train and the test model:
# keep every word, 100-dimensional vectors, context window of 5.
_lem_w2v_params = dict(min_count=1, vector_size=100, window=5)

# NOTE(review): the test model is trained independently of the train model, so
# its vectors live in a separate embedding space -- confirm this is intended.
word2vec_desc_lem = gensim.models.Word2Vec(train_x_lem_for_vect, **_lem_w2v_params)
word2vec_test_desc_lem = gensim.models.Word2Vec(test_x_lem_for_vect, **_lem_w2v_params)

In [56]:
# Word2Vec hyper-parameters shared by the train and the test model:
# keep every word, 100-dimensional vectors, context window of 5.
_stem_w2v_params = dict(min_count=1, vector_size=100, window=5)

# NOTE(review): the test model is trained independently of the train model, so
# its vectors live in a separate embedding space -- confirm this is intended.
word2vec_desc_stem = gensim.models.Word2Vec(train_x_stem_for_vect, **_stem_w2v_params)
word2vec_test_desc_stem = gensim.models.Word2Vec(test_x_stem_for_vect, **_stem_w2v_params)

In [57]:
# Pre-trained sentence-embedding model, loaded by name; the following cells use
# it to encode the descriptions.
transformer = SentenceTransformer('all-MiniLM-L6-v2')



In [58]:
# encode() expects every input to be a string. A list of tokens is not read as
# one sentence, so the tokens are joined back into a single text per
# description before encoding.
vect_transformers_desc_lem = transformer.encode(
    [' '.join(tokens) for tokens in train_x_lem_for_vect]
)
vect_transformers_test_desc_lem = transformer.encode(
    [' '.join(tokens) for tokens in test_x_lem_for_vect]
)

In [59]:
# encode() expects every input to be a string. A list of tokens is not read as
# one sentence, so the tokens are joined back into a single text per
# description before encoding.
vect_transformers_desc_stem = transformer.encode(
    [' '.join(tokens) for tokens in train_x_stem_for_vect]
)
vect_transformers_test_desc_stem = transformer.encode(
    [' '.join(tokens) for tokens in test_x_stem_for_vect]
)

In [59]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# Every sentence embedding is treated as a sequence of 384 steps with a single
# channel, which is the (steps, channels) layout Conv1D expects.
input_shape = (384, 1)

# Size of the softmax / one-hot axis. The raw "Class Index" values are used
# directly as class ids, so this has to be larger than the largest label.
num_classes = 10

# Share of the training data held out for validation, so that the `val_loss`
# monitored by EarlyStopping actually exists (Keras takes it from the end of
# the arrays).
validation_fraction = 0.1

# 1-D CNN: two convolutions, max pooling, then a small dense classifier.
# An explicit Input layer replaces the `input_shape=` argument on the first layer.
modelCNN = models.Sequential()
modelCNN.add(layers.Input(shape=input_shape))
modelCNN.add(layers.Conv1D(128, 3, activation='relu'))
modelCNN.add(layers.Conv1D(64, 3, activation='relu'))
modelCNN.add(layers.MaxPooling1D(2))
modelCNN.add(layers.Flatten())
modelCNN.add(layers.Dense(64, activation='relu'))
modelCNN.add(layers.Dense(num_classes, activation='softmax'))

modelCNN.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# One-hot encode the labels of the filtered training rows.
y_train_categorical = to_categorical(y_train_lem_for_vect, num_classes=num_classes)

# Reshape (not normalize) the embeddings from (samples, 384) to (samples, 384, 1).
vect_transformers_desc_lem_norm = vect_transformers_desc_lem.reshape((vect_transformers_desc_lem.shape[0], 384, 1))

# Stop when the validation loss has not improved for 3 epochs and roll back to
# the best weights seen so far.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Without validation data the callback above has no `val_loss` to monitor,
# hence the validation split.
modelCNN.fit(
    vect_transformers_desc_lem_norm,
    y_train_categorical,
    epochs=10,
    validation_split=validation_fraction,
    callbacks=[early_stopping],
)


In [None]:
import numpy as np

# File the predictions are written to; the final message reports this same path.
output_path = 'predictionsCNN.csv'

# Reshape the test embeddings to the (samples, 384, 1) layout the CNN takes.
# NOTE(review): this reuses the name of the training tensor, so the training
# cell has to be re-run before fitting again.
vect_transformers_desc_lem_norm = vect_transformers_test_desc_lem.reshape((vect_transformers_test_desc_lem.shape[0], 384, 1))

predictions = modelCNN.predict(vect_transformers_desc_lem_norm)
# The index of the highest softmax output is the predicted class.
predicted_classes = np.argmax(predictions, axis=1)

# Predictions exist only for test rows whose lemmatized description has more
# than one token (the filter used to build test_x_lem_for_vect). Use the
# original row positions of those rows as IDs instead of renumbering 0..n-1,
# so that every prediction stays attached to the row it belongs to.
ids = np.array([
    position
    for position, tokens in enumerate(test_x_lem['description'])
    if len(tokens) > 1
])
assert len(ids) == len(predicted_classes), 'test rows and predictions are out of sync'

df_results = pd.DataFrame({
    'ID': ids,
    'Class Index': predicted_classes
})

df_results.to_csv(output_path, index=False)
print(f"Предсказания сохранены в '{output_path}'")

