In [None]:
from collections import Counter, defaultdict
import os
from tqdm import tqdm
from pathlib import Path
import nltk
from nltk.corpus import stopwords
import re
import string

In [None]:
# Fetch the NLTK stop-word corpus, then keep the English list as a set so
# membership checks in build_token_dictionary are cheap.
nltk.download('stopwords')

english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [None]:
def read_tsv_file(path_file):
    """Return the non-empty first-column values of a tab-separated file.

    Args:
        path_file: Path of a UTF-8 encoded TSV file.

    Returns:
        list[str]: The first tab-delimited field of every line, stripped of
        surrounding whitespace, in file order. Lines whose first field is
        empty after stripping (including blank lines) are skipped.
    """
    with open(path_file, 'r', encoding='utf-8') as handle:
        # A blank line yields an empty first field, so one emptiness check
        # covers both "blank line" and "empty first column".
        first_fields = (raw_line.split('\t', 1)[0].strip() for raw_line in handle)
        return [field for field in first_fields if field]

def build_token_dictionary(sentences):
    """Count tokens and drop stop words and punctuation-only tokens.

    Args:
        sentences: Iterable of tokens (despite the name, each item is counted
            as one token by ``Counter``).

    Returns:
        dict: token -> occurrence count, in first-seen order, excluding tokens
        found in the module-level ``stop_words`` set and tokens that are empty
        once every non-word, non-space character is removed.
    """
    kept = {}
    for token, freq in Counter(sentences).items():
        if not freq > 0:
            continue
        if token in stop_words:
            continue
        # Tokens made only of punctuation collapse to '' here and are dropped.
        if re.sub(r"[^\w\s]", "", token) == '':
            continue
        kept[token] = freq
    return kept

In [None]:
# Walk <path_folder>/<category>/<file>, remembering each document's relative
# name and pooling every token into one list for the corpus-wide vocabulary.
path_folder = '/kaggle/input/nlp-lab/train'
folder_category = os.listdir(path_folder)

documents = []
sentences = []

for folder in tqdm(folder_category):
    for file in os.listdir(os.path.join(path_folder, folder)):
        path_file = f'{path_folder}/{folder}/{file}'
        sentences.extend(read_tsv_file(path_file))
        documents.append(f'{folder}/{file}')

# Corpus-wide token frequencies (stop words and punctuation-only tokens removed).
token_freq = build_token_dictionary(sentences)


In [None]:
import json

# Persist the vocabulary counts as pretty-printed JSON, keeping non-ASCII
# characters as-is instead of \u escapes.
serialized_freq = json.dumps(token_freq, ensure_ascii=False, indent=4)
with open('token_freq.json', 'w', encoding='utf-8') as json_file:
    json_file.write(serialized_freq)

In [None]:
import csv
import pandas as pd

# Vocabulary in the insertion order of token_freq; defines the row order of the matrix.
terms = list(token_freq)

def create_term_document_matrix(documents, terms, output_path='output-lab2.csv'):
    """Build the term-document count matrix and save it as CSV.

    Args:
        documents: Document paths relative to the module-level ``path_folder``.
        terms: Vocabulary; one matrix row per term, in this order.
        output_path: Destination CSV file. Defaults to the file name the
            notebook has always written.

    Returns:
        pandas.DataFrame: A ``Word`` column holding ``terms`` followed by one
        column per document with the term's count in that document (0 when
        the term is absent or filtered out by ``build_token_dictionary``).
    """
    matrix = {'Word': terms}
    for doc in tqdm(documents):
        doc_tokens = read_tsv_file(os.path.join(path_folder, doc))
        doc_freq = build_token_dictionary(doc_tokens)
        # dict.get is a constant-time lookup; the previous
        # `token in list(doc_freq.keys())` rebuilt and scanned a list per term.
        matrix[doc] = [doc_freq.get(token, 0) for token in terms]
    df = pd.DataFrame(matrix)
    df.to_csv(output_path, index=False)
    # Return the frame that was just written instead of building a second copy.
    return df

# Build the matrix for every collected document; the call also writes the CSV to disk.
term_document_matrix = create_term_document_matrix(documents, terms)


In [None]:
import pandas as pd

# Load the saved term-document matrix.
# keep_default_na=False stops pandas from converting vocabulary entries such as
# "null", "nan" or "NA" in the Word column into float NaN, which would make
# those terms impossible to look up by name later on.
df = pd.read_csv('/kaggle/input/nlp-lab/output-lab2.csv', keep_default_na=False)

df.head(10)

In [None]:
def tokenize(text):
    """Split raw text into sentences, each a list of tokens.

    Processing order:
      1. e-mail addresses are removed;
      2. phone-like numbers (optional ``+`` followed by 8-15 digits, not
         starting with 0) are removed;
      3. the text is split into sentences at whitespace that follows
         ``.``, ``?``, ``!`` or a newline (with guards for patterns such as
         ``e.g.`` and ``Mr.``);
      4. ASCII punctuation is stripped from each sentence and the remaining
         word tokens are extracted.

    Args:
        text: The raw text.

    Returns:
        list[list[str]]: One token list per sentence; sentences without any
        token are omitted, so empty input yields ``[]``.
    """
    email_pattern = r'\S+@\S+'
    # The previous pattern r"^\\+?...$" demanded a literal backslash and only
    # matched when the whole text was a number, so it never removed anything.
    phone_pattern = r'(?<!\w)\+?[1-9][0-9]{7,14}(?!\w)'
    sentence_boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!|\n)\s'
    token_pattern = r'\b\w+\b|[\(\),.—:;!?|<>"]'
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Remove e-mail addresses.
    text = re.sub(email_pattern, '', text)

    # Remove phone numbers (before punctuation stripping, which would eat the '+').
    text = re.sub(phone_pattern, '', text)

    cleaned_tokens = []
    # Split into sentences while the terminators are still present; stripping
    # punctuation first (as before) erased every '.', '?' and '!' boundary.
    for sentence in re.split(sentence_boundary, text):
        # Remove punctuation.
        sentence = sentence.translate(punctuation_table)
        sentence_tokens = re.findall(token_pattern, sentence)
        if sentence_tokens:
            cleaned_tokens.append(sentence_tokens)
    return cleaned_tokens

def processing_sentences(sentences):
    """Tokenize every text in ``sentences``.

    Args:
        sentences: Iterable of raw text strings.

    Returns:
        list: One ``tokenize`` result per input item, in input order.
    """
    return list(map(tokenize, sentences))

In [None]:
def count_texts_with_word(df, word):
    """Count the documents in which ``word`` occurs at least once.

    Args:
        df: Term-document matrix whose first column holds the terms and whose
            remaining columns hold per-document counts.
        word: The term to look up.

    Returns:
        Number of document columns with a count greater than zero in the
        first row matching ``word``.

    Raises:
        IndexError: If ``word`` does not appear in the first column.
    """
    term_column = df.columns[0]
    first_match_counts = df[df[term_column] == word].iloc[0, 1:]
    return (first_match_counts > 0).sum()

In [None]:
import math
def vectorize_tf_idf(text, df):
    """Return the TF-IDF vector of ``text`` over the vocabulary of ``df``.

    For every vocabulary word present in the text:
    ``tf = count / total kept tokens`` and
    ``idf = log(documents / documents containing the word + 1)``.

    Args:
        text: Either an iterable of tokens or a plain string. A string is
            split on whitespace first; previously it was iterated character
            by character, so single letters were counted instead of words.
        df: Term-document matrix with a ``Word`` column followed by one count
            column per document.

    Returns:
        list[float]: One weight per row of ``df['Word']``, in row order;
        0.0 for words absent from the text or present in no document.
    """
    tokens = text.split() if isinstance(text, str) else text
    token_fr = build_token_dictionary(tokens)

    total_words = sum(token_fr.values())
    documents_count = len(df.columns[1:])

    result = []
    for word in list(df['Word']):
        if word not in token_fr:
            result.append(0.0)
            continue
        texts_with_word = count_texts_with_word(df, word)
        if texts_with_word == 0:
            # An all-zero row would otherwise divide by zero in the idf term.
            result.append(0.0)
            continue
        tf = token_fr[word] / total_words
        idf = math.log((documents_count / texts_with_word) + 1)
        result.append(tf * idf)
    return result



In [None]:
from gensim.models import Word2Vec
# NOTE(review): this rebinds the global path_folder to a different directory
# than the first data-loading cell ('/kaggle/input/nlp-lab/train'); confirm
# which location is the intended one.
path_folder = '/kaggle/input/nlp-lab/train/train'
folder_category = os.listdir(path_folder)

# One token list per file; Word2Vec treats each list as a single training sentence.
train_sentences = []
for folder in tqdm(folder_category):
    for file in os.listdir(os.path.join(path_folder, folder)):
        path_file = f'{path_folder}/{folder}/{file}'
        train_sentences.append(read_tsv_file(path_file))

model = Word2Vec(
    sentences=train_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=40,
)

In [None]:
# Nearest neighbours of "organization" in the trained Word2Vec space.
similar_words = model.wv.most_similar("organization")
similar_words

In [None]:
import numpy as np

def cosine_similarity_custom(vec1, vec2):
    """Cosine similarity between two 1-D vectors.

    Args:
        vec1: First vector (any array-like accepted by NumPy).
        vec2: Second vector of the same length.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero norm. Previously a zero vector produced NaN together
        with a NumPy RuntimeWarning.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Similarity is undefined for a zero vector; report "no similarity".
        return 0.0
    return np.dot(vec1, vec2) / norm_product



In [None]:
# Similarity of two Word2Vec vectors using the helper defined above.
cosine_similarity_custom(model.wv["organization"], model.wv["party"])

In [None]:
from sklearn.decomposition import PCA
# Tokens to visualise; their Word2Vec vectors are projected onto two
# principal components for plotting. model.wv[...] raises KeyError for a
# token that is not in the trained vocabulary.
sim_tokens = [
    'organisation',
    'party',
    'individual',
    'dimension',
    'component',
    'arena',
    'establishment',
    'initiative',
    'ceremony',
    'action',
    'organization',
]
vectorized_sim = [model.wv[token] for token in sim_tokens]

pca = PCA(n_components=2)
res = pca.fit_transform(vectorized_sim)


In [None]:
# Print the cosine similarity between the reference token and the listed tokens.
# The label previously said "local" although the vector being compared is the
# one for "organization"; the label is now derived from the token actually used.
reference_token = "organization"
for token in sim_tokens[1:]:
    similarity = cosine_similarity_custom(model.wv[reference_token], model.wv[token])
    print(f'Current token: {reference_token} -- Selected token: {token} --->{similarity}')

In [None]:
import matplotlib.pyplot as plt

# Scatter the 2-D PCA projection, labelling each point with its token.
fig, ax = plt.subplots(figsize=(12, 8))
for (x_coord, y_coord), token in zip(res, sim_tokens):
    ax.scatter(x_coord, y_coord)
    # Small offset so the label does not sit on top of its marker.
    ax.text(x_coord+0.01, y_coord+0.01, token, fontsize=9)

ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_title('2D')
ax.grid(True)
plt.show()

In [None]:
# Preview the vocabulary instead of printing every term into the cell output.
df['Word'].head(20).tolist()

In [None]:
# TF-IDF vector for each of the first 20000 vocabulary terms; str() guards
# against non-string cells in the Word column.
term_text = [
    vectorize_tf_idf(str(sample_content), df)
    for sample_content in tqdm(list(df['Word'])[:20000])
]

In [None]:
# Write one space-separated vector per line.
with open("term_text.txt", "w") as txt_file:
    for line in term_text:
        # The vectors hold floats and str.join accepts only strings, so each
        # value is converted first (the bare join raised TypeError).
        txt_file.write(" ".join(map(str, line)) + "\n") # works with any number of elements in a line

In [None]:
# Reduce the TF-IDF term vectors to the Word2Vec dimensionality so both
# representations have the same number of components.
pca = PCA(n_components=model.vector_size)
terms_transformed = pca.fit_transform(term_text)

In [None]:
# Label each reduced vector with its term. .loc slicing is inclusive, so
# [:19999] selects the same 20000 rows that were vectorized.
term_labels = df['Word'].loc[:19999]
terms_to_compare = pd.DataFrame(terms_transformed)
terms_to_compare.index = term_labels

In [None]:
# Preview the first 20 reduced term vectors.
terms_to_compare.head(20)

In [None]:
def compare_words(word1, word2):
    """Print the cosine similarity of two words under both representations.

    Uses the module-level ``model`` (Word2Vec) and ``terms_to_compare``
    (PCA-reduced TF-IDF vectors indexed by term).

    Args:
        word1: First word.
        word2: Second word.

    Raises:
        KeyError: If a word is missing from either representation.
    """
    print(f"Words: {word1} --- {word2}")

    w2v_score = cosine_similarity_custom(model.wv[word1], model.wv[word2])
    print("W2Vec:", w2v_score)

    tfidf_first = terms_to_compare.loc[word1]
    tfidf_second = terms_to_compare.loc[word2]
    print("Tf-Idf:", cosine_similarity_custom(tfidf_first, tfidf_second))


In [None]:
# Example comparison: 'organization' against 'lines'.
compare_words('organization', 'lines')

In [None]:
# Example comparison: 'music' against 'heavy'.
compare_words('music', 'heavy')

In [None]:
def vectorize_text(text, model):
    """Embed a text as the mean of its sentence vectors.

    A sentence vector is the mean of the Word2Vec vectors of the sentence's
    in-vocabulary tokens; a sentence with no known token contributes a zero
    vector. The text vector is the mean over all sentences returned by
    ``tokenize``.

    The previous version accumulated into Python lists, where ``+=`` with a
    NumPy array appends elements instead of adding them, and then divided a
    list by an int, which raises ``TypeError``.

    Args:
        text: Raw text to embed.
        model: Trained model exposing ``vector_size`` and ``wv``.

    Returns:
        numpy.ndarray: Vector of length ``model.vector_size``; all zeros when
        the text yields no sentences.
    """
    sentences = tokenize(text)
    text_vector = np.zeros(model.vector_size, dtype=float)
    if not sentences:
        # Nothing to average; avoid dividing by zero.
        return text_vector

    for sentence in sentences:
        known_vectors = [model.wv[token] for token in sentence if model.wv.has_index_for(token)]
        if known_vectors:
            text_vector += np.mean(known_vectors, axis=0)
    return text_vector / len(sentences)


In [None]:
# Embed a sample sentence with the trained model.
text = "People in the San Francisco Bay area can get Darwin Fish from Lynn Gold"
vectorize_text(text, model)

In [None]:
path_foldes = '/kaggle/input/nlp-lab/20news-bydate/20news-bydate/20news-bydate-train'

# Worklist traversal: start from the top-level entries and append every
# nested directory (as a path relative to path_foldes) so it is visited too.
catalogs = os.listdir(path_foldes)
cursor = 0
while cursor < len(catalogs):
    catalog = catalogs[cursor]
    cursor += 1
    for file in os.listdir(os.path.join(path_foldes, catalog)):
        nested_entry = f'{catalog}/{file}'
        if os.path.isdir(os.path.join(path_foldes, nested_entry)):
            catalogs.append(nested_entry)


In [None]:
# Embed every regular file below the collected directories, keyed by its
# path relative to path_foldes.
dict_tsv = {}
for catalog in tqdm(catalogs):
    for file in os.listdir(os.path.join(path_foldes, catalog)):
        relative_name = f'{catalog}/{file}'
        full_path = os.path.join(path_foldes, relative_name)
        if os.path.isdir(full_path):
            continue
        # latin1 decodes any byte sequence, so reading never fails on encoding.
        with open(full_path, 'r', encoding='latin1') as file_name:
            sample_content = file_name.read()
        dict_tsv[relative_name] = vectorize_text(sample_content, model)

In [None]:
# Show a few entries only; displaying the whole mapping prints every embedding.
dict(list(dict_tsv.items())[:3])

In [None]:
# One row per document: identifier (with any ".tsv" removed) followed by the
# embedding components, tab-separated. No header row is written.
with open("/kaggle/working/train_embeddings.tsv", "w") as f:
    for doc_key, doc_vector in dict_tsv.items():
        doc_id = doc_key.replace(".tsv", "")
        print(doc_id, *doc_vector, sep="\t", file=f)

In [None]:
# The embeddings file is written without a header row, so header=None keeps
# the first document from being consumed as column names.
train_emb = pd.read_csv('/kaggle/working/train_embeddings.tsv', sep='\t', header=None)

In [None]:
# Call head(); the bare attribute only displayed the bound method's repr.
train_emb.head()

In [None]:
path_foldes = '/kaggle/input/nlp-lab/20news-bydate/20news-bydate/20news-bydate-test'

# Worklist traversal: start from the top-level entries and append every
# nested directory (as a path relative to path_foldes) so it is visited too.
catalogs = os.listdir(path_foldes)
cursor = 0
while cursor < len(catalogs):
    catalog = catalogs[cursor]
    cursor += 1
    for file in os.listdir(os.path.join(path_foldes, catalog)):
        nested_entry = f'{catalog}/{file}'
        if os.path.isdir(os.path.join(path_foldes, nested_entry)):
            catalogs.append(nested_entry)


In [None]:
# Embed every regular file below the collected test directories, keyed by
# its path relative to path_foldes.
dict_tsv_test = {}
for catalog in tqdm(catalogs):
    for file in os.listdir(os.path.join(path_foldes, catalog)):
        relative_name = f'{catalog}/{file}'
        full_path = os.path.join(path_foldes, relative_name)
        if os.path.isdir(full_path):
            continue
        # latin1 decodes any byte sequence, so reading never fails on encoding.
        with open(full_path, 'r', encoding='latin1') as file_name:
            sample_content = file_name.read()
        dict_tsv_test[relative_name] = vectorize_text(sample_content, model)

In [None]:
# One row per test document: identifier (with any ".tsv" removed) followed by
# the embedding components, tab-separated. No header row is written.
with open("/kaggle/working/test_embeddings.tsv", "w") as f:
    for doc_key, doc_vector in dict_tsv_test.items():
        doc_id = doc_key.replace(".tsv", "")
        print(doc_id, *doc_vector, sep="\t", file=f)