In [1]:
import io
import re
import sys
import json
import math
import random
import string
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
import logging
import multiprocessing
import gensim

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import activations

from tensorflow.keras import utils
from tensorflow.keras.preprocessing.text import Tokenizer

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split



In [2]:
# Base directory that the model file and the CSV below are resolved against.
# NOTE(review): looks like a Google Drive mount inside Colab — confirm before running elsewhere.
PATH = "/content/drive/MyDrive/Colab Notebooks/–•–∞–∫–ú—ç—Ä–ú–æ—Å–∫–≤—ã2023/"

In [3]:
# Load a previously saved gensim Word2Vec model. Its vocabulary (word2vec.wv) is what
# the helper functions use to filter tokens and to map words to integer indices.
word2vec = Word2Vec.load(PATH + "Models/w2v/tweets_model.w2v")


In [4]:
def split(input_text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=' ', outlen=50):
    """Tokenize a text into exactly ``outlen`` known-vocabulary words.

    The input is converted with ``str()`` and lower-cased, every character
    listed in ``filters`` is replaced by the ``split`` separator, and the
    result is cut on that separator. Only non-empty tokens that are present
    in ``word2vec.wv`` are kept.

    Args:
        input_text: Value to tokenize; anything accepted by ``str()``.
        filters: Characters to treat as separators.
        split: Separator string used both as replacement and as split point.
        outlen: Length of the returned list.

    Returns:
        A list of tokens, right-padded with empty strings and truncated
        so that it holds ``outlen`` items.
    """
    lowered = str(input_text).lower()

    # Turn every filtered character into the separator so one split() handles all of them.
    separator_table = str.maketrans(dict.fromkeys(filters, split))
    raw_tokens = lowered.translate(separator_table).split(split)

    known = [token for token in raw_tokens if token and token in word2vec.wv]

    # Multiplying by a non-positive count yields an empty list, so no padding is
    # added when there are already enough tokens.
    padded = known + [""] * (outlen - len(known))
    return padded[:outlen]


def vectorizator(x, outlen=50):
    """Tokenize every item of ``x`` with ``split`` and stack the results.

    Args:
        x: Iterable of texts.
        outlen: Number of tokens to keep per text (forwarded to ``split``).

    Returns:
        A NumPy array with dtype ``np.str_`` built from one token list per item.
    """
    token_rows = []
    for text in x:
        token_rows.append(split(text, outlen=outlen))
    return np.array(token_rows, dtype=np.str_)


def embedding(x, maxlen=50):
    """Encode rows of tokens as rows of ``word2vec`` vocabulary indices.

    For each row, the words found in ``word2vec.wv`` are replaced by their
    vocabulary index and written left-aligned into the output row; words not
    in the vocabulary (including the empty-string padding) are skipped.
    Unused trailing positions stay 0.

    Rows holding more than ``maxlen`` known words are truncated to the first
    ``maxlen`` of them instead of raising ``IndexError``.

    Args:
        x: Sequence of token sequences (NumPy array or nested list).
        maxlen: Width of the output matrix.

    Returns:
        ``np.float32`` array of shape ``(len(x), maxlen)``.

    Note:
        0 is used as the fill value, so a padded position cannot be told
        apart from a word whose vocabulary index is 0.
    """
    result = np.zeros(shape=[len(x), maxlen], dtype=np.float32)
    vocab = word2vec.wv

    for i, text in enumerate(x):
        index = 0
        for word in text:
            # Stop once the row is full; writing at column `maxlen` would be out of bounds.
            if index >= maxlen:
                break

            if word in vocab:
                result[i, index] = vocab.get_index(word)
                index += 1

    return result

def prepare(texts, maxlen=50):
    """Run the full text-to-matrix pipeline.

    Tokenizes ``texts`` with ``vectorizator`` and converts the tokens to
    vocabulary indices with ``embedding``, using ``maxlen`` as the row width
    in both steps.

    Args:
        texts: Iterable of raw texts.
        maxlen: Number of tokens / columns per text.

    Returns:
        Whatever ``embedding`` returns for the tokenized texts.
    """
    return embedding(vectorizator(texts, outlen=maxlen), maxlen=maxlen)

In [5]:
# Read only the first 30000 rows of the comma-separated reviews file.
data = pd.read_csv(PATH + "sentiment_up.csv", delimiter=',', nrows=30000)


In [6]:
# Encode the first column of the frame with prepare() (default row width).
# Done in one step so X never temporarily holds the raw Series.
X = prepare(data.iloc[:, 0])


In [7]:
# X = np.concatenate([X], axis=0)

In [8]:
from sklearn.cluster import Birch

# Fit a BIRCH model on the encoded texts, requesting 12 final clusters.
# NOTE(review): the features in X are vocabulary indices produced by embedding(),
# i.e. identifiers rather than measurements — confirm that distances between
# them are a meaningful basis for clustering.
bm = Birch(n_clusters=12).fit(X)


In [9]:
# Show the loaded rows side by side with the cluster label assigned during fit.
data.assign(pred=bm.labels_)


Unnamed: 0,–ö–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π,–î–∞—Ç–∞,–û—Ü–µ–Ω–∫–∞,–°—É–º–º–∞ –∑–∞–∫–∞–∑–∞,pred
0,"–ë–ª–∞–≥–æ–¥–∞—Ä—é –∑–∞ –∑–∞–∫–∞–∑! –†–∞–¥–∞ –ø–æ–ª—É—á–µ–Ω–∏—é, –Ω–æ –∑–∞–∫–∞–∑—ã–≤...",2022-12-15T15:58:09.796075+03:00,3,2450,0
1,"–í—Å–µ –ø–æ–¥–æ—à–ª–æ, –æ—Ç–ª–∏—á–Ω–æ–µ –∫–∞—á–µ—Å—Ç–≤–æ",2022-11-07T23:37:52.541536+03:00,5,699,2
2,"–£–ø–∞–∫–æ–≤–∞–Ω–æ –≤—Å–µ —Ö–æ—Ä–æ—à–æ, —Ç–æ–ª—å–∫–æ —Ç—Ä–∞–Ω—Å–ø–æ—Ä—Ç–∏—Ä–æ–≤–æ—á–Ω–∞...",2022-12-13T21:24:31.155951+03:00,4,954,7
3,"–ë—ã—Å—Ç—Ä–æ, —É–¥–æ–±–Ω–æ.",2022-11-09T17:55:44.379634+03:00,5,269,2
4,üëç,2022-11-09T20:49:11.383462+03:00,5,229,2
...,...,...,...,...,...
29995,–ö—Ä–∞—Å–∏–≤—ã–µ —Ç–æ —á—Ç–æ –Ω—É–∂–Ω–æ.,2022-05-02T18:03:42.599423+03:00,5,164,2
29996,–í—Å—ë —Ö–æ—Ä–æ—à–æ,2022-05-05T16:52:06.975875+03:00,4,230,2
29997,–°—É–ø–µ—Ä,2022-05-13T16:33:24.176178+03:00,5,3299,2
29998,–í—Å—ë –æ—Ç–ª–∏—á–Ω–æ! –ë–ª–∞–≥–æ–¥–∞—Ä—é!,2022-05-02T14:57:23.329183+03:00,5,297,2




In [10]:
# Count the non-null comments that landed in each cluster.
(
    data.assign(pred=bm.labels_)
    .groupby(['pred'])['–ö–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π']
    .count()
)


pred
0      3503
1      1349
2     21325
3       591
4       785
5       229
6       860
7       422
8       203
9       455
10      147
11      131
Name: –ö–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π, dtype: int64

In [11]:

# Two hand-written sample texts used to spot-check the fitted model.
texts = [
    "–ü–æ—Ç–µ—Ä—è–ª–∏ –º–æ—é –ø–æ—Å—ã–ª–∫—É ",
    "–¥–æ—Å—Ç–∞–≤–∫–æ–π –≤ –ø–æ—Å—Ç–∞–º–∞—Ç PickPoint. –î–æ—Å—Ç–∞–≤–∏–ª–∏ –±—ã—Å—Ç—Ä–æ",
]
# Encode with row width 50, which equals prepare()'s default used when X was built.
Xt = prepare(texts, 50)


# Assign each sample to one of the clusters learned by bm.
bm.predict(Xt)

array([2, 1])