In [1]:
import numpy as np, pandas as pd, tensorflow as tf, requests as rqst, io
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers

rnd = np.random.randint

folder = '/content/drive/MyDrive/Colab Notebooks/NLP_labs/'

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
file = open(folder+'sentences.txt')
sentences = file.read().split('. ')[:-1]
sentences

['which input is most helpful is classifying data without error? These input-weight products are summed and then the sum is passed through a node’s so-called activation function, to determine whether and to what extent that signal should progress further through the network to affect the ultimate outcome, say, an act of classification',
 'And thanks to the demand from gamers around the world, GPUs (graphics processing units) make it possible for us to leverage deep learning algorithms to build and train models with impressive results in a time-efficient manner! So to all the parents who don’t like your kids playing games: Gaming has its silver lining…',
 'Is the dataset you need publicly available, or can you create it (with a data annotation service like Scale or AWS Mechanical Turk)? In this example, spam emails would be labeled as spam, and the labels would enable the algorithm to map from inputs to the classifications you care about',
 '(Neural networks can also extract features th

### Підібрати наукові статті за обраним напрямком англійською мовою, з яких вилучити 100 речень, які найбільш відповідають зазначеному напрямку

In [4]:
sent_df = []
for sent in sentences:
  sent_df.append({'Sentence':sent, 'Label':'Reinforcement Learning'})

sent_df

[{'Label': 'Reinforcement Learning',
  'Sentence': 'which input is most helpful is classifying data without error? These input-weight products are summed and then the sum is passed through a node’s so-called activation function, to determine whether and to what extent that signal should progress further through the network to affect the ultimate outcome, say, an act of classification'},
 {'Label': 'Reinforcement Learning',
  'Sentence': 'And thanks to the demand from gamers around the world, GPUs (graphics processing units) make it possible for us to leverage deep learning algorithms to build and train models with impressive results in a time-efficient manner! So to all the parents who don’t like your kids playing games: Gaming has its silver lining…'},
 {'Label': 'Reinforcement Learning',
  'Sentence': 'Is the dataset you need publicly available, or can you create it (with a data annotation service like Scale or AWS Mechanical Turk)? In this example, spam emails would be labeled as sp

### Створити та зберегти у .csv файл pandas DataFrame з обраних речень із указанням в окремому ствопчику назви напрямку (ця назва має бути однаковою для усіх речень)


In [5]:
sent_df = pd.DataFrame(data=sent_df, columns=['Sentence', 'Label'])

sent_df.to_csv(folder+'sentences.csv')

sent_df

Unnamed: 0,Sentence,Label
0,which input is most helpful is classifying dat...,Reinforcement Learning
1,And thanks to the demand from gamers around th...,Reinforcement Learning
2,"Is the dataset you need publicly available, or...",Reinforcement Learning
3,(Neural networks can also extract features tha...,Reinforcement Learning
4,Deep learning algorithms have improved over th...,Reinforcement Learning
...,...,...
94,That’s why deep learning is important,Reinforcement Learning
95,Deep learning maps inputs to outputs,Reinforcement Learning
96,This is known as supervised learning,Reinforcement Learning
97,Learn to build AI in simulations »,Reinforcement Learning


### Здійснити підготовку набору даних до подальшого моделювання (tokenization&embedding, train_test_split)


### Train/Test Split

In [6]:
from sklearn.model_selection import train_test_split
train_split, test_split = train_test_split(sent_df, train_size=0.8, test_size=0.2)
train_split

Unnamed: 0,Sentence,Label
10,Other types of problems include anomaly detect...,Reinforcement Learning
52,"As you can see, with neural networks, we’re mo...",Reinforcement Learning
91,"Not zero surprises, just marginally fewer",Reinforcement Learning
71,Deep learning is beating us in the most challe...,Reinforcement Learning
41,Anomaly detection: The flipside of detecting s...,Reinforcement Learning
...,...,...
19,A node is just a place where computation happe...,Reinforcement Learning
61,’s post on blockchains for AI as a solution fo...,Reinforcement Learning
39,"Do I have the right data? For example, if you ...",Reinforcement Learning
56,Large corporations and young startups alike ar...,Reinforcement Learning


#### Vectorization

In [7]:
max_tokens = 10000

tokens_count = 0
for sent in sentences:
  tokens_count+=len(sent.split())
avg_tokens = round(tokens_count/len(sentences))
avg_tokens

18

In [8]:
text_vectorizer = TextVectorization(max_tokens=max_tokens, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=avg_tokens, # how long should the output sequence of tokens be?
                                    pad_to_max_tokens=True)
text_vectorizer

<keras.layers.preprocessing.text_vectorization.TextVectorization at 0x7f6ea4b0e410>

In [9]:
text_vectorizer.adapt(sent_df['Sentence'])

In [10]:
text_vectorizer(sent_df['Sentence'])

<tf.Tensor: shape=(99, 18), dtype=int64, numpy=
array([[ 26,  36,   6, ...,   5,  69,   2],
       [  5, 302,   3, ...,  16, 168,  18],
       [  6,   2, 123, ..., 659, 347,  34],
       ...,
       [ 43,   6, 193, ...,   0,   0,   0],
       [ 58,   3, 127, ...,   0,   0,   0],
       [167, 664, 368, ...,   0,   0,   0]])>

In [11]:
print(f"Most Used: {text_vectorizer.get_vocabulary()[:5]}")
print(f"Most Unused: {text_vectorizer.get_vocabulary()[-5:]}")

Most Used: ['', '[UNK]', 'the', 'to', 'of']
Most Unused: ['9x', '3x', '21st', '12', '01']


#### Embedding

In [12]:
embedding = layers.Embedding(input_dim=max_tokens, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=avg_tokens) # how long is each input

embedding(text_vectorizer(sent_df['Sentence']))


<tf.Tensor: shape=(99, 18, 128), dtype=float32, numpy=
array([[[ 0.02649392,  0.03455598,  0.03693109, ...,  0.04001713,
          0.01619015,  0.0329239 ],
        [ 0.01735726,  0.03349419,  0.0407011 , ...,  0.03383284,
         -0.01569266,  0.04784724],
        [ 0.04263499, -0.0074714 ,  0.00940112, ..., -0.04725128,
          0.01254864, -0.04559779],
        ...,
        [-0.04451725, -0.01108446, -0.03888494, ..., -0.01655387,
         -0.0312529 ,  0.01859114],
        [ 0.04264941, -0.03020682, -0.00348865, ..., -0.02534142,
          0.02340144, -0.04692082],
        [-0.00790466, -0.03566766,  0.01512972, ..., -0.01683168,
          0.00136293,  0.0098209 ]],

       [[-0.04451725, -0.01108446, -0.03888494, ..., -0.01655387,
         -0.0312529 ,  0.01859114],
        [-0.01716309,  0.00108542,  0.04675511, ..., -0.03306841,
          0.04894564,  0.04943749],
        [ 0.00415247, -0.01249785,  0.0309405 , ...,  0.03757297,
          0.035754  ,  0.00465264],
        ...,

### Відповіді оформити .ipynb, .csv, .pdf документами