In [None]:
import tensorflow as tf
import numpy as np
import os
import time

# Load the full training corpus into memory as one UTF-8 string.
# The context manager guarantees the file handle is closed after reading
# (the bare open(...).read() form left it open until garbage collection).
with open('/content/train.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# Sorted list of the distinct characters that occur in the corpus.
vocab = sorted(set(text))

# Inverse lookup layer (invert=True) built from the same vocabulary as
# ids_from_chars below, with no mask token.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None, invert=True)

# Forward lookup layer over the corpus vocabulary, with no mask token.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

# Split the corpus into UTF-8 characters and run them through the lookup layer.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
seq_length = 100
# Chunks hold seq_length+1 ids because split_input_target drops one id from
# each end to form the input and the target; a trailing partial chunk is dropped.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

def split_input_target(sequence):
    """Split one sequence into an (input, target) pair shifted by one step.

    The input is the sequence without its last element; the target is the
    sequence without its first element, so target[i] follows input[i].
    """
    return sequence[:-1], sequence[1:]

# Turn every (seq_length+1)-long chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

BATCH_SIZE = 64
# Number of elements held in the shuffle buffer.
BUFFER_SIZE = 10000
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE is the stable name of the deprecated
    # tf.data.experimental.AUTOTUNE alias.
    .prefetch(tf.data.AUTOTUNE))

# Size of the lookup layer's vocabulary; MyModel uses it for both the
# embedding table and the width of the output layer.
vocab_size = len(ids_from_chars.get_vocabulary())
# Width of each embedding vector.
embedding_dim = 256
# Number of GRU units.
rnn_units = 1024

class MyModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense.

    The Dense layer has no activation, so the model outputs one vector of
    vocab_size raw scores per input position.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the three layers.

        Args:
            vocab_size: number of token ids (embedding rows and output width).
            embedding_dim: width of each embedding vector.
            rnn_units: number of GRU units.
        """
        # Fix: the bound super().__init__ already receives `self`; passing it
        # again as a positional argument is an error.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of token ids.

        Args:
            inputs: token ids fed to the embedding layer.
            states: initial GRU state; when None the GRU's own initial state
                is used.
            return_state: when True, also return the final GRU state.
            training: forwarded to every layer.

        Returns:
            The Dense output, or a tuple (Dense output, GRU state) when
            return_state is True.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

model = MyModel(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units)
# MyModel's Dense layer has no activation, hence from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss)

# Weights-only checkpoints named ckpt_<epoch> under ./training_checkpoints.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)



In [None]:
# Train for up to EPOCHS passes over the dataset, checkpointing through
# checkpoint_callback; the returned History object is kept in `history`.
EPOCHS = 100
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100

KeyboardInterrupt: ignored

In [None]:
import json

# Build the character -> index mapping from the lookup layer's own vocabulary
# so the indices match the token ids the model was trained on. Enumerating
# `vocab` from 0 is off by one: the StringLookup layer reserves index 0 for
# the "[UNK]" token (which therefore also appears as a key here).
char_to_idx = {char: idx for idx, char in enumerate(ids_from_chars.get_vocabulary())}

# Persist the mapping as JSON.
with open("char_to_idx.json", "w", encoding="utf-8") as f:
    json.dump(char_to_idx, f)


In [None]:
# Read the character -> index mapping back from the JSON file.
with open("char_to_idx.json", "r") as f:
    char_to_idx = json.load(f)

# Invert the mapping so that indices look up characters: each (char, idx)
# item is reversed into an (idx, char) pair.
idx_to_char = dict(map(reversed, char_to_idx.items()))


In [None]:
# Export the model to the ./saved_model directory.
saved_model_dir = './saved_model'
model.save(saved_model_dir)



In [None]:
# Reload the model that was exported to saved_model_dir.
loaded_model = tf.keras.models.load_model(saved_model_dir)

In [None]:
# Look up the embedding vector of every token id. The id range comes from
# vocab_size (the lookup layer's vocabulary, which includes the "[UNK]"
# entry); len(vocab) is one smaller and would leave out the last id.
all_word_ids = np.arange(vocab_size)
embeddings = loaded_model.embedding(all_word_ids)

In [None]:
import tensorflow as tf
import numpy as np
import os
import time


In [None]:
# Generate 100 characters after a prompt and time the run.
# NOTE(review): one_step_model is only defined in a later cell (the OneStep
# class cell), so on a fresh kernel that cell has to run before this one.
start = time.time()
states = None
next_char = tf.constant(['eur/usd tomorrow will '])
result = [next_char]

# Feed each generated character back in together with the carried state.
for n in range(100):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

# Concatenate the prompt and all generated characters into one string tensor.
result = tf.strings.join(result)
end = time.time()

print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

eur/usd tomorrow will close TopOrder with BottomOrder
  Float
  Can be negative
   [AllGrid] Profit ($) for CriticalClo 

________________________________________________________________________________

Run time: 0.703019380569458


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells save to and load from a path under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Export the single-step generator to Google Drive with tf.saved_model.save.
# NOTE(review): one_step_model is defined in a later cell; that cell must run first.
tf.saved_model.save(one_step_model, '/content/drive/MyDrive/04-17')

NameError: ignored

In [None]:
# Load the exported generator back from Google Drive; the export at this path
# must already exist.
one_step_reloaded1 = tf.saved_model.load('/content/drive/MyDrive/04-17')

OSError: ignored

In [None]:
class OneStep(tf.keras.Model):
  """Single-step sampler wrapped around a trained character model.

  Holds the model, the two lookup layers and a sampling temperature, plus a
  precomputed additive mask that puts -inf on the "[UNK]" id.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Column of ids that must never be sampled (only "[UNK]").
    banned_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocabulary_size = len(ids_from_chars.get_vocabulary())
    # Sparse vector over the vocabulary with -inf at every banned id,
    # densified so it can simply be added to the logits.
    self.prediction_mask = tf.sparse.to_dense(
        tf.SparseTensor(
            indices=banned_ids,
            values=[float('-inf')] * len(banned_ids),
            dense_shape=[vocabulary_size]))

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample one next character for each input string.

    Args:
      inputs: string tensor; each element is split into UTF-8 characters.
      states: state handed to the wrapped model (None when nothing is
        carried over from a previous step).

    Returns:
      Tuple of (sampled characters, state returned by the model).
    """
    # Strings -> characters -> dense tensor of token ids.
    token_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # Logits come back as [batch, char, next_char_logits].
    logits, states = self.model(
        inputs=token_ids, states=states, return_state=True)

    # Keep the last position only, scale by temperature, then add the mask
    # so that "[UNK]" can never be drawn.
    last_logits = logits[:, -1, :] / self.temperature + self.prediction_mask

    # Draw one id per batch row and drop the sample axis.
    sampled_ids = tf.squeeze(
        tf.random.categorical(last_logits, num_samples=1), axis=-1)

    # Map the ids back to characters and hand back the model state.
    return self.chars_from_ids(sampled_ids), states
# Wrap the trained model and both lookup layers in a single-step generator
# (temperature left at its default of 1.0).
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [None]:
# Generate 10 characters after the prompt. Generation goes through
# one_step_model: generate_one_step is defined on OneStep, not on the Keras
# model held in loaded_model (calling it there raised AttributeError).
states = None
next_char = tf.constant(['eur/usd tomorrow will'])
result = [next_char]

# Feed each generated character back in together with the carried state.
for n in range(10):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

print(tf.strings.join(result)[0].numpy().decode("utf-8"))

AttributeError: ignored

In [None]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity

def most_similar(embedding, embeddings):
    """Return the row index of `embeddings` closest to `embedding`.

    Closeness is cosine similarity; `embedding` is reshaped to a single row
    before the comparison and the index of the largest similarity is returned.
    """
    query_row = embedding.reshape(1, -1)
    return np.argmax(cosine_similarity(query_row, embeddings))

# Reload the Keras model saved under ./saved_model and take the first weight
# array of its first layer as the embedding matrix.
# NOTE(review): presumably layers[0] is the Embedding layer — confirm.
saved_model_dir = './saved_model'
loaded_model = tf.keras.models.load_model(saved_model_dir)
embeddings = loaded_model.layers[0].get_weights()[0]

# Whitespace-separated cluster numbers; note this rebinds `text`, which
# earlier held the training corpus.
file_path = '/content/clustergigcutval.txt'
with open(file_path, "r") as f:
    text = f.read()

# Table whose 'Cluster' and 'avg(Close-Open)' columns are read by the loop below.
means12 = pd.read_csv('/content/meangig.csv')

numbers = text.split()
# Start offset into `numbers` for the first window.
n = 1000

# Counts processed windows; the loop below stops after 3.
iteration = 0

# For up to 3 windows of `numbers` (n advances by 100 per window): run 20
# generations whose prompt start is shifted by k, map the generated cluster ids
# to embeddings and to avg(Close-Open) values, and write the cumulative sums and
# the nearest-embedding indices to per-window CSV files.
while n + 50 < len(numbers) and iteration < 3:
    cumsum_outputs = []
    clusters_output = []
    for k in range(20):
        # Prompt: the tokens from n+k up to (not including) n+100.
        input_text = ' '.join(numbers[n + k:n + 100])
        states = None
        next_char = tf.constant([input_text])
        result = [next_char]

        # Generate 500 characters, feeding each one back in with the carried state.
        # one_step_reloaded1 is the object returned by tf.saved_model.load in an earlier cell.
        for i in range(500):
            next_char, states = one_step_reloaded1.generate_one_step(next_char, states=states)
            result.append(next_char)

        # Prompt plus generated characters as one Python string.
        output_text = tf.strings.join(result).numpy()[0].decode('utf-8')

        # NOTE(review): int() raises ValueError if the generated text contains
        # a whitespace-separated token that is not an integer.
        output_clusters = list(map(int, output_text.split()))

        output_clusters_first = output_clusters[:100]
        output_clusters_second = output_clusters[100:]

        # Collect embeddings for the first list: one embedding row per
        # character of each cluster number's decimal string.
        # NOTE(review): assumes char_to_idx indices line up with the rows of
        # `embeddings` — confirm.
        embedding_output_first = []
        for cluster in output_clusters_first:
            cluster_chars = str(cluster)
            for char in cluster_chars:
                char_idx = char_to_idx[char]
                embedding_output_first.append(embeddings[char_idx])

        # Collect embeddings for the second list in the same way.
        embedding_output_second = []
        for cluster in output_clusters_second:
            cluster_chars = str(cluster)
            for char in cluster_chars:
                char_idx = char_to_idx[char]
                embedding_output_second.append(embeddings[char_idx])



        # 2. Average the embeddings of the first 100 and of the remaining output_clusters elements
        avg_embedding_first_100 = np.mean(embedding_output_first, axis=0)
        avg_embedding_after_100 = np.mean(embedding_output_second, axis=0)

        # 3. Find the most similar embedding
        most_similar_first_100 = most_similar(avg_embedding_first_100, embeddings)
        most_similar_after_100 = most_similar(avg_embedding_after_100, embeddings)

        # 4. Use the index of the most similar embedding as the output cluster (saved to a file after the k loop)
        output_cluster_first_100 =  most_similar_first_100
        output_cluster_after_100 = most_similar_after_100

        clusters_output.append([output_cluster_first_100, output_cluster_after_100])

        # The input window runs to n+200, i.e. past the end of the prompt (n+100).
        input_clusters = list(map(int, ' '.join(numbers[n + k:n + 200]).split()))

        # Convert cluster numbers to Mean_Close_Open values
        # NOTE(review): .values[0] raises IndexError for an input cluster that
        # is missing from means12; only the output branch below falls back to 0.
        input_mean_close_open = [means12.loc[means12['Cluster'] == cluster, 'avg(Close-Open)'].values[0] for cluster in input_clusters]
        output_mean_close_open = []
        for cluster in output_clusters:
            cluster_data = means12.loc[means12['Cluster'] == cluster, 'avg(Close-Open)']
            if len(cluster_data.values) > 0:
                output_mean_close_open.append(cluster_data.values[0])
            else:
                output_mean_close_open.append(0)
                print(f"Кластер не найден: {cluster}")
        # Compute the cumulative-sum arrays
        cumsum_input = np.cumsum(np.array(input_mean_close_open))
        cumsum_output = np.cumsum(np.array(output_mean_close_open))

        # Only the unshifted (k == 0) input curve is kept for the CSV.
        if k == 0:
            first_cumsum_input = cumsum_input

        cumsum_outputs.append(cumsum_output)

    # Find the shortest array length
    min_length = min([len(first_cumsum_input)] + [len(cumsum_output) for cumsum_output in cumsum_outputs])

    # Truncate the arrays to the shortest length
    first_cumsum_input = first_cumsum_input[:min_length]
    cumsum_outputs = [cumsum_output[:min_length] for cumsum_output in cumsum_outputs]

    # Save the results to a CSV file
    results_data = {
        'cumsum_input': first_cumsum_input}
        
    for i, cumsum_output in enumerate(cumsum_outputs):
        results_data[f'cumsum_output_{i}'] = cumsum_output

    df = pd.DataFrame(results_data)

    # Shift each cumsum_output_X column forward by the required number of rows:
    # column i moves down by i rows (cumsum_output_0 stays in place).
    for i in range(1, len(df.columns)-1):
        df[f'cumsum_output_{i}'] = df[f'cumsum_output_{i}'].shift(i)

    df.to_csv(f'results_{iteration}.csv', index=False)

    print(f"Результаты сохранены в файл results_{iteration}.csv")
    df1 = pd.DataFrame(clusters_output, columns=['output_cluster_first_100', 'output_cluster_after_100'])

    # Save the DataFrame to a separate file
    df1.to_csv(f'clusters_output_{iteration}.csv', index=False)

    print(f"Значения output_cluster_first_100 и output_cluster_after_100 сохранены в файл clusters_output_{iteration}.csv")

    # Advance to the next window.
    n += 100
    iteration += 1

               


OSError: ignored

In [None]:
# Inspect output_clusters[:100] as left by the last pass of the generation loop.
output_clusters_first

In [None]:
# Inspect output_clusters[100:] as left by the last pass of the generation loop.
output_clusters_second

In [None]:
# Inspect the embedding rows collected for output_clusters_first.
embedding_output_first

In [None]:
# Inspect the embedding rows collected for output_clusters_second.
embedding_output_second

In [None]:
# NOTE(review): `array` is not assigned in any cell visible in this notebook —
# confirm where it is supposed to come from.
first_row = array[0]

In [None]:
# Inspect first_row (element 0 of `array`, taken in the previous cell).
first_row

In [None]:
# Inspect the mean of embedding_output_first (np.mean over axis 0).
avg_embedding_first_100

In [None]:
# Inspect the mean of embedding_output_second (np.mean over axis 0).
avg_embedding_after_100

In [None]:
# Inspect the embedding row index returned by most_similar for avg_embedding_first_100.
most_similar_first_100 

2

In [None]:
# Inspect the embedding row index returned by most_similar for avg_embedding_after_100.
most_similar_after_100

2

In [None]:
# Inspect the first 100 entries of embedding_output.
# NOTE(review): embedding_output is only assigned in a later cell, so this
# cell fails on a fresh top-to-bottom run.
embedding_output[:100]

In [None]:
# Inspect all of embedding_output.
# NOTE(review): embedding_output is only assigned in a later cell, so this
# cell fails on a fresh top-to-bottom run.
embedding_output

In [None]:
 # Inspect the integer cluster ids parsed from the last generated text.
 output_clusters

In [None]:
# One embedding row per generated cluster id, using the id itself as the row
# index and clamping ids past the end of the matrix to its last row.
embedding_output = [embeddings[min(cluster, len(embeddings)-1)] for cluster in output_clusters]

In [None]:
# Report how many embedding vectors were collected.
num_arrays = len(embedding_output)
print(f"Количество массивов в списке: {num_arrays}")

# Plot every array in the list as its own line, labelled with a 1-based number
for i, arr in enumerate(embedding_output):
    plt.plot(arr, label=f'Массив {i + 1}')

plt.xlabel('Индекс')
plt.ylabel('Значение')
plt.title('Визуализация массивов в списке')
plt.legend()
plt.show()

In [None]:

# Disabled cell: everything below is one triple-quoted string literal, so none
# of it executes. The text is a variant of the windowed generation / cumsum
# loop without the embedding-similarity step.
"""
import os
import pandas as pd
import numpy as np

import tensorflow as tf
import matplotlib.pyplot as plt


file_path = '/content/clustergigcutval.txt'
with open(file_path, "r") as f:
    text = f.read()

# Загрузка файла means12.csv
means12 = pd.read_csv('/content/meangig.csv')

# Разделение текста на номера и создание списка
numbers = text.split()
n = 1000

iteration = 0

while n + 50 < len(numbers) and iteration < 3:
    cumsum_outputs = []

    for k in range(20):  # Измените это значение для изменения количества колонок cumsum_output_X
        input_text = ' '.join(numbers[n + k:n + 100])
        states = None
        next_char = tf.constant([input_text])
        result = [next_char]

        for i in range(500):
            next_char, states = one_step_reloaded1.generate_one_step(next_char, states=states)
            result.append(next_char)

        output_text = tf.strings.join(result).numpy()[0].decode('utf-8')

        # Преобразование номеров кластеров в значения Mean_Close_Open
        input_clusters = list(map(int, ' '.join(numbers[n + k:n + 200]).split()))
        output_clusters = list(map(int, output_text.split()))

        input_mean_close_open = [means12.loc[means12['Cluster'] == cluster, 'avg(Close-Open)'].values[0] for cluster in input_clusters]
        output_mean_close_open = []
        for cluster in output_clusters:
            cluster_data = means12.loc[means12['Cluster'] == cluster, 'avg(Close-Open)']
            if len(cluster_data.values) > 0:
                output_mean_close_open.append(cluster_data.values[0])
            else:
                output_mean_close_open.append(0)
                print(f"Кластер не найден: {cluster}")

        # Вычисление cumsum массива
        cumsum_input = np.cumsum(np.array(input_mean_close_open))
        cumsum_output = np.cumsum(np.array(output_mean_close_open))

        if k == 0:
            first_cumsum_input = cumsum_input

        cumsum_outputs.append(cumsum_output)

    # Найти наименьшую длину массива
    min_length = min([len(first_cumsum_input)] + [len(cumsum_output) for cumsum_output in cumsum_outputs])

    # Обрезать массивы до наименьшей длины
    first_cumsum_input = first_cumsum_input[:min_length]
    cumsum_outputs = [cumsum_output[:min_length] for cumsum_output in cumsum_outputs]

    # Сохранение результатов в файл CSV
    results_data = {
        'cumsum_input': first_cumsum_input,
    }
    for i, cumsum_output in enumerate(cumsum_outputs):
        results_data[f'cumsum_output_{i}'] = cumsum_output
    
    df = pd.DataFrame(results_data)
        # Сдвиг колонок cumsum_output_X на нужное количество строк вперед
    for i in range(1, len(df.columns)-1):
        df[f'cumsum_output_{i}'] = df[f'cumsum_output_{i}'].shift(i)

    df.to_csv(f'results_{iteration}.csv', index=False)

    print(f"Результаты сохранены в файл results_{iteration}.csv")

    n += 100
    iteration += 1
    """

KeyboardInterrupt: ignored

In [None]:
# Disabled cell: the whole body is a single triple-quoted string literal, so
# nothing in it runs. The text is a one-prompt-per-iteration variant of the
# generation loop that also plots the input and output cumulative sums.
"""
import os
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

file_path = '/content/clustergigcutval.txt'
with open(file_path, "r") as f:
    text = f.read()

# Загрузка файла means12.csv
means12 = pd.read_csv('/content/meangig.csv')

# Разделение текста на номера и создание списка
numbers = text.split()
n = 0

# Создание датафрейма
df = pd.DataFrame(columns=['input_text', 'output_text'])
n=10000

iteration=0

while n + 50 < len(numbers) and iteration < 20:
    input_text = ' '.join(numbers[n:n + 100])
    states = None
    next_char = tf.constant([input_text])
    result = [next_char]

    for i in range(500):
        next_char, states = one_step_reloaded1.generate_one_step(next_char, states=states)
        result.append(next_char)

    output_text = tf.strings.join(result).numpy()[0].decode('utf-8')
    df = df.append({'input_text': input_text, 'output_text': output_text}, ignore_index=True)

    # Преобразование номеров кластеров в значения Mean_Close_Open
    input_clusters = list(map(int, ' '.join(numbers[n:n + 200]).split()))  # Используется для вычисления input_clusters
    output_clusters = list(map(int, output_text.split()))

    input_mean_close_open = [means12.loc[means12['Cluster'] == cluster, 'avg(Close-Open)'].values[0] for cluster in input_clusters]
    output_mean_close_open = []
    for cluster in output_clusters:
        cluster_data = means12.loc[means12['Cluster'] == cluster, 'avg(Close-Open)']
        if len(cluster_data.values) > 0:
            output_mean_close_open.append(cluster_data.values[0])
        else:
            output_mean_close_open.append(0)
            print(f"Кластер не найден: {cluster}")

    # Вычисление cumsum массива
    cumsum_input = np.cumsum(np.array(input_mean_close_open))
    cumsum_output = np.cumsum(np.array(output_mean_close_open))

    # Отображение графика cumsum
    plt.plot(cumsum_input, label='Input')
    plt.plot(cumsum_output, label='Output')

    plt.xlabel('Element Index')
    plt.ylabel('Cumulative Sum')
    plt.title(f'Cumulative Sum of Mean_Close_Open for Input and Output (n={n})')
    plt.legend()
    plt.show()
    plt.pause(1)  # Задержка на 1 секунду
    plt.close()  # Закрытие текущего графика

    n += 1
    iteration += 1
# Сохранение результатов в файл CSV
df.to_csv('results4411.csv', index=False)

print("Результаты сохранены в файл results1.csv")
"""

SystemError: ignored