In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
import tensorflow_datasets as tfds
import pandas as pd

# Load the IMDB reviews corpus as (text, label) pairs, together with dataset metadata
(train_data, test_data), info = tfds.load(
    name="imdb_reviews",
    split=("train", "test"),
    with_info=True,
    as_supervised=True,
)

# Text-vectorization and input-pipeline settings
BUFFER_SIZE = 10000   # buffer size passed to Dataset.shuffle
BATCH_SIZE = 64
MAX_TOKENS = 10000    # maximum number of words in the vocabulary
MAX_SEQ_LEN = 256     # maximum sequence length

encoder = layers.TextVectorization(
    output_sequence_length=MAX_SEQ_LEN,
    max_tokens=MAX_TOKENS,
)

# Fit the encoder's vocabulary on the training texts (labels are dropped first)
train_text = train_data.map(lambda review, _label: review)
encoder.adapt(train_text)


def preprocess(text, label):
    """Return the encoder's output for `text`, paired with the unchanged `label`."""
    token_ids = encoder(text)
    return token_ids, label


# Training pipeline: vectorize -> shuffle -> batch -> prefetch
train_data = (
    train_data
    .map(preprocess)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# Test pipeline: same steps without shuffling
test_dataMAP = (
    test_data
    .map(preprocess)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# Build the model one layer at a time:
# embedding -> two stacked bidirectional LSTMs -> dense head with a sigmoid output
model = Sequential()
model.add(layers.Embedding(input_dim=MAX_TOKENS, output_dim=128, mask_zero=True))
model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(32)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Show the model summary
model.summary()

# Train the model.
# NOTE(review): the validation metrics here are computed on the test split.
history = model.fit(train_data, epochs=10, validation_data=test_dataMAP)

# Evaluate the model on the test split
results = model.evaluate(test_dataMAP)
print(f"Test Loss: {results[0]:.4f}, Test Accuracy: {results[1]:.4f}")

In [4]:
# Reload the raw splits and collect the first 50 training reviews into a DataFrame for display
(train_data, test_data), info = tfds.load(
    name="imdb_reviews",
    split=("train", "test"),
    with_info=True,
    as_supervised=True,
)


def _review_to_record(text, label):
    """Turn one (text, label) tensor pair into a row dict with a readable sentiment name."""
    sentiment = "Positive" if label.numpy() == 1 else "Negative"
    return {"Review": text.numpy().decode('utf-8'), "Label": sentiment}


train_df = pd.DataFrame(
    [_review_to_record(*example) for example in train_data.take(50)]
)
# Optional: wrap long reviews when rendering the table
# train_df.style.set_properties(**{'white-space': 'pre-wrap'})

In [None]:

## Optimization
# Tunable pipeline parameters
BUFFER_SIZE = 10000   # buffer size passed to Dataset.shuffle
BATCH_SIZE = 64
MAX_TOKENS = 10000    # vocabulary size cap for the TextVectorization layer
MAX_SEQ_LEN = 256     # output length of every vectorized review
VAL_FRACTION = 0.1    # share of the training split held out for validation

# `train_data` is rebound to the vectorized pipeline further down, so this cell needs the
# raw (text, label) split as input. Fail with a clear message when it is re-run on its
# own output instead of failing obscurely inside `encoder.adapt`.
if train_data.element_spec[0].dtype != tf.string:
    raise RuntimeError(
        "train_data is already vectorized; re-run the data-loading cell before this one."
    )

# Hold out part of the training split for validation so that early stopping and
# best-weight restoration never see the test split; the test split is used only for the
# final evaluation. take/skip give a stable partition only while the dataset order is
# stable, i.e. as long as tfds.load is called without shuffle_files=True.
num_val = int(VAL_FRACTION * info.splits["train"].num_examples)
val_raw = train_data.take(num_val)
train_raw = train_data.skip(num_val)

encoder = tf.keras.layers.TextVectorization(
    max_tokens=MAX_TOKENS, output_sequence_length=MAX_SEQ_LEN
)

# Fit the encoder's vocabulary on the training portion only (not on validation or test)
train_text = train_raw.map(lambda text, label: text)
encoder.adapt(train_text)


def preprocess(text, label):
    """Return the encoder's output for `text`, paired with the unchanged `label`."""
    return encoder(text), label


train_data = train_raw.map(preprocess).shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_dataMAP = val_raw.map(preprocess).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataMAP = test_data.map(preprocess).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Build the model
model = Sequential([
    # `input_length` is omitted: it is deprecated in Keras 3, and the input shape is
    # fixed explicitly by `model.build` below.
    layers.Embedding(input_dim=MAX_TOKENS, output_dim=128, mask_zero=True),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)),
    layers.Bidirectional(layers.LSTM(32, dropout=0.4, recurrent_dropout=0.4)),
    layers.BatchNormalization(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Model summary (build first so the summary can report output shapes and parameter counts)
model.build(input_shape=(None, MAX_SEQ_LEN))
model.summary()

# EarlyStopping configuration: stop after 3 epochs without val_loss improvement and
# roll back to the best weights seen
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model, validating on the held-out slice of the training split
history = model.fit(
    train_data,
    validation_data=val_dataMAP,
    epochs=10,
    callbacks=[early_stopping]
)

# Evaluate the model on the untouched test split
results = model.evaluate(test_dataMAP)
print(f"Test Loss: {results[0]:.4f}, Test Accuracy: {results[1]:.4f}")


In [None]:
import pandas as pd

# Unpack the raw test split: decoded review text and ground-truth label per example
original_texts = []
true_labels = []
for text, label in test_data:
    original_texts.append(text.numpy().decode('utf-8'))
    true_labels.append(label.numpy())

# Unpack the vectorized test split example by example (batches are undone first)
processed_texts = []
processed_labels = []
for text_vector, label in test_dataMAP.unbatch():
    processed_texts.append(text_vector.numpy())
    processed_labels.append(label.numpy())

# The raw texts, vectorized texts and predictions come from separate passes over the data
# and are joined purely by position below. Verify that the passes are aligned rather
# than silently pairing reviews with the wrong labels/predictions.
if processed_labels != true_labels:
    raise RuntimeError(
        "test_data and test_dataMAP yield examples in different order; "
        "rebuild test_dataMAP from the current test_data."
    )

# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels
predicted_probs = model.predict(test_dataMAP)
predicted_labels = (predicted_probs > 0.5).astype("int32").flatten()

df = pd.DataFrame({
    'Original Text': original_texts,
    'Processed Text': processed_texts,
    'True Label': true_labels,
    'Predicted Label': predicted_labels
})

print(df.head())
df.to_csv("plik.csv", index=False)

In [None]:
# Show a single row of the results table, with long review text wrapped for reading
pd.set_option('display.max_colwidth', 300)

ROW_LABEL = 15339  # index label of the row to inspect
shown_columns = ['Original Text', 'True Label', 'Predicted Label']

row_as_table = df.loc[[ROW_LABEL], shown_columns]
row_as_table.style.set_properties(**{'white-space': 'pre-wrap'})

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


# Confusion matrix of true vs. predicted labels
cm = confusion_matrix(true_labels, predicted_labels)


def plot_confusion_matrix(cm):
    """Draw `cm` as an annotated heatmap with Negative/Positive tick labels and show it."""
    class_names = ['Negative', 'Positive']
    _, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    plt.show()


plot_confusion_matrix(cm)

def plot_training_history(history):
    """Plot per-epoch training and validation curves for loss and accuracy side by side.

    Reads the 'loss', 'val_loss', 'accuracy' and 'val_accuracy' series from
    `history.history` and shows the resulting two-panel figure.
    """
    # One entry per panel: (history key, training label, validation label, title, y-axis label)
    panels = [
        ('loss', 'Training Loss', 'Validation Loss', 'Strata podczas nauki', 'Loss'),
        ('accuracy', 'Training Accuracy', 'Validation Accuracy', 'Dokładność podczas nauki', 'Accuracy'),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    for ax, (key, train_label, val_label, title, y_label) in zip(axes, panels):
        ax.plot(history.history[key], label=train_label)
        ax.plot(history.history['val_' + key], label=val_label)
        ax.set_title(title)
        ax.set_xlabel('Epochs')
        ax.set_ylabel(y_label)
        ax.legend()

    fig.tight_layout()
    plt.show()


plot_training_history(history)