In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import random
from tensorflow.keras.layers import TextVectorization
import re





In [2]:
def _clean_title(raw_title):
    """Normalize one raw title: lowercase, drop digits/punctuation, tidy spaces."""
    cleaned = str(raw_title).lower()
    cleaned = re.sub(r'[0-9]', '', cleaned)
    cleaned = re.sub(r'[^\w\s]', '', cleaned)
    # Strip last: removing digits/punctuation can expose leading or trailing
    # spaces (e.g. "halloween 2" -> "halloween ").
    return re.sub(r'\s+', ' ', cleaned).strip()


df = pd.read_csv("horror_movies.csv")
# Missing titles are read as NaN (a float), which has no .lower(); skip them
# instead of crashing half-way through the loop.
titles = [_clean_title(raw_title) for raw_title in df["title"].dropna()]

In [3]:
# Word-level vectorizer: lowercases, strips punctuation, splits on whitespace
# and maps every word to an integer id.
global_vectorizer = TextVectorization(
    output_mode='int',
    split='whitespace',
    standardize='lower_and_strip_punctuation',
)

# Learn the vocabulary from the cleaned titles.
global_vectorizer.adapt(titles)

# `vocab` is the id -> word lookup table; its length sizes the embedding
# input and the softmax output of the model.
vocab = global_vectorizer.get_vocabulary()
vocab_size = len(vocab)





In [4]:
# Vectorize every title in ONE batched call instead of one layer call per
# title. The batched output is zero-padded to the longest title, and the
# `i != 0` filter below removes that padding again, so each row yields the
# same words as vectorizing the title on its own.
token_matrix = global_vectorizer(tf.constant(titles, dtype=tf.string)).numpy()

# Keep only titles with more than two words; shorter ones give too little
# context for next-word training.
filtered_titles = []
for tokens in token_matrix:
    words = [vocab[i] for i in tokens if i != 0]
    if len(words) > 2:
        filtered_titles.append(words)

In [5]:
# Build the word -> id lookup once. `vocab.index(w)` rescans the whole
# vocabulary for every word, which is quadratic over the dataset.
# `setdefault` keeps the first occurrence, matching `list.index` semantics.
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index.setdefault(word, index)

# For every title emit all prefixes of length >= 2. After padding, the last
# id of each prefix is the prediction target and the rest is the context.
data_sequences = []
for words in filtered_titles:
    indices = [word_to_index[w] for w in words]
    data_sequences.extend(indices[:end] for end in range(2, len(indices) + 1))

In [6]:
# The longest prefix sets the common width; shorter prefixes are left-padded
# with zeros so the final column always holds the last real token.
max_len = max(map(len, data_sequences))
data_sequences = pad_sequences(data_sequences, padding='pre', maxlen=max_len)

In [8]:
# Context = every column but the last; target = the last column (next word).
X, y = data_sequences[:, :-1], data_sequences[:, -1]

In [9]:
# Next-word model: embedding -> two stacked LSTMs -> dense head with a
# softmax over the whole vocabulary. Dropout follows every trainable stage.
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_len-1))
# First LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(vocab_size, activation='softmax'))

In [10]:
# Targets are integer word ids (not one-hot), hence the sparse loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)




In [11]:
# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

In [12]:
# Multiply the learning rate by 0.7 after 5 epochs without validation-loss
# improvement, never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.7,
    patience=5,
    min_lr=1e-6,
    verbose=1,
)

In [13]:
# Train with 15% of the samples held out for validation; both callbacks
# monitor the validation loss.
history = model.fit(
    X,
    y,
    epochs=50,
    batch_size=16,
    validation_split=0.15,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

Epoch 1/50

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0007000000332482159.
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Restoring model weights from the end of the best epoch: 1.s: 5.3615 - accuracy: 0.1824  

Epoch 11: ReduceLROnPlateau reducing learning rate to 0.0004900000232737511.
Epoch 11: early stopping


In [14]:
def sample_with_temperature(preds, temperature=1.0):
    """Sample an index from a probability vector after temperature scaling.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Non-negative scaling factor. Values below 1 sharpen the
            distribution, values above 1 flatten it, and 0 means greedy
            decoding (the argmax is returned without sampling).

    Returns:
        int: The sampled index into ``preds``.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    probs = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of temperature -> 0; avoids dividing by zero below.
        return int(np.argmax(probs))

    # The epsilon keeps log() finite for zero-probability entries.
    logits = np.log(probs + 1e-8) / temperature
    # Shift by the maximum before exponentiating. The resulting distribution
    # is mathematically unchanged, but the largest term becomes exp(0) == 1,
    # so the normalizer can no longer underflow to 0 at small temperatures.
    logits -= np.max(logits)
    exp_logits = np.exp(logits)
    scaled = exp_logits / np.sum(exp_logits)
    return int(np.random.choice(len(scaled), p=scaled))

In [15]:
def generate_title(seed_text="", min_length=3, max_length=8, temperature=0.8):
    """Generate a title word by word with the trained next-word model.

    Args:
        seed_text: Whitespace-separated starting words. When empty, one of a
            few built-in seed words is chosen at random.
        min_length: Once the title has at least this many words, every newly
            added word is followed by a 30% chance of stopping.
        max_length: Upper bound on the number of words in the title.
        temperature: Sampling temperature passed to
            ``sample_with_temperature``.

    Returns:
        str: The generated words joined by single spaces. Generation also
        stops early, possibly before ``min_length``, when the model samples
        the padding token, the out-of-vocabulary token, or a word that is
        already in the title.
    """
    if not seed_text:
        seed_words = ["night", "dark", "blood", "evil", "dead", "horror", "the"]
        seed_text = random.choice(seed_words)
    generated_words = seed_text.split()

    # One dict build per call instead of two list scans per word per step.
    # `setdefault` keeps the first occurrence, matching `list.index`.
    word_to_index = {}
    for index, word in enumerate(vocab):
        word_to_index.setdefault(word, index)

    # The vocabulary comes from TextVectorization, whose padding token is ''
    # and whose out-of-vocabulary token is '[UNK]'. '<OOV>' (the legacy
    # Tokenizer convention) is kept only for backward compatibility.
    invalid_words = {'', '[UNK]', '<OOV>'}

    for _ in range(max_length - len(generated_words)):
        # Unknown seed words fall back to id 1, as in the original lookup.
        token_list = [word_to_index.get(w, 1) for w in generated_words]
        token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = sample_with_temperature(predicted[0], temperature)
        output_word = vocab[predicted_word_index] if predicted_word_index < len(vocab) else ''
        if output_word in invalid_words or output_word in generated_words:
            break
        generated_words.append(output_word)
        if len(generated_words) >= min_length and random.random() > 0.7:
            break
    return " ".join(generated_words)

In [16]:
def generate_multiple_titles(num_titles=5, temperature=0.8, title_fn=None):
    """Collect up to ``num_titles`` generated titles of at least two words.

    Titles with fewer than two words are rejected and generation is retried,
    so the result is no longer silently shorter than requested. Retries are
    capped at ``10 * num_titles`` calls in total, so a generator that keeps
    producing short titles cannot loop forever.

    Args:
        num_titles: Number of titles wanted.
        temperature: Sampling temperature forwarded to the generator.
        title_fn: Optional callable invoked as ``title_fn(temperature=...)``
            that returns a title string. Defaults to ``generate_title``.

    Returns:
        list[str]: At most ``num_titles`` titles, in generation order.
    """
    if title_fn is None:
        title_fn = generate_title

    collected = []
    max_attempts = max(num_titles, 0) * 10
    for _ in range(max_attempts):
        if len(collected) >= num_titles:
            break
        candidate = title_fn(temperature=temperature)
        if len(candidate.split()) >= 2:
            collected.append(candidate)
    return collected

In [18]:
# Sample ten titles at a slightly conservative temperature and print them
# as a numbered list starting at 1.
generated_titles = generate_multiple_titles(10, temperature=0.7)
for i, title in enumerate(generated_titles, start=1):
    print(f"{i}. {title}")

1. dead love treat
2. blood of the house
3. blood of the dead
4. dead of man the
5. blood the for us of
6. blood of the
7. dark moonlight of
8. night of the clown
9. evil of the
10. blood white the
