# IMDB Prediction

Here we try to predict the IMDB Rating based on the episode descriptions

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from keras.models import Sequential
from keras import layers
import keras

2025-01-29 15:50:24.571942: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738162224.590058   42659 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738162224.595179   42659 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-29 15:50:24.611813: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Import Description and IMDB Dataset

In [2]:
PATH_DESCRIPTIONS_CLEAN = "/home/anton/Anton/Studium/DHBW /Semester_5/Text_Analysis/repo/Burg-NLP/topic_modeling/data/episodes_description.csv"
PATH_IMDB_RATINGS_CLEAN = "/home/anton/Anton/Studium/DHBW /Semester_5/Text_Analysis/repo/Burg-NLP/data/episodes.json"


df_descriptions = pd.read_csv(PATH_DESCRIPTIONS_CLEAN)
df_imdb_ratings = pd.read_json(PATH_IMDB_RATINGS_CLEAN , orient="index")


1     7.9
2     8.6
3     8.3
4     8.6
5     9.0
     ... 
67    7.8
68    6.4
69    4.5
70    7.9
71    9.4
Name: rating, Length: 71, dtype: float64

### Add Ratings to episode description dataset

In [69]:
df_descriptions["rating"] = pd.Series(list(df_imdb_ratings["rating"]))
df = df_descriptions
df = df.dropna()

Unnamed: 0.1,Unnamed: 0,title,text,rating
0,0,Pilot,middle night obviously drunk rick bursts morty...,7.9
1,1,Lawnmower Dog,jerry complains family dog snuffles stupid r...,8.6
2,2,Anatomy Park (Episode),"christmas jerry tries enforce idea "" human hol...",8.3
3,3,M. Night Shaym-Aliens!,episode opens rick dissecting large rat garage...,8.6
4,4,Meeseeks and Destroy,mr meeseeks existing solve beth ’s problemafte...,9.0
...,...,...,...,...
66,66,Rickfending Your Mort,morty finds rick drunken stupor floor garage s...,7.8
67,67,Wet Kuat Amortican Summer,"rick ’s chores ( including cleaning another "" ...",6.4
68,68,Rise of the Numbericons: The Movie,"episode starts exact way post credits scene "" ...",4.5
69,69,Mort: Ragnarick,jerry white void golden light appear front nan...,7.9


### Create label for binary classification

In [4]:
df["evaluation"] = (df["rating"] > 8.2).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["evaluation"] = (df["rating"] > 8.2).astype(int)


### Label Distribution

In [5]:
df["evaluation"].value_counts()

evaluation
1    36
0    35
Name: count, dtype: int64

### Train Test Val Split

In [6]:
train, val, test = np.split(df.sample(frac=1), [int(0.8*len(df)), int(0.9*len(df))])
train

  return bound(*args, **kwds)


Unnamed: 0.1,Unnamed: 0,title,text,rating,evaluation
56,56,JuRicksic Mort,morty late school asks rick help arrive time r...,7.8,0
60,60,Ricktional Mortpoon's Rickmas Mortcation,christmas day uncharacteristically generous ri...,8.0,0
37,37,Promortyus,morty accidentally freed facehugger controllin...,8.0,0
31,31,Edge of Tomorty: Rick Die Rickpeat,morty ’s ideal death jessica old age.the smith...,8.9,1
51,51,Solaricks,getting stranded remains citadel events previo...,8.5,1
1,1,Lawnmower Dog,jerry complains family dog snuffles stupid r...,8.6,1
15,15,Get Schwifty (episode),massive alien head appears earth interfering e...,8.2,0
50,50,Rickmurai Jack,"rick fully immersed "" rick two crows "" show ...",9.3,1
27,27,The Ricklantis Mixup,rick morty prepare go atlantis interrupted ric...,9.8,1
47,47,Gotron Jerrysis Rickvangelion,rick summer morty way boob world rick notice...,6.3,0


In [71]:
test["evaluation"].value_counts()

evaluation
1    5
0    3
Name: count, dtype: int64

In [72]:
val["evaluation"].value_counts()

evaluation
1    4
0    3
Name: count, dtype: int64

### Create Features and Label for Binary Classification and for Regression

In [7]:
X_train = train["text"]
X_val = val["text"]
X_test = test["text"]

Y_train = train["evaluation"]
Y_val = val["evaluation"]
Y_test = test["evaluation"]

Y_train_reg = train["rating"] * 100
Y_val_reg = val["rating"] * 100
Y_test_reg = test["rating"] * 100


print(Y_test_reg)

70    940.0
5     910.0
48    820.0
52    780.0
3     860.0
11    880.0
16    930.0
63    680.0
Name: rating, dtype: float64


### Spacy Pipeline


In [8]:
import spacy
from spacy.language import Language
from spacy.tokens import Doc

In [9]:
nlp = spacy.load("en_core_web_sm")


### Remove Lemmatizer to not confuse the model

In [10]:
nlp.pipeline
nlp.remove_pipe("lemmatizer")

('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fbfa1ea3780>)

### Add Component to lower Doc and Return String

In [11]:
@Language.component("to_lower")
def custom_to_lower(doc):
    valid_doc = [token.text.lower()  for token in doc]
    valid_doc_pos = [token.pos_   for token in doc ]

    #return Doc(nlp.vocab , words=valid_doc , pos=valid_doc_pos)
    return valid_doc

In [12]:
nlp.add_pipe("to_lower", after="ner")


<function __main__.custom_to_lower(doc)>

In [13]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fbfa1e97f40>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fbfa1e97d00>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fbee0598ba0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fbfa1ec2200>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fbee0598c80>),
 ('to_lower', <function __main__.custom_to_lower(doc)>)]

### Method to tokenize a whole Series of Strings to List of STrings

In [14]:
def tokenize_list( input_list : pd.Series):
    return list(nlp.pipe(input_list))
    

In [74]:
# def df_to_dataset(dataframe, shuffle=True, batch_size=1024):
#   df = dataframe.copy()
#   labels = df.pop('evaluation')
#   df = df["text"]
#   ds = tf.data.Dataset.from_tensor_slices((df, labels))
#   if shuffle:
#     ds = ds.shuffle(buffer_size=len(dataframe))
#   ds = ds.batch(batch_size)
#   ds = ds.prefetch(tf.data.AUTOTUNE)
#   return ds

In [16]:
# train_data = df_to_dataset(train)
# valid_data = df_to_dataset(val)
# test_data = df_to_dataset(test)
# train_data

In [17]:
# from sklearn.feature_extraction.text import CountVectorizer

# vectorizer = CountVectorizer()
# vectorizer.fit(train["text"])

# X_train = vectorizer.transform(train["text"])
# X_test  = vectorizer.transform(test["text"])
# Y_train = train["evaluation"]
# Y_valid = val["evaluation"]
# Y_test = test["evaluation"]

In [18]:
# # label encode the target variable 
# encoder = preprocessing.LabelEncoder()
# train_y = encoder.fit_transform(train_y)
# valid_y = encoder.fit_transform(valid_y)

## Word2Vec embedding Model

### Corpus for all tokens in each text

In [19]:
corpus = tokenize_list(df["text"])
corpus = [x for x in corpus]

In [20]:
corpus[1]

['  ',
 'jerry',
 'complains',
 'family',
 'dog',
 'snuffles',
 'stupid',
 'rick',
 'gives',
 'jerry',
 'device',
 'enhances',
 'dog',
 '’s',
 'intelligence',
 'morty',
 'go',
 'dreams',
 'morty',
 '’s',
 'math',
 'teacher',
 'persuade',
 'give',
 'morty',
 '"',
 '"',
 'class',
 'morty',
 'assigned',
 'less',
 'homework',
 'go',
 'morty',
 '’s',
 'math',
 'teacher',
 '’s',
 'dreams',
 'dreams',
 'strong',
 'black',
 'female',
 'tv',
 'character',
 'named',
 'mrs',
 'pancakes',
 'centaur',
 '’s',
 'dreams',
 'results',
 'rick',
 'morty',
 'eventually',
 'encountering',
 'scary',
 'terry',
 '(',
 'parody',
 'freddy',
 'krueger',
 'villain',
 'ina',
 'nightmare',
 'elm',
 'street',
 ')',
 'scary',
 'terry',
 'begins',
 'chasing',
 'two',
 'multiple',
 'dreams',
 'causing',
 'rick',
 'become',
 'unhinged.at',
 'one',
 'point',
 'two',
 'come',
 'across',
 'dream',
 'post',
 'apocalyptic',
 'city',
 'decide',
 'hide',
 'scary',
 'terry',
 'gets',
 'tired',
 'scary',
 'terry',
 'eventually',

### Create Word2Vec Model with All Data

In [21]:
import gensim
w2v_model = gensim.models.Word2Vec(corpus, min_count=5,
                                 vector_size=100, window=5)

### Print All Keys of Model to verify Results

In [22]:
print(list(w2v_model.wv.key_to_index.keys()))



### Tesing

In [23]:
print(w2v_model.wv.similarity('birdperson', 'planet'))

0.9997021


In [24]:
print(w2v_model.wv["rick"])

[-0.727202    0.62608945 -0.1386765   0.36537522  0.0585373  -0.9544966
  0.05284276  1.3044839  -0.28951487 -0.5478123  -0.25373077 -0.9001071
 -0.17683557  0.4536273   0.35818505 -0.46485695  0.09589595 -0.47621137
 -0.21946847 -1.1508756   0.7980473   0.75792724  0.44852173 -0.3113449
 -0.23688167 -0.15163186 -0.9829633  -0.52889484 -0.56268424  0.2082649
  1.0506023   0.06553474  0.20429268 -0.8318434  -0.1859098   0.70889425
 -0.25388512 -0.77442485 -0.51421005 -0.85655415  0.5625726  -0.8074664
 -0.47160897 -0.08017732  0.6685002  -0.40894485 -0.10854598 -0.32836592
  0.5208163   0.359428    0.54980767 -0.68031996 -0.90321827 -0.55596274
 -0.5002239   0.3300627   0.56145513  0.10184269 -0.71380335  0.69366866
  0.37864023  0.39290017  0.16622724 -0.36072493 -0.73252916  0.87373006
  0.5733215   0.41138947 -0.9349678   0.91201633 -0.16686253  0.404694
  0.8593446  -0.3032137   0.98320585  0.13456495  0.19780286  0.10214608
 -0.40568897  0.01089427 -0.555665   -0.00437387 -0.504717

### Method to Calculate Average Vector for a tokenized sentence

In [76]:
def calculate_average_vector(array , model):
    counter = 0
    vector = np.zeros(shape=(100,))
    for doc in array:
        try:
            vector += np.array(model.wv[doc])
            counter +=1
        except:
            continue
    return vector / counter


## Normal Neural Network Classification

In [26]:
from keras.layers import Dense

In [27]:
model = Sequential()
model.add(Dense(100, input_shape=(100,) , activation='relu'))
model.add(Dense(70, activation='relu'))
model.add(Dense(70, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-01-29 15:50:32.263317: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [28]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

### Embedd All Train Descriptions 

In [30]:
embedded_x_train = []


for x_data in X_train:
    corpus = tokenize_list([x_data])
    corpus = [str(x) for x in corpus]
    embedded_x_train.append(calculate_average_vector(corpus[0] , w2v_model))

embedded_x_train

[array([-0.16009254,  0.13286437, -0.03073573,  0.07860217,  0.01774951,
        -0.19627577,  0.00350824,  0.26739219, -0.06276883, -0.11629569,
        -0.04968518, -0.18848278, -0.03885357,  0.08774078,  0.07619349,
        -0.10378878,  0.02924772, -0.10242294, -0.04391629, -0.24286667,
         0.1621238 ,  0.15561845,  0.09571472, -0.0671634 , -0.05367709,
        -0.02374439, -0.19980565, -0.11414833, -0.11606415,  0.04026586,
         0.22498147,  0.00766765,  0.03429482, -0.17498815, -0.03572261,
         0.14615613, -0.04721212, -0.1583821 , -0.10677691, -0.18056401,
         0.1214667 , -0.16325517, -0.0894105 , -0.01512203,  0.14485787,
        -0.08679867, -0.02644179, -0.07649928,  0.10807732,  0.07615321,
         0.11230369, -0.14418029, -0.18487105, -0.11255212, -0.10690133,
         0.07129862,  0.11672599,  0.02666253, -0.1446929 ,  0.13637363,
         0.08253684,  0.08007813,  0.03120731, -0.07635919, -0.15341105,
         0.17739542,  0.1159396 ,  0.08976842, -0.2

### Fit Model

In [31]:
model.fit(np.array(embedded_x_train), np.array(Y_train), epochs=150, batch_size=10)

Epoch 1/150
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5530 - loss: 0.6921  
Epoch 2/150
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5218 - loss: 0.6946 
Epoch 3/150
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5301 - loss: 0.6925 
Epoch 4/150
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4546 - loss: 0.6954 
Epoch 5/150
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5615 - loss: 0.6907 
Epoch 6/150
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4937 - loss: 0.6968 
Epoch 7/150
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4844 - loss: 0.6948 
Epoch 8/150
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4763 - loss: 0.6949 
Epoch 9/150
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x7fbedb3a6a70>

### Function that predicts Evaluation by String

In [32]:
def test_string_model(input_str , model , emb_model):
    docs = tokenize_list([input_str])[0]
    print(docs)
    counter = 0
    vector = np.zeros(shape=(100,))
    avrg_vector = np.zeros(shape=(100,))
    for doc in docs:
        try:
            vector += np.array(emb_model.wv[doc])
            counter +=1

        except:
            continue

    avrg_vector = vector / counter
    
    vector_with_batch = np.expand_dims(avrg_vector, axis=0)
    
    return model.predict(vector_with_batch)
    


### test Function

In [33]:
test_string_model("angela Merkel" , model , w2v_model)

['angela', 'merkel']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step


  avrg_vector = vector / counter


array([[nan]], dtype=float32)

## LSTM Model Classification

### Size of Embedding Vector in w2vmodel

In [39]:
EMBEDDING_DIM = 100

### Embedd a whole sentence function

In [40]:
MAX_LENGTH = 500
def sentence_to_embedding(sentence , embedding_model):
    temp_embedding_matrix = np.zeros((MAX_LENGTH, EMBEDDING_DIM))  # Leere Matrix für Embeddings
    for i, word in enumerate(sentence[:MAX_LENGTH]):  # Maximal 5 Wörter
        if word in embedding_model.wv:
            temp_embedding_matrix[i] = embedding_model.wv[word]  # Wortvektor einfügen
    return temp_embedding_matrix

### Prepare Test and Train Data

In [None]:
X_train_lstm = [sentence_to_embedding(x , w2v_model) for x in X_train ]
X_test_lstm = [sentence_to_embedding(x , w2v_model) for x in X_test ]


### LSTM Model Architecture

In [41]:

#embedding_vector_features=45

ltsm_model=Sequential()

ltsm_model.add(layers.Masking(mask_value=0.0, input_shape=(MAX_LENGTH, EMBEDDING_DIM)))

ltsm_model.add(layers.LSTM(128,activation='relu',return_sequences=False))

ltsm_model.add(layers.Dropout(0.2))
ltsm_model.add(layers.Dense(1, activation="sigmoid"))


56


  super().__init__(**kwargs)


### Compile Model

In [42]:
ltsm_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

### Fit LSTM Model

In [44]:
ltsm_model.fit(np.array(X_train_lstm), np.array(Y_train).reshape(-1, 1), epochs=30, batch_size=10)

Epoch 1/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 216ms/step - accuracy: 0.5501 - loss: 0.6336
Epoch 2/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 197ms/step - accuracy: 0.5512 - loss: 0.6697
Epoch 3/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 171ms/step - accuracy: 0.7485 - loss: 0.5968
Epoch 4/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 175ms/step - accuracy: 0.5640 - loss: 0.5904
Epoch 5/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 223ms/step - accuracy: 0.7101 - loss: 0.6371
Epoch 6/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 170ms/step - accuracy: 0.6371 - loss: 0.6142
Epoch 7/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 167ms/step - accuracy: 0.6342 - loss: 0.6474
Epoch 8/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 160ms/step - accuracy: 0.6063 - loss: 0.6726
Epoch 9/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x7fbed8b878e0>

### Function to Predict by Sentence

In [45]:
def predict_sentence_ltsm(word2vec_model, sentence):
    """
    Funktion zur Vorhersage eines Satzes mit dem trainierten LSTM-Modell.
    """
    model = ltsm_model
    words = sentence.lower().split()  # Satz in Wörter umwandeln
    embedding_matrix = np.zeros((MAX_LENGTH, EMBEDDING_DIM))  # Leere Matrix für Embeddings
    
    for i, word in enumerate(words[:MAX_LENGTH]):  # Nur die ersten MAX_LENGTH Wörter nehmen
        word = word.lower()
        if word in word2vec_model.wv:
            embedding_matrix[i] = word2vec_model.wv[word]  # Wortvektor einfügen

    # In 3D-Form bringen: (1, MAX_LENGTH, EMBEDDING_DIM)
    input_data = np.expand_dims(embedding_matrix, axis=0)
    
    # Vorhersage mit Modell
    prediction = model.predict(input_data)[0][0]  # Output ist eine 2D-Matrix, daher [0][0]
    
    # In "Positiv" oder "Negativ" umwandeln
    sentiment = "Positiv" if prediction > 0.5 else "Negativ"
    print(f"Satz: '{sentence}' → {sentiment} (Score: {prediction:.4f})")


### Test

In [59]:
predict_sentence_ltsm(w2v_model, str(df[(df["title"] == "The Ricks Must Be Crazy")]["text"]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
Satz: '16    rick morty summer exit theater showcasing ball...
Name: text, dtype: object' → Negativ (Score: 0.2231)


In [67]:
for title in test["title"]:
    print(predict_sentence_ltsm(w2v_model, str(df[(df["title"] == title)]["text"])))
for rating in test["evaluation"]:
    print(rating)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Satz: '70      rick morty explore " carnival nightmares " t...
Name: text, dtype: object' → Negativ (Score: 0.0061)
None
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Satz: '5      morty interested jessica lacks opportunity e...
Name: text, dtype: object' → Negativ (Score: 0.3114)
None
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Satz: '48    leaving trip smiths leave rick home alone leav...
Name: text, dtype: object' → Negativ (Score: 0.2063)
None
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Satz: '52    rick morty summer trip blips chitz goes awry a...
Name: text, dtype: object' → Negativ (Score: 0.0878)
None
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Satz: '3    episode opens rick dissecting large rat garage...
Name: text, dtype: object' → Negativ (Score: 0.2400)
None
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━

## Regression with LSTM

### Create Embedding Matrix

In [None]:
VOCAB_SIZE = len(w2v_model.wv.key_to_index.keys()) + 1  # +1 für Padding-Token
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

for word in w2v_model.wv.key_to_index:
    if word in w2v_model.wv:
        embedding_matrix[w2v_model.wv.key_to_index[word]] = w2v_model.wv[word]
    

### Model Architecture

In [None]:
X_train_lstm = [sentence_to_embedding(x , w2v_model) for x in X_train ]
print(len(X_train_lstm))

embedding_vector_features=45

ltsm_model_reg=Sequential()

ltsm_model_reg.add(layers.Masking(mask_value=0.0, input_shape=(MAX_LENGTH, EMBEDDING_DIM)))

ltsm_model_reg.add(layers.LSTM(128,activation='relu',return_sequences=False))

ltsm_model_reg.add(layers.Dropout(0.4))
ltsm_model_reg.add(layers.Dense(100, activation="sigmoid"))
ltsm_model_reg.add(layers.Dense(30, activation="sigmoid"))
ltsm_model_reg.add(layers.Dense(30, activation="sigmoid"))
ltsm_model_reg.add(layers.Dense(1, activation=None))

# ltsm_model.add(layers.LSTM(128,activation='relu'))

# ltsm_model.add(layers.Dropout(0.2))

### Compile Model

In [None]:
ltsm_model_reg.compile(loss="binary_crossentropy", optimizer=keras.optimizers.Adam(learning_rate=1e-1), metrics=["mean_squared_error"])

### Fit Model 

In [None]:
ltsm_model_reg.fit(np.array(X_train_lstm), np.array(Y_train_reg).reshape(-1, 1), epochs=15, batch_size=10)

### Function to Predict Rating based on Sentence

In [None]:
def predict_sentence_ltsm_reg(word2vec_model, sentence):
    """
    Funktion zur Vorhersage eines Satzes mit dem trainierten LSTM-Modell.
    """
    model = ltsm_model_reg
    words = sentence.lower().split()  # Satz in Wörter umwandeln
    embedding_matrix = np.zeros((MAX_LENGTH, EMBEDDING_DIM))  # Leere Matrix für Embeddings
    
    for i, word in enumerate(words[:MAX_LENGTH]):  # Nur die ersten MAX_LENGTH Wörter nehmen
        word = word.lower()
        if word in word2vec_model.wv:
            embedding_matrix[i] = word2vec_model.wv[word]  # Wortvektor einfügen

    # In 3D-Form bringen: (1, MAX_LENGTH, EMBEDDING_DIM)
    input_data = np.expand_dims(embedding_matrix, axis=0)
    
    # Vorhersage mit Modell
    prediction = model.predict(input_data)[0][0]  # Output ist eine 2D-Matrix, daher [0][0]
    
    # In "Positiv" oder "Negativ" umwandeln
    print(f"{prediction}")


In [None]:
predict_sentence_ltsm_reg(w2v_model, str(df[(df["title"] == "JuRicksic Mort")]["text"]))

In [None]:
predict_sentence_ltsm_reg(w2v_model, "rick and morty are on an adveture")

In [None]:
ltsm_model_reg.evaluate(np.array(X_test_lstm) , np.array(Y_test_reg))