In [1]:
# 1. Imports & Setup

import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Klassisches Modell
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Deep Learning
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Reproducibility: one call seeds Python's `random`, NumPy and TensorFlow,
# so no source of randomness used by Keras is left unseeded.
SEED = 42
tf.keras.utils.set_random_seed(SEED)



In [4]:
# 2. Load data & overview

# Adjust the path if necessary
df = pd.read_csv("imdb_reviews.csv")

print(df.head())
# Check which columns are available
print(df.columns)

# sentiment -> label (0 = negative, 1 = positive)
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Series.map() silently produces NaN for any value missing from the mapping
# (typos, different casing, empty cells). Fail loudly here instead of letting
# NaN labels reach the stratified split or the models.
if df['label'].isna().any():
    raise ValueError(
        "Unexpected sentiment values: "
        f"{df.loc[df['label'].isna(), 'sentiment'].unique().tolist()}"
    )

# Class distribution
print(df['label'].value_counts())

# Check whether the dataset is balanced:
pos = (df['label'] == 1).sum()
neg = (df['label'] == 0).sum()
print(f"Positive: {pos}, Negative: {neg}")


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Index(['review', 'sentiment'], dtype='object')
label
1    25000
0    25000
Name: count, dtype: int64
label
1    25000
0    25000
Name: count, dtype: int64
Positive: 25000, Negative: 25000


In [6]:
# 3. Preprocessing
# Rename the raw 'review' column to 'text'; the cleaning step further down reads df['text'].
df = df.rename(columns={'review': 'text'})
# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every clean_text() call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize a raw review string for vectorization / tokenization.

    Steps, in order: lowercase, drop URLs, drop HTML tags, drop digits,
    drop ASCII punctuation, collapse whitespace.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The cleaned text: lowercase, words separated by single spaces,
        no leading or trailing whitespace.
    """
    # Lowercase
    text = text.lower()
    # Remove URLs. The pattern is anchored at a word boundary and requires
    # "://" or "www." so that ordinary words such as "awww" are left alone.
    text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', ' ', text)
    # Remove HTML tags. Replace with a space (not ''), otherwise
    # "end.<br /><br />start" would be fused into the single token "endstart".
    text = re.sub(r'<.*?>', ' ', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    text = text.translate(_PUNCT_TABLE)
    # Collapse runs of whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run the cleaner over every review (cast to str first) and store the result
# next to the raw text so both can be compared.
df['text_clean'] = df['text'].astype(str).map(clean_text)

print(df[['text', 'text_clean']].head())


                                                text  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                          text_clean  
0  one of the other reviewers has mentioned that ...  
1  a wonderful little production the filming tech...  
2  i thought this was a wonderful way to spend ti...  
3  basically theres a family where a little boy j...  
4  petter matteis love in the time of money is a ...  


In [7]:
# Features are the cleaned texts, targets are the binary labels.
X, y = df['text_clean'].values, df['label'].values

# 80/20 split, stratified on the label so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

len(X_train), len(X_test)


(40000, 10000)

In [8]:
# 4. Classical model: Naive Bayes

max_features = 20000

tfidf = TfidfVectorizer(
    # NOTE: 'english' is the only built-in stop-word list in scikit-learn;
    # for a dataset in another language pass an explicit list of stop words.
    stop_words='english',
    ngram_range=(1, 2),  # unigrams + bigrams
    max_features=max_features,
)

# Fit the vocabulary / IDF weights on the training split only,
# then reuse them unchanged for the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [9]:
# Fit multinomial Naive Bayes on the TF-IDF features and predict the test split.
nb_clf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb_clf.predict(X_test_tfidf)

# Report accuracy followed by the per-class metrics.
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(
    "\nClassification Report (Naive Bayes):\n",
    classification_report(y_test, y_pred_nb),
    sep="\n",
)


Naive Bayes Accuracy: 0.8683

Classification Report (Naive Bayes):

              precision    recall  f1-score   support

           0       0.88      0.85      0.87      5000
           1       0.86      0.88      0.87      5000

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [13]:
# Tokenizer setup: vocabulary cap and fixed sequence length
vocab_size = 20000
max_len = 200

# Build the word index from the training texts only.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to max_len ids, padding at the end.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (X_train_seq, X_test_seq)
)

X_train_pad.shape, X_test_pad.shape


((40000, 200), (10000, 200))

In [None]:
embedding_dim = 64

# Embedding -> LSTM -> small dense head with a sigmoid output for binary sentiment.
model = Sequential([
    # mask_zero=True: the input sequences are padded at the END with zeros
    # (pad_sequences(..., padding='post')). Without a mask the LSTM keeps
    # updating its state on those padding steps, so its final state is taken
    # after a run of zeros instead of after the last real token.
    # `input_length` is omitted: it is deprecated in current Keras, and the
    # input shape is fixed by model.build() below.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)
# Build explicitly so that summary() can report output shapes and parameter counts.
model.build(input_shape=(None, max_len))
model.summary()


In [None]:
# Train the network; 20% of the training arrays are held out for validation.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=128,
    epochs=5,  # increase if needed
    validation_split=0.2,
)


In [None]:
# Predicted probabilities -> hard 0/1 labels (threshold 0.5), flattened to 1-D.
y_pred_dl_prob = model.predict(X_test_pad)
y_pred_dl = (y_pred_dl_prob.reshape(-1) > 0.5).astype(int)

# Report accuracy followed by the per-class metrics.
print("LSTM Accuracy:", accuracy_score(y_test, y_pred_dl))
print(
    "\nClassification Report (LSTM):\n",
    classification_report(y_test, y_pred_dl),
    sep="\n",
)
