In [None]:

import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: tokenizer models for word_tokenize,
# the English stop-word list, and WordNet for the lemmatizer.
# Recent NLTK releases load 'punkt_tab' (not 'punkt') inside word_tokenize, so
# both ids are requested. Asking for an id the installed NLTK does not know
# only logs an error and returns False, so this stays safe on older versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Relative path of the preprocessed dataset read in the next cell.
DATA_PATH = "../../data/preprocessed_data.csv"

In [None]:
# Read the preprocessed dataset from DATA_PATH into a DataFrame.
df = pd.read_csv(DATA_PATH)

# Preview the first rows.
df.head()

# Preprocessing

In [None]:
def remove_stopwords(sentence: str) -> str:
    """Return `sentence` with English stop words removed.

    The input is tokenized with NLTK's ``word_tokenize``; tokens found in
    NLTK's English stop-word list are dropped and the remaining tokens are
    joined back together. Matching is case-sensitive (tokens are not
    lower-cased before the lookup).

    Args:
        sentence: Text to filter. Non-string values are converted with
            ``str()`` first.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # `sentence is not str` compared the value with the type object itself and
    # was therefore always true; isinstance is the intended check.
    if not isinstance(sentence, str):
        sentence = str(sentence)
    # Build the lookup set once per call rather than once per token.
    english_stopwords = set(stopwords.words('english'))
    tokens = [t for t in word_tokenize(sentence) if t not in english_stopwords]
    return " ".join(tokens)


In [None]:
# Convert every cell to its string form so the tokenizer always receives text
# (a float NaN becomes the literal text 'nan').
df = df.map(str)

In [None]:
# Preview the frame after the string conversion.
df.head()

In [None]:
# Single shared lemmatizer; `lematize` reuses it instead of constructing a new
# WordNetLemmatizer on every call.
lematizer = WordNetLemmatizer()

def lematize(sentence: str) -> str:
    """Lemmatize every token of `sentence`.

    The text is tokenized with NLTK's ``word_tokenize`` and each token is
    passed to ``WordNetLemmatizer.lemmatize`` with its default arguments.

    Args:
        sentence: Text to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    tokens = word_tokenize(sentence)
    lematized_tokens = [lematizer.lemmatize(t) for t in tokens]
    return " ".join(lematized_tokens)

# Lemmatize every cell of the frame (DataFrame.map works element-wise).
df = df.map(lematize)

In [None]:
# Preview the frame after lemmatization.
df.head()

# Embeddings

## Word2Vec

In [None]:
import gensim.downloader as api
# Identifier of the pretrained embedding set handed to gensim's downloader API.
model_name = "word2vec-google-news-300"
# Left disabled: explicit call to a private gensim.downloader helper
# (presumably pre-fetches the model files — confirm before re-enabling).
# api._download(model_name)

In [None]:
import numpy as np
# Load the pretrained vectors named by `model_name` via gensim's downloader API.
# NOTE(review): presumably downloads the model on first use — confirm.
# The name `model` is rebound to a classifier further down the notebook, so
# this line must be re-run before re-running the embedding code below.
model = api.load(model_name)

def get_avg_embedding(model, sentence):
    """Return the mean word vector of the tokens in `sentence`.

    Args:
        model: Word-vector lookup exposing ``key_to_index`` and
            ``model[token]`` (e.g. gensim ``KeyedVectors``). Its
            ``vector_size`` attribute, when present, sets the length of the
            fallback vector; otherwise 300 is assumed.
        sentence: Text to embed; tokenized with NLTK's ``word_tokenize``.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens.
        When no token is in the vocabulary, a notice is printed and a random
        vector drawn uniformly from [0, 1) is returned instead.
    """
    vocabulary = model.key_to_index
    word_vectors = [
        model[token] for token in word_tokenize(sentence) if token in vocabulary
    ]

    if word_vectors:
        return np.mean(word_vectors, axis=0)

    # Follow the model's dimensionality instead of hard-coding 300, so the
    # fallback always has the same length as a real embedding.
    fallback_size = getattr(model, "vector_size", 300)
    print("Empty embedding. Generating random one.")
    return np.random.rand(fallback_size)

# Embed each text column: one averaged Word2Vec vector per row.
title_word2vec, plot_word2vec, director_word2vec = (
    df[column].apply(lambda text: get_avg_embedding(model, text))
    for column in ('title', 'plot', 'director')
)


## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vocabulary per text column. The same vectorizer object is refit
# each time, so after this cell it only retains the vocabulary of `director`.
vectorizer = TfidfVectorizer()
title_TfIdf = vectorizer.fit_transform(df['title'])
plot_TfIdf = vectorizer.fit_transform(df['plot'])
director_TfIdf = vectorizer.fit_transform(df['director'])

# Concatenate the three blocks column-wise into one dense feature matrix.
# `.toarray()` already returns a 2-D ndarray, so no np.stack wrapper is needed
# (it would only create an additional full copy of every matrix).
# NOTE(review): densifying can be memory-hungry for large vocabularies.
X_tfIdf = np.concatenate(
    [
        title_TfIdf.toarray(),
        plot_TfIdf.toarray(),
        director_TfIdf.toarray(),
    ],
    axis=1,
)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Encode the genre column as integer class ids: one-hot encode it, then take
# the column index of the single 1 in every row.
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded_y = encoder.fit_transform(df['genre'].to_numpy().reshape((-1, 1)))
# Vectorized argmax over the rows instead of a Python-level loop.
y = np.argmax(one_hot_encoded_y, axis=1)

# np.concatenate is used instead of np.concat (the latter only exists in NumPy >= 2.0).
# Each Series holds one 1-D embedding per row; np.stack turns it into an
# (n_rows, dim) matrix, so no reshape to a hard-coded width is required.
# The three matrices are then joined column-wise.
X_word2vec = np.concatenate(
    [
        np.stack(title_word2vec.values),
        np.stack(plot_word2vec.values),
        np.stack(director_word2vec.values),
    ],
    axis=1,
)


In [None]:
# Sanity check: shapes of the feature matrix and the label vector, one per line.
print(X_word2vec.shape, y.shape, sep="\n")

# Logistic Regression

## Word2Vec

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the reported accuracy, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_word2vec, y, test_size=0.2, random_state=42
)

# NOTE: `model` referred to the Word2Vec vectors until here; it is rebound to
# the classifier from this point on.
model = LogisticRegression(max_iter = 10_000)
model.fit(X_train, y_train)



In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Score the classifier on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("accuracy:", accuracy)

# Confusion matrix of true vs. predicted class ids, rendered as a plot.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
confusion_matrix_display = ConfusionMatrixDisplay(conf_matrix)
confusion_matrix_display.plot()


## TF-IDF

In [None]:
# Hold out 20% of the TF-IDF rows for evaluation. A fixed random_state makes
# the split, and therefore the reported accuracy, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfIdf, y, test_size=0.2, random_state=42
)

# Rebinds `model` to a fresh classifier trained on the TF-IDF features.
model = LogisticRegression(max_iter = 10_000)
model.fit(X_train, y_train)

In [None]:
# Score the TF-IDF classifier on its held-out split.
y_pred_tfIdf = model.predict(X_test)
accuracy_tfIdf = accuracy_score(y_test, y_pred_tfIdf)
print("accuracy:", accuracy_tfIdf)

# Confusion matrix of true vs. predicted class ids, rendered as a plot.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_tfIdf)
confusion_matrix_display = ConfusionMatrixDisplay(conf_matrix)
confusion_matrix_display.plot()