In [51]:
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline

# Download the NLTK resources.
# NOTE(review): of these four packages, only 'stopwords' is read by the code
# visible in this notebook (via stopwords.words('english')) — confirm whether
# 'punkt', 'wordnet' and 'omw-1.4' are still needed elsewhere.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\robin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\robin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\robin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\robin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# Load the movies from CSV (the path is relative to the current working directory).
df_movies = pd.read_csv("Movie_PLots.csv")

In [29]:
# Preview the first rows of the loaded movie table.
df_movies.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


Install the spaCy English model once from a terminal before running the next cell: `python -m spacy download en_core_web_sm`

In [37]:
# Load the spaCy model.
# NOTE(review): the code visible in this notebook only reads token.lemma_ and
# token.is_alpha; if nothing else needs the parser or NER, disabling those
# components at load time would speed up preprocessing — confirm before changing.
nlp = spacy.load('en_core_web_sm') 

# Preprocessing helpers
@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, built once and then cached."""
    return frozenset(stopwords.words('english'))


def preprocess_text_english(text):
    """Lemmatize English text and drop punctuation, numbers and stop words.

    The text is run through the spaCy pipeline ``nlp``. Only purely alphabetic
    tokens are kept; each is replaced by its lowercased lemma, and lemmas that
    appear in NLTK's English stop-word list are removed.

    Args:
        text: Raw text to preprocess. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as empty text.

    Returns:
        The remaining lemmas joined by single spaces, or ``''`` when the input
        is missing or nothing survives the filtering.
    """
    # A missing plot would otherwise crash inside nlp() and abort the whole
    # DataFrame.apply; NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Cached: rebuilding the stop-word set for every row is wasted work.
    stops = _english_stopwords()

    # Tokenize and lemmatize with spaCy; is_alpha filters punctuation and digits.
    doc = nlp(text)
    lemmas = (token.lemma_.lower() for token in doc if token.is_alpha)

    return ' '.join(lemma for lemma in lemmas if lemma not in stops)

# Apply the preprocessing function to the 'Plot' column and store the result
# in a new 'preprocessed_text' column.
df_movies['preprocessed_text'] = df_movies['Plot'].apply(preprocess_text_english)

In [38]:
# Show the first rows of the DataFrame as a sanity check.
print(df_movies.head())

   Release Year                             Title Origin/Ethnicity  \
0          1901            Kansas Saloon Smashers         American   
1          1901     Love by the Light of the Moon         American   
2          1901           The Martyred Presidents         American   
3          1901  Terrible Teddy, the Grizzly King         American   
4          1902            Jack and the Beanstalk         American   

                             Director Cast    Genre  \
0                             Unknown  NaN  unknown   
1                             Unknown  NaN  unknown   
2                             Unknown  NaN  unknown   
3                             Unknown  NaN  unknown   
4  George S. Fleming, Edwin S. Porter  NaN  unknown   

                                           Wiki Page  \
0  https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...   
1  https://en.wikipedia.org/wiki/Love_by_the_Ligh...   
2  https://en.wikipedia.org/wiki/The_Martyred_Pre...   
3  https://en.wikipedia.

In [39]:
# Not used!
# Vectorization with bag of words (raw term counts).
count_vectorizer = CountVectorizer()
df_bow = count_vectorizer.fit_transform(df_movies['preprocessed_text'])

In [39]:
# Vectorization with TF-IDF; the fitted vectorizer and the resulting matrix
# are both passed to search_plots below.
tfidf_vectorizer = TfidfVectorizer()
df_tfidf = tfidf_vectorizer.fit_transform(df_movies['preprocessed_text'])

In [46]:
# Search function
def search_plots(query, tfidf_vectorizer, tfidf_matrix, df, top_n=None):
    """Rank the rows of ``df`` by TF-IDF cosine similarity to ``query``.

    The query is preprocessed with ``preprocess_text_english``, vectorized
    with the already fitted ``tfidf_vectorizer`` and compared against every
    row of ``tfidf_matrix``.

    Args:
        query: Free-text search query.
        tfidf_vectorizer: Fitted vectorizer; its ``transform`` is applied to
            the preprocessed query.
        tfidf_matrix: Document matrix to compare against. Its rows must be in
            the same positional order as the rows of ``df``, because results
            are looked up with ``df.iloc``.
        df: DataFrame providing the 'Title' and 'Plot' columns.
        top_n: Optional number of best matches to return. ``None`` (the
            default) returns the full ranking, as before.

    Returns:
        A tuple ``(titles, plots, similarities)``, all ordered from most to
        least similar: the 'Title' values, the 'Plot' values and the matching
        cosine similarities.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    # Validate first: a negative slice bound would silently drop results
    # from the end instead of limiting them.
    if top_n is not None and top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    query_processed = preprocess_text_english(query)
    query_vector = tfidf_vectorizer.transform([query_processed])

    # One similarity per document row.
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Descending order; slicing with None keeps the complete ranking.
    sorted_indices = np.argsort(similarities)[::-1][:top_n]

    # Index the frame once and take both columns from the same selection.
    ranked = df.iloc[sorted_indices]

    return ranked['Title'], ranked['Plot'], similarities[sorted_indices]

In [52]:
# Run the search
query = "Hero fights evil Robot"
sorted_titles, sorted_plots, sorted_similarities = search_plots(
    query,
    tfidf_vectorizer,
    df_tfidf,
    df_movies,
)

# Show the ten best matches
print("Suchergebnisse für den Query:", query)
best_hits = zip(
    sorted_titles.head(10),
    sorted_plots.head(10),
    sorted_similarities[:10],
)
for title, plot, similarity in best_hits:
    print(f"Title: {title}, Similarity: {similarity:.4f}")
    print(f"Plot: {plot}")
    print("------")

Suchergebnisse für den Query: Hero fights evil Robot
Title: Daft Punk's Electroma, Similarity: 0.5277
Plot: The two lead characters appear as the robotic forms of Daft Punk and are credited as "Hero Robot No. 1" and "Hero Robot No. 2". One wears a silver helmet and the other wears a golden one. An opening scene shows the duo driving in a 1987 Ferrari 412 with its license plate displaying "HUMAN". After passing through a Southwestern United States landscape, the duo arrives by car at a town in Inyo County, California.[1] The town's resident are also shown to be robots physically identical to the two main characters, but at different ages, with different clothing and alternating gender.
The pair drive to a high-tech facility where liquid latex is poured over their heads. The latex is shaped into human-like faces with the aid of prosthetic appliances and wigs. The resulting look caricaturizes the members of Daft Punk, Thomas Bangalter and Guy-Manuel de Homem-Christo. When the two leave th