In [59]:
import nltk
import pickle
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [60]:
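# Optional: fetch the dataset with kagglehub instead of using a local CSV copy.
# Uncomment the lines below to run the download.
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows")

# print("Path to dataset files:", path)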

In [61]:
# Read the raw IMDB top-1000 table from the CSV stored next to the notebook.
df = pd.read_csv("./imdb_top_1000.csv")
# Preview the first five rows (the default for head()).
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [62]:
# Summarize column dtypes and non-null counts to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


In [63]:
# Count the missing values in each column.
df.isna().sum()

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

In [64]:
# Drop every row that has a missing value in ANY column, then re-check the counts.
# NOTE(review): the null counts shown earlier are non-zero only for Certificate,
# Meta_score and Gross, and all three columns are discarded in a later cell, so
# this removes rows whose retained columns are complete. Consider
# dropna(subset=[...retained columns...]) instead; if the row set changes, any
# previously cached model.pickle must be deleted and rebuilt.
df = df.dropna()
df.isnull().sum()

Poster_Link      0
Series_Title     0
Released_Year    0
Certificate      0
Runtime          0
Genre            0
IMDB_Rating      0
Overview         0
Meta_score       0
Director         0
Star1            0
Star2            0
Star3            0
Star4            0
No_of_Votes      0
Gross            0
dtype: int64

In [65]:
# List the available columns before deciding which ones to keep.
print(df.columns)

Index(['Poster_Link', 'Series_Title', 'Released_Year', 'Certificate',
       'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director',
       'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')


In [66]:
# Discard the columns the recommender does not use; title, genre, rating and
# overview are what remain.
df = df.drop(
    [
        'Poster_Link', 'Released_Year', 'Certificate', 'Runtime',
        'Meta_score', 'Director', 'Star1', 'Star2', 'Star3', 'Star4',
        'No_of_Votes', 'Gross',
    ],
    axis="columns",
)

In [67]:
# Inspect the reduced DataFrame (pandas truncates the middle rows in the display).
df

Unnamed: 0,Series_Title,Genre,IMDB_Rating,Overview
0,The Shawshank Redemption,Drama,9.3,Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...
2,The Dark Knight,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...
3,The Godfather: Part II,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...
4,12 Angry Men,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...
...,...,...,...,...
990,Giù la testa,"Drama, War, Western",7.6,A low-life bandit and an I.R.A. explosives exp...
991,Kelly's Heroes,"Adventure, Comedy, War",7.6,A group of U.S. soldiers sneaks across enemy l...
992,The Jungle Book,"Animation, Adventure, Family",7.6,Bagheera the Panther and Baloo the Bear have a...
994,A Hard Day's Night,"Comedy, Music, Musical",7.6,"Over two ""typical"" days in the life of The Bea..."


In [74]:
# Persist the cleaned table; index=False leaves the row labels out of the CSV.
df.to_csv("Cleaned_imdb_dataset.csv", index=False)

In [68]:
# Single Porter stemmer instance shared by every preprocessing call.
stemmer = PorterStemmer()
# English stop words kept in a set so membership tests are constant-time.
stop_words = set(stopwords.words('english'))

def preprocess_desc(overview):
    """Normalize free text into a space-joined string of stemmed tokens.

    The text is tokenized, reduced to purely alphabetic tokens, lowercased,
    stripped of English stop words and finally Porter-stemmed.

    Args:
        overview: Text to clean. Any non-``str`` value (for example NaN from a
            missing cell) yields an empty string.

    Returns:
        The surviving stems joined by single spaces; ``""`` if none survive.
    """
    # Guard against non-string input.
    if not isinstance(overview, str):
        return ""

    # Collapse runs of whitespace before tokenizing.
    overview = ' '.join(overview.split())

    stems = []
    for token in word_tokenize(overview):
        # Skip punctuation, numbers and mixed tokens.
        if not token.isalpha():
            continue
        # Lowercase BEFORE the stop-word test: the stop-word list is lowercase,
        # so testing the original casing let words such as "The" or "When"
        # slip through the filter.
        token = token.lower()
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))

    return ' '.join(stems)

In [69]:
def load_train_model(df, model_file='model.pickle'):
    """Load the cached recommender artifacts, or fit them and write the cache.

    Args:
        df: DataFrame with an ``Overview`` text column and a ``Genre`` column
            holding comma-separated genre names. It is not modified.
        model_file: Path of the pickle cache to read from and write to.

    Returns:
        Tuple ``(tfidf, mlb, combined_features)``: the fitted TfidfVectorizer,
        the fitted MultiLabelBinarizer, and the horizontally stacked feature
        matrix (overview TF-IDF scaled by 0.7, genre indicators scaled by 0.3)
        with one row per row of ``df``.
    """
    if os.path.exists(model_file):
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # model_file at caches written by this notebook.
        with open(model_file, 'rb') as f:
            tfidf, mlb, combined_features = pickle.load(f)

        # A cache fitted on a different set of rows would pair similarity
        # scores with the wrong movies downstream, so only reuse it when the
        # row counts agree. (Changes to the text preprocessing are NOT
        # detected; delete the cache file manually in that case.)
        if combined_features.shape[0] == len(df):
            print(f"Model loaded from {model_file}")
            return tfidf, mlb, combined_features
        print(f"Cached model in {model_file} does not match the data; retraining.")

    # Clean the overview text with the same routine used for queries.
    overview = df['Overview'].apply(preprocess_desc)

    # Split the genre string on commas and trim spaces. Kept in a local Series
    # so the caller's DataFrame does not gain a helper column as a side effect.
    genre_lists = df["Genre"].apply(lambda x: [g.strip() for g in x.split(",")])

    mlb = MultiLabelBinarizer()
    genre_matrix = mlb.fit_transform(genre_lists)

    tfidf = TfidfVectorizer(stop_words='english')
    overview_matrix = tfidf.fit_transform(overview)

    # Overview text dominates the combined representation; genres add a
    # smaller contribution.
    combined_features = hstack([overview_matrix * 0.7, genre_matrix * 0.3])

    # Save model
    with open(model_file, 'wb') as f:
        pickle.dump((tfidf, mlb, combined_features), f)
        print("Model created and saved successfully!")

    return tfidf, mlb, combined_features

In [70]:
def movie_recommendation(tfidf, mlb, combined_features, df, top_k=5, query=None):
    """Rank the movies in ``df`` against a free-text query.

    Args:
        tfidf: Fitted vectorizer; its ``transform`` embeds the cleaned query.
        mlb: Fitted genre binarizer. Only ``len(mlb.classes_)`` is used, to pad
            the query with an all-zero genre segment.
        combined_features: Feature matrix with one row per row of ``df``.
        df: DataFrame supplying ``Series_Title``, ``IMDB_Rating``, ``Genre``
            and ``Overview`` for the result table.
        top_k: Maximum number of recommendations to return.
        query: Search text. When ``None`` (the default) it is read
            interactively with ``input()``.

    Returns:
        DataFrame with columns ``Series_Title``, ``Genre``, ``IMDB_Rating``,
        ``Overview`` and ``Similarity_Score``, best match first. Movies whose
        similarity is zero are omitted, so fewer than ``top_k`` rows (possibly
        none) may be returned.

    Raises:
        ValueError: If ``combined_features`` and ``df`` have different numbers
            of rows.
    """
    # Rows are matched by position, so a mismatch (e.g. a stale cached model)
    # would attach scores to the wrong movies.
    if combined_features.shape[0] != len(df):
        raise ValueError(
            f"combined_features has {combined_features.shape[0]} rows but df has "
            f"{len(df)}; rebuild the model for this DataFrame."
        )

    movie_titles = df['Series_Title'].values
    imdb_ratings = df['IMDB_Rating'].values
    genres = df['Genre'].values
    overview = df['Overview'].values

    # Fall back to an interactive prompt only when no query was supplied.
    if query is None:
        query = input("Enter words to get movie recommendations: ")

    # Clean the query the same way the overviews were cleaned.
    query_preprocess = preprocess_desc(query)

    # Embed the query in the overview (TF-IDF) feature space.
    query_overview_vector = tfidf.transform([query_preprocess])

    # The genre segment of the query is all zeros: it only pads the vector to
    # the width of combined_features.
    num_genre_features = len(mlb.classes_)
    query_genre_vector = np.zeros((1, num_genre_features))

    query_vector = hstack([query_overview_vector, query_genre_vector])

    # Cosine similarity between the query and every movie.
    cosine_sim = cosine_similarity(query_vector, combined_features)[0]

    # Indices ordered by similarity, highest first.
    sorted_indices = np.argsort(cosine_sim)[::-1]

    # Take the top K, then discard movies that share nothing with the query;
    # otherwise an empty or out-of-vocabulary query returns arbitrary titles
    # with a score of 0.
    top_indices = sorted_indices[:top_k]
    top_indices = top_indices[cosine_sim[top_indices] > 0]

    # Assemble the output table.
    recommendation_df = pd.DataFrame({
        'Series_Title': movie_titles[top_indices],
        'Genre': genres[top_indices],
        'IMDB_Rating': imdb_ratings[top_indices],
        'Overview' : overview[top_indices],
        'Similarity_Score': cosine_sim[top_indices]
    })

    print("\nYour query:", query)
    print("\nTop movie recommendations:\n")
    if recommendation_df.empty:
        print("No movies matched the query.")

    return recommendation_df

In [71]:
# Fit the vectorizers and feature matrix, or reuse the pickle cache if it exists.
# NOTE(review): if the data or the preprocessing changed since model.pickle was
# written, delete the file so the model is refit.
tfidf, mlb, combined_features = load_train_model(df)

Model loaded from model.pickle


In [72]:
# Show the labels stored on the fitted binarizer (the distinct genre names).
mlb.classes_

array(['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
       'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror',
       'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport',
       'Thriller', 'War', 'Western'], dtype=object)

In [73]:
# Prompts for a query on stdin, then ranks the movies against it.
recommendations = movie_recommendation(tfidf, mlb, combined_features, df)

# Display the resulting DataFrame
recommendations.head()


Your query: space battle mission

Top movie recommendations:



Unnamed: 0,Series_Title,Genre,IMDB_Rating,Overview,Similarity_Score
0,Taegukgi hwinalrimyeo,"Action, Drama, War",8.1,When two brothers are forced to fight in the K...,0.234224
1,Aliens,"Action, Adventure, Sci-Fi",8.3,Fifty-seven years after surviving an apocalypt...,0.180301
2,Gravity,"Drama, Sci-Fi, Thriller",7.7,Two astronauts work together to survive after ...,0.159627
3,Predator,"Action, Adventure, Sci-Fi",7.8,A team of commandos on a mission in a Central ...,0.155517
4,Interstellar,"Adventure, Drama, Sci-Fi",8.6,A team of explorers travel through a wormhole ...,0.147052
