In [69]:
import nltk
import pickle
import pandas as pd
import os
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [70]:
# One shared Porter stemmer instance, reused for every token that is stemmed.
stemmer = PorterStemmer()
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = {*stopwords.words('english')}

def preprocess_review(review):
    """Normalize raw review text into a space-joined string of stemmed tokens.

    Pipeline: collapse whitespace, tokenize with ``word_tokenize``, keep only
    purely alphabetic tokens, lowercase them, drop stop words, then stem what
    is left with ``stemmer``.

    Args:
        review: Raw review text. Any non-string value (for example the NaN
            pandas uses for a missing cell) is treated as an empty review.

    Returns:
        str: The surviving stemmed tokens joined by single spaces, or ``""``
        when the input is not a string or no token survives filtering.

    Note:
        Stop words are now matched case-insensitively. A model cached before
        this change was trained on the old token stream (capitalised stop
        words kept); delete the cache file to retrain on the corrected one.
    """
    # Non-string input has no text to process.
    if not isinstance(review, str):
        return ""

    # Collapse every run of whitespace into a single space.
    review = ' '.join(review.split())

    # Split the text into word/punctuation tokens.
    tokens = word_tokenize(review)

    # Keep alphabetic tokens only and lowercase them. The stop-word test must
    # run on the LOWERCASED token: NLTK's English list is all lowercase, so
    # testing the original casing let "The", "And", "This", ... slip through.
    words = [
        lowered
        for lowered in (w.lower() for w in tokens if w.isalpha())
        if lowered not in stop_words
    ]

    # Reduce each remaining word to its stem.
    words = [stemmer.stem(w) for w in words]

    return ' '.join(words)

In [71]:
def load_train_model(model_file='model.pickle'):
    """Load the cached sentiment model, or train it and write the cache.

    If ``model_file`` exists it is unpickled and returned as-is. Otherwise the
    function reads ``imdb-movies-dataset.csv``, preprocesses the ``Review``
    column, builds TF-IDF features (at most 5000 terms), labels each rated row
    ``'positive'`` when ``Rating > 6`` and ``'negative'`` otherwise, fits a
    ``MultinomialNB`` classifier, prints its accuracy and pickles the result
    to ``model_file``.

    Args:
        model_file: Path of the pickle cache to read from / write to.

    Returns:
        tuple: ``(modelNB, tfidf, movie_review_matrix)`` - the fitted
        classifier, the fitted vectorizer, and the TF-IDF matrix of every
        review in the CSV (one row per CSV row, in file order).
    """
    # Cache hit: reuse the previously trained artefacts.
    if os.path.exists(model_file):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only point model_file at a cache this notebook wrote itself.
        with open(model_file, 'rb') as f:
            modelNB, tfidf, movie_review_matrix = pickle.load(f)
        # Report the path actually used instead of a hard-coded file name.
        print(f"Model loaded from {model_file}")
        return modelNB, tfidf, movie_review_matrix

    # Cache miss: read the dataset and train from scratch.
    df = pd.read_csv("imdb-movies-dataset.csv")
    rating = df['Rating']
    review = df['Review'].apply(preprocess_review)

    # fit_transform already returns the TF-IDF matrix of all reviews, so it is
    # kept as the recommendation matrix rather than calling transform() again
    # on the same documents.
    tfidf = TfidfVectorizer(max_features=5000)
    movie_review_matrix = tfidf.fit_transform(review)

    # Train only on rows that actually have a rating. `NaN > 6` is False, so
    # unrated rows used to be silently labelled 'negative'.
    rated = rating.notna().to_numpy()
    x = movie_review_matrix[rated]
    y = rating[rated].apply(lambda r: 'positive' if r > 6 else 'negative')

    # Train the Naive Bayes sentiment classifier.
    modelNB = MultinomialNB()
    modelNB.fit(x, y)

    # NOTE: this is accuracy on the training rows themselves (there is no
    # held-out split), so it overstates real-world performance.
    y_pred = modelNB.predict(x)
    acc = accuracy_score(y, y_pred)
    print(f"Model accuracy: {acc:.2f}")

    # Persist the artefacts so later calls take the fast path above.
    with open(model_file, 'wb') as f:
        pickle.dump((modelNB, tfidf, movie_review_matrix), f)
    print("Model created successfully!")
    return modelNB, tfidf, movie_review_matrix

In [72]:
def write_review(tfidf, model=None):
    """Prompt for a review and print its predicted sentiment.

    The user is re-prompted until the review has at least 20
    whitespace-separated words. The text is then preprocessed, vectorized
    with ``tfidf`` and classified.

    Args:
        tfidf: Fitted vectorizer exposing ``transform``; it should be the one
            the classifier was trained with.
        model: Optional fitted classifier exposing ``predict``. When omitted
            the classifier is loaded through ``load_train_model()`` as
            before; passing the one already in memory avoids re-reading the
            pickle on every review.

    Returns:
        None. The original review and the predicted label are printed.
    """
    review = input("Enter review minimum is 20 words: ")
    # Keep asking until the review is long enough.
    while len(review.split()) < 20:
        review = input("Enter review minimum is 20: ")

    # Preprocess the sentence the same way the training data was.
    review_preprocess = preprocess_review(review)

    # Vectorize using tfidf (a one-document batch).
    review_vectorize = tfidf.transform([review_preprocess])

    # Only touch the disk when the caller did not supply a classifier.
    if model is None:
        model, _, _ = load_train_model()

    # Predict the review (positive or negative).
    prediction = model.predict(review_vectorize)
    print(f"Your review: {review}")
    print(f"The result of the review is: {prediction[0]}")

In [73]:
def movie_recommendation(model, tfidf, movie_review_matrix, top_n=2):
    """Prompt for a query and print the movies whose reviews match it best.

    The query is preprocessed, vectorized with ``tfidf`` and compared by
    cosine similarity against every row of ``movie_review_matrix``. Titles
    are read from the ``Title`` column of ``imdb-movies-dataset.csv``.

    Args:
        model: Unused; kept so existing callers keep working.
        tfidf: Fitted vectorizer exposing ``transform``.
        movie_review_matrix: TF-IDF matrix of the reviews. Assumes row ``i``
            corresponds to row ``i`` of the CSV - confirm if the matrix comes
            from a cache built on a different file.
        top_n: Maximum number of titles to recommend (default 2).

    Returns:
        None. The recommendations (best match first) are printed, or a notice
        when no review shares any term with the query.
    """
    # Get movie titles.
    df = pd.read_csv("imdb-movies-dataset.csv")
    movie_title = df['Title']

    # Read, preprocess and vectorize the query.
    query = input("Enter a word to get movie recommendations: ")
    query_preprocess = preprocess_review(query)
    query_vectorize = tfidf.transform([query_preprocess])

    # One similarity score per movie (the query is a single-row batch).
    scores = cosine_similarity(query_vectorize, movie_review_matrix)[0]

    # Rank best-first; the previous ascending slice listed the runner-up
    # before the best match.
    ranked = scores.argsort()[::-1][:top_n]

    # A score of 0 means no shared vocabulary with the query. Such rows are
    # not recommendations, just whatever happened to sort last.
    top_indices = [i for i in ranked if scores[i] > 0]
    if not top_indices:
        print("No matching movies found for that query.")
        return

    # argsort yields positions, so index positionally rather than by label.
    top_movies = [movie_title.iloc[i] for i in top_indices]

    print(f"top {top_n} movie recommendation: {top_movies}")

In [74]:
def NER():
    """Prompt for a review and print every named entity found in it.

    Loads the ``en_core_web_sm`` spaCy pipeline, reads one line of text from
    the user, runs the pipeline on it and prints each entity as
    ``<text> (<label>)`` on its own line. Prints nothing when the pipeline
    reports no entities.
    """
    # The pipeline is loaded before prompting, on every call.
    pipeline = spacy.load('en_core_web_sm')

    # Text whose entities should be listed.
    text = input("please input a review to get NER: ")
    entities = pipeline(text).ents

    # Format every entity together with its entity-type label.
    labelled = [f"{entity.text} ({entity.label_})" for entity in entities]
    for line in labelled:
        print(line)

In [75]:
def menu():
    """Run the interactive text menu until the user chooses Exit.

    The model, vectorizer and review matrix are obtained once from
    ``load_train_model()``. Each iteration prints the menu, reads an option
    and dispatches: "1" -> ``write_review``, "2" -> ``movie_recommendation``,
    "3" -> ``NER``, "4" -> say goodbye and stop. Any other input prints a
    hint and shows the menu again.
    """
    model, tfidf, movie_review_matrix = load_train_model()

    menu_lines = (
        "\nMenu",
        "1. Write Review",
        "2. Get Movie Recommendation",
        "3. Get NER",
        "4. Exit",
    )

    # Option -> handler. Lambdas defer the call until the option is picked.
    handlers = {
        "1": lambda: write_review(tfidf),
        "2": lambda: movie_recommendation(model, tfidf, movie_review_matrix),
        "3": lambda: NER(),
    }

    while True:
        for line in menu_lines:
            print(line)

        choice = input("Enter your option: ")

        # Exit is handled first because it is the only option ending the loop.
        if choice == "4":
            print("Have a nice day")
            return

        handler = handlers.get(choice)
        if handler is None:
            print("Please choose between 1-4")
        else:
            handler()


In [76]:
# Start the interactive menu. This call blocks on input() and only returns
# once option "4" (Exit) is entered.
menu()

Model loaded from model.pickle

Menu
1. Write Review
2. Get Movie Recommendation
3. Get NER
4. Exit


Model loaded from model.pickle
Your review: I found this movie incredibly disappointing, with bland characters, an unoriginal plot, and dull dialogue that made it feel painfully long and pointless.
The result of the review is: negative

Menu
1. Write Review
2. Get Movie Recommendation
3. Get NER
4. Exit
Please choose between 1-4

Menu
1. Write Review
2. Get Movie Recommendation
3. Get NER
4. Exit
Model loaded from model.pickle
Your review: An absolute joy to watch, filled with heartfelt moments, relatable characters, and a message of hope that stays with you long after.
The result of the review is: positive

Menu
1. Write Review
2. Get Movie Recommendation
3. Get NER
4. Exit
Model loaded from model.pickle
Your review: This movie exceeded all my expectations with its brilliant storytelling, outstanding performances, and beautiful cinematography that kept me engaged and inspired throughout.
The result of the review is: positive

Menu
1. Write Review
2. Get Movie Recommendation
3. Get NER