In [1]:
# Import all the libraries needed
import os
import pickle
import random

import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Shared NLP resources reused by every preprocessing call
eng_stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

# Load the review data from CSV
dataset = pd.read_csv("Dataset/imdb-movies-dataset.csv")

# Count missing values per column (not displayed: it is not the last expression of the cell)
dataset.isnull().sum()
# Keep only the rows that have no missing values
dataset = dataset.dropna()

# Add the Sentiment label column: a rating above 5 is 'positive', anything else is 'negative'
dataset['Sentiment'] = dataset['Rating'].gt(5).map({True: 'positive', False: 'negative'})

# Tip: dataset['Sentiment'].value_counts() shows how many rows fall into each class
dataset.head(5)

Unnamed: 0,Title,Rating,Review,Sentiment
0,The Idea of You,6.4,"This film, as well as the reaction to it, is a...",positive
1,Kingdom of the Planet of the Apes,7.3,"I'm a big fan of all the planet of the apes, a...",positive
2,Unfrosted,5.5,Pretty much the worst criticism you can lay on...,positive
3,The Fall Guy,7.3,Just got out of the Austin premier at SXSW and...,positive
4,Challengers,7.7,This is a tough one. I liked the concept and t...,positive


In [3]:
def preprocess_text(sentence):
    """Turn raw text into a list of normalised tokens.

    Each token is lower-cased, dropped if it is an English stopword or is not
    purely alphabetic, then stemmed and finally lemmatized (default POS).

    Args:
        sentence: The raw text to tokenize.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    cleaned = []
    for raw_token in word_tokenize(sentence):
        token = raw_token.lower()  # normalise case before the stopword lookup
        # Skip stopwords as well as punctuation / numbers / mixed tokens
        if token in eng_stopwords or not token.isalpha():
            continue
        # Stem first, then lemmatize the stem with the lemmatizer's default settings
        cleaned.append(lemmatizer.lemmatize(stemmer.stem(token)))
    return cleaned

In [4]:
# Frequency distribution of the preprocessed tokens over all reviews
X, Y = dataset["Review"], dataset["Sentiment"]

# Join every review into one long text so the corpus is preprocessed in a single call
all_reviews = ' '.join(X.tolist())
all_tokens = preprocess_text(all_reviews)

# Count how often each token occurs
freq_dist = FreqDist()
freq_dist.update(all_tokens)

# Show the ten most frequent tokens
print(freq_dist.most_common(10))

[('film', 13735), ('movi', 13267), ('one', 6096), ('like', 5519), ('charact', 4285), ('time', 3934), ('make', 3703), ('see', 3621), ('stori', 3543), ('good', 3259)]


In [5]:
# Extract Text Features
def extract_feature(review):
    """Build the presence/absence feature dictionary for one review.

    Args:
        review: Iterable of preprocessed tokens (e.g. the list returned by
            ``preprocess_text``). A plain string is still accepted and is
            matched by substring, exactly as before.

    Returns:
        dict: One entry per key of the module-level ``freq_dist``; the value is
        True when that word occurs in ``review`` and False otherwise.
    """
    # A set gives O(1) membership tests; scanning the token list once per
    # vocabulary word made every call O(vocabulary * review length).
    lookup = review if isinstance(review, str) else set(review)
    return {word: (word in lookup) for word in freq_dist}

# One (feature dictionary, sentiment label) pair per review
feature_sets = [
    (extract_feature(preprocess_text(review)), sentiment)
    for review, sentiment in zip(X, Y)
]

# Optional step: shuffle so the later train/test split does not follow the dataset's row order.
# A dedicated, seeded generator keeps the split (and therefore the reported accuracy)
# reproducible across "Restart & Run All" without touching the global random state.
random.Random(42).shuffle(feature_sets)

In [6]:
# Load & Train Model
def train_and_save_model(train_ratio=0.8, model_path="model.pickle"):
    """Train a Naive Bayes classifier on ``feature_sets``, report it and pickle it.

    Args:
        train_ratio: Fraction of ``feature_sets`` (taken from the front) used for
            training; the remainder is the test set. Defaults to 0.8.
        model_path: File the trained classifier is pickled to.
            Defaults to "model.pickle".

    Returns:
        The trained classifier.
    """
    train_count = int(len(feature_sets) * train_ratio)
    train_set = feature_sets[:train_count]
    test_set = feature_sets[train_count:]

    classifier = NaiveBayesClassifier.train(train_set)
    test_accuracy = accuracy(classifier, test_set)
    print(f"Akurasi data test: {test_accuracy * 100:.2f}%")
    classifier.show_most_informative_features(5)

    # The context manager closes the file even when pickling raises
    with open(model_path, "wb") as file:
        pickle.dump(classifier, file)

    return classifier
        
def load_model():
    """Return the classifier stored in ``model.pickle``, training one if it is missing.

    Returns:
        The unpickled classifier when the file exists in the working directory,
        otherwise the result of ``train_and_save_model()``.
    """
    model_path = "model.pickle"
    if os.path.exists(model_path):
        # NOTE(security): pickle.load can execute arbitrary code. Only load a
        # model file that this notebook produced itself.
        # The context manager closes the file even when unpickling raises.
        with open(model_path, "rb") as file:
            classifier = pickle.load(file)
        print("Model loaded successfully.")
    else:
        print("Model not found! Training model...")
        classifier = train_and_save_model()
    return classifier


### Embedding Language Model
Saat UAP seharusnya hanya salah satu model yang diminta, tapi karena belum tahu kalian dapat soal yang mana, pelajari saja semuanya ya ehe 😁

In [7]:
# TF-IDF
def tf_idf(query):
    """Print the two movies whose reviews are most similar to ``query`` under TF-IDF.

    A new vectorizer is fitted on ``dataset["Review"]`` on every call. Rows are
    ranked by the cosine similarity between the query vector and each review
    vector. The ranking is done on a copy, so the module-level ``dataset`` is
    left unchanged, and the title is read from the "Title" column by name
    rather than by column position.

    Args:
        query: The text to find similar reviews for.

    Returns:
        None. The (up to) two best-matching titles are printed.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    # fit_transform learns the vocabulary / IDF weights and returns the reviews as a TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(dataset["Review"])

    # transform expects a collection of documents, hence the one-element list
    query_vec = vectorizer.transform([query])

    # One similarity score per review, flattened to 1-D
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # assign() returns a new frame, so repeated calls do not add columns to the shared dataset
    ranked = dataset.assign(Similarity=similarity).sort_values(by='Similarity', ascending=False)

    print("\nTop 2 Movie Recommendation for you:")
    # head(2) yields fewer rows instead of raising when the dataset has less than two rows
    for rank, title in enumerate(ranked["Title"].head(2), start=1):
        print(f"{rank}: ", title)
    
# N-gram models (unigram, bigram, trigram)
def ngram(query, ngram_n=3):
    """Print the two movies whose reviews are most similar to ``query`` using n-gram TF-IDF.

    Works like plain TF-IDF matching, but the vectorizer is built with
    ``ngram_range=(1, ngram_n)``. A new vectorizer is fitted on
    ``dataset["Review"]`` on every call. The ranking is done on a copy, so the
    module-level ``dataset`` is left unchanged, and the title is read from the
    "Title" column by name rather than by column position.

    Args:
        query: The text to find similar reviews for.
        ngram_n: Largest n-gram size to include. Defaults to 3
            (unigrams, bigrams and trigrams).

    Returns:
        None. The (up to) two best-matching titles are printed.
    """
    ngram_range = (1, ngram_n)

    vectorizer = TfidfVectorizer(ngram_range=ngram_range, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(dataset["Review"])

    # transform expects a collection of documents, hence the one-element list
    query_vec = vectorizer.transform([query])

    # One similarity score per review, flattened to 1-D
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # assign() returns a new frame, so repeated calls do not add columns to the shared dataset
    ranked = dataset.assign(Similarity=similarity).sort_values(by='Similarity', ascending=False)

    print("\nTop 2 Movie Recommendation for you:")
    # head(2) yields fewer rows instead of raising when the dataset has less than two rows
    for rank, title in enumerate(ranked["Title"].head(2), start=1):
        print(f"{rank}: ", title)
    
# Word2Vec
# Word2Vec produces word-level vectors, so a helper (avg_vector) is needed to build one
# vector per query / sentence that can then be compared with cosine_similarity.
def avg_vector(tokens, model):
    """Average the word vectors of the tokens that ``model`` knows.

    Args:
        tokens: Iterable of tokens to look up in ``model.wv``.
        model: Object exposing ``wv`` (token -> vector lookup) and ``vector_size``.

    Returns:
        The element-wise mean of the found vectors, or a zero vector of length
        ``model.vector_size`` when none of the tokens is in ``model.wv``.
    """
    known_vectors = []
    for token in tokens:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # No token is known to the model: fall back to an all-zero vector
    return np.zeros(model.vector_size)

def word2vec(query):
    """Print the two movies whose reviews are most similar to ``query`` using Word2Vec.

    A Word2Vec model is trained on the preprocessed reviews on every call. The
    query and each review are reduced to the average of their word vectors and
    ranked by cosine similarity. The ranking is done on a copy, so the
    module-level ``dataset`` is left unchanged, and the title is read from the
    "Title" column by name rather than by column position.

    Args:
        query: The text to find similar reviews for.

    Returns:
        None. The (up to) two best-matching titles are printed.
    """
    # Preprocess every review once; the same token lists train the model and
    # are later averaged into document vectors.
    tokenized_docs = [preprocess_text(review) for review in dataset["Review"]]
    # sg=1 selects skip-gram; sg=0 would select CBOW
    w2v_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=1, sg=1)

    # cosine_similarity expects 2-D input, so the query vector is reshaped to a single row
    query_vec = avg_vector(preprocess_text(query), w2v_model).reshape(1, -1)

    review_vec = [avg_vector(tokens, w2v_model) for tokens in tokenized_docs]
    similarity = cosine_similarity(query_vec, review_vec)[0]

    # assign() returns a new frame, so repeated calls do not add columns to the shared dataset
    ranked = dataset.assign(Similarity=similarity).sort_values(by='Similarity', ascending=False)

    print("\nTop 2 Movie Recommendation for you:")
    # head(2) yields fewer rows instead of raising when the dataset has less than two rows
    for rank, title in enumerate(ranked["Title"].head(2), start=1):
        print(f"{rank}: ", title)

In [8]:
# NER with spaCy
# Only the first 500 reviews are analysed, joined into one text
paragraph = ' '.join(dataset['Review'].head(500))

# If the model is not installed yet, run once: spacy.cli.download('en_core_web_sm')
nlp = spacy.load('en_core_web_sm')
# To process a longer text, raise the limit first, e.g.: nlp.max_length = 10000000
doc = nlp(paragraph)

# Group the text of every recognised entity under its entity label
categories = {}
for ent in doc.ents:
    categories.setdefault(ent.label_, []).append(ent.text)

- `nlp.max_length = 10000000`: dipakai kalau teks yang diproses lebih panjang dari batas default spaCy, yaitu 1.000.000 karakter (misalnya kalau jumlah kata di dataset banyak banget). Tidak wajib dipakai; alternatifnya, batasi saja input ke 500 baris pertama supaya prosesnya tidak terlalu berat 😅

In [9]:
# Application state shared by the menu functions.
# These placeholder values mean that no review has been entered yet.
my_review, my_category = "No Review", "Unknown"

def menu_1(loaded_classifier, min_words=20):
    """Ask for a review, validate its length and classify its sentiment.

    On success the module-level ``my_review`` and ``my_category`` are updated
    together; otherwise both are left untouched and a message is printed.

    Args:
        loaded_classifier: Object whose ``classify(features)`` returns the label.
        min_words: The review must contain more than this many words.
            Defaults to 20 (adjust to the assignment's requirement).
    """
    global my_review, my_category
    query = input("My query: ")
    # Count only tokens containing a letter or digit, so punctuation marks
    # produced by the tokenizer are not counted as words.
    words = [token for token in word_tokenize(query) if any(ch.isalnum() for ch in token)]

    if len(words) > min_words:
        # Classify before touching the globals: if classification fails, the
        # stored review and category stay consistent with each other.
        category = loaded_classifier.classify(extract_feature(preprocess_text(query)))
        my_review = query
        my_category = category
        print("Review successfully updated!")
    else:
        print(f"Input must be more than {min_words} words. Your input has {len(words)} words.")

def menu_2():
    """Let the user pick a language model and print recommendations for the stored review.

    Prints a reminder and returns when no review has been entered yet
    (``my_review`` still holds "No Review").
    """
    if my_review == "No Review":
        print("Please Input Review First")
        return

    print("\nChoose Language Model")
    print("1. Word2Vec \n2. TF-IDF \n3. N-GRAM")
    x = input("Chosen model: ").strip()

    # The dispatch must follow the numbering printed above
    # (options 1 and 3 were previously swapped).
    if x == '1':
        word2vec(my_review)
    elif x == '2':
        tf_idf(my_review)
    elif x == '3':
        ngram(my_review)
    else:
        print("Invalid input")

def menu_3():
    """Print every entity label followed by the comma-separated entity texts found for it."""
    print("\nNamed Entity Recognition")
    for label in categories:
        entity_texts = ", ".join(categories[label])
        print(f'{label}: {entity_texts}')


In [10]:
def main_menu():
    """Run the interactive menu loop until the user chooses to exit.

    The classifier is obtained once via ``load_model()``. Each iteration shows
    the stored review and category, reads a choice and dispatches to
    ``menu_1`` / ``menu_2`` / ``menu_3``. The loop ends on choice 4, or when
    the prompt is closed or interrupted (EOF / Ctrl-C).
    """
    loaded_classifier = load_model()

    while True:
        print("\nMovie Recommendation Application Based On Reviews")
        print(f"Your Review: {my_review}")
        print(f"Your Category: {my_category}")

        print('1. Write your review')
        print('2. View movie recommendation')
        print('3. View NER')
        print('4. Exit')

        try:
            choice = input(">> ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input is available or the prompt was interrupted: leave
            # the loop cleanly instead of surfacing a traceback.
            print("\nExiting application...")
            break

        if choice == '4':
            print("Exiting application...")
            break

        try:
            if choice == '1':
                menu_1(loaded_classifier)
            elif choice == '2':
                menu_2()
            elif choice == '3':
                menu_3()
            else:
                print("Invalid input. Please enter a number between 1 and 4.")
        except ValueError as error:
            # The choice is compared as text and never converted to a number, so
            # a ValueError here comes from the selected action. Report what
            # actually failed and keep the menu running.
            print(f"Operation failed: {error}")

In [11]:
# Start the interactive menu; the call returns once the loop inside main_menu() breaks
main_menu()

Model not found! Training model...
Akurasi data test: 93.01%
Most Informative Features
                     gen = True           negati : positi =     39.8 : 1.0
                    anus = True           negati : positi =     28.4 : 1.0
                 carlson = True           negati : positi =     28.4 : 1.0
                   dreck = True           negati : positi =     28.4 : 1.0
                 expound = True           negati : positi =     28.4 : 1.0

Movie Recommendation Application Based On Reviews
Your Review: No Review
Your Category: Unknown
1. Write your review
2. View movie recommendation
3. View NER
4. Exit
Input must be more than 20 words. Your input has 1 words.

Movie Recommendation Application Based On Reviews
Your Review: No Review
Your Category: Unknown
1. Write your review
2. View movie recommendation
3. View NER
4. Exit
Please Input Review First

Movie Recommendation Application Based On Reviews
Your Review: No Review
Your Category: Unknown
1. Write your review
2.