# Import Libraries

In [26]:
import os
import pickle
import random

import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Setting Variables

In [27]:
# Shared NLTK helpers used by preprocess_text.
# NOTE(review): presumably the NLTK "stopwords"/"wordnet" corpora must already
# be downloaded for these lines and their later use to work — confirm.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token stopword membership test is a hash lookup.
eng_stopwords = set(stopwords.words("english"))

# Read Data

In [28]:
# Location of the review CSV, relative to the notebook's working directory.
DATASET_PATH = "./Dataset/imdb-movies-dataset.csv"

dataset = pd.read_csv(DATASET_PATH)
dataset.head()

Unnamed: 0,Title,Rating,Review
0,The Idea of You,6.4,"This film, as well as the reaction to it, is a..."
1,Kingdom of the Planet of the Apes,7.3,"I'm a big fan of all the planet of the apes, a..."
2,Unfrosted,5.5,Pretty much the worst criticism you can lay on...
3,The Fall Guy,7.3,Just got out of the Austin premier at SXSW and...
4,Challengers,7.7,This is a tough one. I liked the concept and t...


In [29]:
# Number of missing values in each column before cleaning.
dataset.isna().sum()

Title     4971
Rating    5151
Review    5184
dtype: int64

In [30]:
# Keep only the rows in which every column is present.
# The explicit .copy() makes `dataset` an independent frame, so the columns
# assigned to it in later cells are not flagged with pandas'
# SettingWithCopyWarning (relevant when copy-on-write is not enabled).
dataset = dataset.dropna().copy()

In [31]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
dataset.isna().sum()

Title     0
Rating    0
Review    0
dtype: int64

In [32]:
# Derive the target label: a rating strictly greater than 5 is "positive",
# anything else is "negative".
dataset["Sentiment"] = dataset["Rating"].gt(5).map({True: "positive", False: "negative"})

In [33]:
# Number of reviews per derived sentiment label (shows how balanced the classes are).
dataset["Sentiment"].value_counts()

Sentiment
positive    4514
negative     272
Name: count, dtype: int64

# Data Preprocessing

In [34]:
def preprocess_text(sentence):
    """Turn raw text into a list of normalised tokens.

    The text is tokenised with ``word_tokenize`` and each token is lower-cased.
    Tokens that are English stopwords, or that are not purely alphabetic
    (punctuation, numbers, mixed tokens), are discarded. Every remaining token
    is stemmed with the Snowball stemmer and the stem is then passed through
    the WordNet lemmatizer.

    Parameters
    ----------
    sentence : str
        Raw text to process.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    processed = []

    for raw_token in word_tokenize(sentence):
        token = raw_token.lower()

        # Drop stopwords and anything that is not made of letters only.
        if token in eng_stopwords or not token.isalpha():
            continue

        # Stem first, then lemmatise the resulting stem.
        processed.append(lemmatizer.lemmatize(stemmer.stem(token)))

    return processed

# Frequency Distribution

In [35]:
# Classifier inputs (review text) and targets (sentiment labels).
X = dataset["Review"]
Y = dataset["Sentiment"]

# Join every review into a single string and preprocess it in one call to get
# the token list for the whole corpus.
all_reviews = ' '.join(X)
all_tokens = preprocess_text(all_reviews)

# Corpus-wide token frequencies; the keys of this distribution are the
# vocabulary that extract_features iterates over.
freq_dist = FreqDist(all_tokens)
print (freq_dist.most_common(10))

[('film', 13735), ('movi', 13267), ('one', 6096), ('like', 5519), ('charact', 4285), ('time', 3934), ('make', 3703), ('see', 3621), ('stori', 3543), ('good', 3259)]


# Extract Features

In [36]:
def extract_features(review, vocabulary=None):
    """Build a word-presence feature dictionary for one review.

    Parameters
    ----------
    review : iterable of str
        Tokens of a single review, e.g. the list returned by
        ``preprocess_text``. A plain string is also accepted and, as before,
        is matched by substring containment.
    vocabulary : iterable of str, optional
        Words to emit a feature for. When omitted, every key of the
        module-level ``freq_dist`` is used.

    Returns
    -------
    dict
        Maps each vocabulary word to ``True`` if it occurs in ``review`` and
        ``False`` otherwise.
    """
    if vocabulary is None:
        vocabulary = freq_dist.keys()

    # `word in <list>` rescans the list for every vocabulary word. Hashing the
    # review's tokens once turns each lookup into a constant-time check.
    # Strings keep their original substring semantics; sets are reused as-is.
    if isinstance(review, (str, set, frozenset)):
        tokens = review
    else:
        tokens = set(review)

    return {word: (word in tokens) for word in vocabulary}

In [37]:
# Fixed seed so the shuffled order — and therefore the train/test split that
# is sliced from feature_sets — is the same on every run of the notebook.
SHUFFLE_SEED = 42

# One (feature dict, sentiment label) pair per review.
feature_sets = [
    (extract_features(preprocess_text(review)), sentiment)
    for review, sentiment in zip(X, Y)
]

# A dedicated Random instance keeps the seeding local instead of resetting
# the global random state.
random.Random(SHUFFLE_SEED).shuffle(feature_sets)

# Load and Train Model : Naive Bayes Classifier

In [38]:
def train_and_save_model():
    """Train a Naive Bayes classifier on ``feature_sets`` and pickle it.

    The first 80% of the module-level ``feature_sets`` is used for training
    and the remainder for evaluation. The test accuracy and the 10 most
    informative features are printed, and the trained classifier is written
    to ``./model.pickle``.

    Returns
    -------
    The trained classifier.
    """
    train_count = int(len(feature_sets) * 0.8)
    train_set = feature_sets[:train_count]
    test_set = feature_sets[train_count:]

    classifier = NaiveBayesClassifier.train(train_set)
    test_accuracy = accuracy(classifier, test_set)
    print (f"Accuracy Model : {test_accuracy*100:.2f}%")
    classifier.show_most_informative_features(10)

    # The context manager closes the file even if pickling raises.
    with open("./model.pickle", "wb") as file:
        pickle.dump(classifier, file)

    return classifier

def load_model():
    """Return the classifier stored in ``./model.pickle``.

    If the file does not exist, a new model is trained (and saved) through
    ``train_and_save_model`` and returned instead.

    Returns
    -------
    The unpickled or freshly trained classifier.
    """
    if not os.path.exists("./model.pickle"):
        print ("Model not found! Training Model...")
        return train_and_save_model()

    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # model file that was produced by this notebook.
    # The context manager closes the file even if unpickling raises.
    with open("./model.pickle", "rb") as file:
        classifier = pickle.load(file)

    print ("Model Load Successfully")
    return classifier


# Word Embedding Language Model

In [39]:
def tf_idf(query, top_n=2):
    """Print the titles of the movies whose reviews best match ``query``.

    A TF-IDF vectorizer (English stop words removed) is fitted on
    ``dataset["Review"]``. The query is transformed with the same vectorizer
    and every review is ranked by its cosine similarity to the query.

    Parameters
    ----------
    query : str
        Free-text review to compare against the dataset's reviews.
    top_n : int, default 2
        Number of titles to print. Expected to be a positive integer.

    Returns
    -------
    None
        The recommendations are printed, not returned.
    """
    vectorizer = TfidfVectorizer(stop_words="english")
    tf_idf_matrix = vectorizer.fit_transform(dataset["Review"])
    query_vec = vectorizer.transform([query])

    similarity = cosine_similarity(tf_idf_matrix, query_vec).flatten()

    # assign() returns a new frame, so the shared `dataset` no longer gains a
    # "Similarity" column as a side effect of asking for a recommendation.
    ranked = dataset.assign(Similarity=similarity).sort_values(by="Similarity", ascending=False)

    # Pick the title by column name rather than by position 0. head() also
    # copes with a dataset that has fewer than `top_n` rows.
    top_titles = ranked["Title"].head(top_n)

    print (f"Top {top_n} Movies Recommendation")
    for rank, title in enumerate(top_titles, start=1):
        print (f"{rank}. {title}")

# NER

In [40]:
# Run spaCy named-entity recognition over the first 500 reviews (joined into
# one text) and group the entity strings by their entity label.
nlp = spacy.load("en_core_web_sm")

paragraph = " ".join(dataset["Review"].iloc[:500])
doc = nlp(paragraph)

categories = {}
for ent in doc.ents:
    label = ent.label_
    # setdefault creates the list the first time a label is seen.
    categories.setdefault(label, []).append(ent.text)

# Menu Function

In [None]:
# Session state shared by the menu functions. "No Review" doubles as the
# sentinel menu_2 checks to tell whether a review has been entered yet.
my_review = "No Review"
my_category = "Unknown"

def menu_1(loaded_classifier):
    """Prompt for a review, classify it and store both in the session state.

    The input is accepted only when it tokenises to more than 20 tokens. An
    accepted review is preprocessed, converted to features and classified
    with ``loaded_classifier``; the module-level ``my_review`` and
    ``my_category`` are then updated. A rejected input leaves the state
    untouched.

    Parameters
    ----------
    loaded_classifier
        Object exposing ``classify(features)``, used to label the review.
    """
    global my_review, my_category

    min_tokens = 20  # the review must contain MORE than this many tokens

    query = input("Input Query : ")
    # The length check counts every token word_tokenize returns, before any
    # punctuation or stopword filtering is applied.
    token_count = len(word_tokenize(query))

    if token_count <= min_tokens:
        # The message now states the rule the check enforces; the old text
        # ("must be 20 words") was shown even for an input of exactly 20.
        print (f"Input must be more than {min_tokens} words")
        return

    # Classify first so the stored review and category are only updated
    # together, after classification has succeeded.
    category = loaded_classifier.classify(extract_features(preprocess_text(query)))

    my_review = query
    my_category = category

    print ("Review Successfully Updated")

def menu_2():
    """Ask which language model to use and print recommendations for the stored review.

    Does nothing except print a reminder when no review has been entered yet
    (``my_review`` still holds the "No Review" sentinel).
    """
    if my_review == "No Review":
        print ("Please input review first")
        return

    print("\nChoose Language Model")
    print("1. Word2Vec \n2. TF-IDF \n3. N-GRAM")
    chosen_model = input("Please choose your model")

    # All three listed options are currently served by the same TF-IDF
    # recommender.
    if chosen_model in ("1", "2", "3"):
        tf_idf(my_review)
    else:
        print ("Invalid Input!")

def menu_3():
    """Print each NER label followed by the comma-separated entity texts collected for it."""
    print ("NER:\n")

    report_lines = [
        f"{label}: {', '.join(entity_texts)}"
        for label, entity_texts in categories.items()
    ]
    for line in report_lines:
        print (line)


# Main Function

In [43]:
def main_menu():
    """Run the interactive menu loop of the recommendation application.

    Loads (or trains) the classifier once, then repeatedly shows the current
    review and category, reads a menu choice and dispatches to the matching
    menu function until option 4 is chosen.
    """
    loaded_classifier = load_model()

    # Menu choice -> action. Lambdas defer the name lookup to call time.
    handlers = {
        "1": lambda: menu_1(loaded_classifier),
        "2": lambda: menu_2(),
        "3": lambda: menu_3(),
    }

    while True:
        print ("Movie Recommendation Application Based on Review")
        print (f"Your Review: {my_review}")
        print (f"My Category: {my_category}")

        print ("1. Write your Review\n2. View Movie Recommendation\n3. View NER\n4. Exit")

        try:
            choice = input(">> ")

            if choice == "4":
                print ("Exit Menu...")
                break

            handler = handlers.get(choice)
            if handler is None:
                print ("Input Invalid!")
            else:
                handler()
        except ValueError:
            # The choice is compared as a string and never converted to a
            # number here, so this branch is only reached when the input call
            # or one of the menu handlers raises ValueError.
            print ("Input Invalid, please enter a valid number!")

In [46]:
# Start the interactive menu; it keeps prompting until option 4 ("Exit") is chosen.
main_menu()

Model Load Successfully
Movie Recommendation Application Based on Review
Your Review: No Review
My Category: Unknown
1. Write your Review
2. View Movie Recommendation
3. View NER
4. Exit


TypeError: argument of type 'function' is not iterable