<a href="https://colab.research.google.com/github/Robby-Akbar/ProjectNLP/blob/main/colab/recommended_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Movie Recommendation with TFIDF

In [1]:
import pandas as pd
import ast
import numpy as np

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split

##Prepare Data

In [2]:
#Load data from link
url = 'https://raw.githubusercontent.com/Robby-Akbar/ProjectNLP/main/output/data/'
dataset = pd.read_csv(url+"dataset_mod.csv")

In [3]:
#format string genres to array
dataset['genres'] = dataset['genres'].apply(lambda x: ast.literal_eval(x))
#format string keywords to array
dataset['keywords'] = dataset['keywords'].apply(lambda x: ast.literal_eval(x))
#format string cast to array
dataset['cast'] = dataset['cast'].apply(lambda x: ast.literal_eval(x))

In [4]:
dataset.head()

Unnamed: 0,genres,id,original_title,overview,tagline,keywords,cast,director
0,"[Adventure, Fantasy, Family]",8844,Jumanji,siblings judy peter discover enchanted board g...,roll the dice and unleash the excitement!,"[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",John Lasseter
1,"[Romance, Comedy]",15602,Grumpier Old Men,family wedding reignites ancient feud nextdoor...,still yelling. still fighting. still ready for...,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",Joe Johnston
2,"[Comedy, Drama, Romance]",31357,Waiting to Exhale,"cheated on, mistreated stepped on, women holdi...",friends are the people who let you be yourself...,"[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",Howard Deutch
3,[Comedy],11862,Father of the Bride Part II,"george banks recovered daughter's wedding, rec...",just when his world is back to normal... he is...,"[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devi...",Forest Whitaker
4,"[Action, Crime, Drama, Thriller]",949,Heat,"obsessive master thief, neil mccauley leads to...",a los angeles crime saga,"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...",Charles Shyer


In [5]:
#mengecek kembali tidak ada data yang NaN, lalu dibuang
dataset.dropna(inplace=True)
dataset.isnull().sum()

genres            0
id                0
original_title    0
overview          0
tagline           0
keywords          0
cast              0
director          0
dtype: int64

In [6]:
#pecah kalimat menjadi sebuah list
dataset['overview'] = dataset['overview'].apply(lambda x:x.split())
dataset['tagline'] = dataset['tagline'].apply(lambda x:x.split())

In [7]:
# Combine all features into new column
dataset['features'] = dataset['overview'] + dataset['genres'] + dataset['tagline'] + dataset['keywords'] + dataset['cast']
dataset['features'] = dataset['features'].apply(lambda x: " ".join(x))
dataset['features'] = dataset['features'] + ' ' + dataset['director']
dataset['features'].head()

0    siblings judy peter discover enchanted board g...
1    family wedding reignites ancient feud nextdoor...
2    cheated on, mistreated stepped on, women holdi...
3    george banks recovered daughter's wedding, rec...
4    obsessive master thief, neil mccauley leads to...
Name: features, dtype: object

In [8]:
print(dataset['features'][0])

siblings judy peter discover enchanted board game opens door magical world, unwittingly invite alan adult trapped inside game 26 years living room. alan's hope freedom finish game, proves risky three find running giant rhinoceroses, evil monkeys terrifying creatures. Adventure Fantasy Family roll the dice and unleash the excitement! jealousy toy boy friendship friends rivalry boy next door new toy toy comes to life Tom Hanks Tim Allen Don Rickles Jim Varney Wallace Shawn John Ratzenberger Annie Potts John Morris Erik von Detten Laurie Metcalf R. Lee Ermey Sarah Freeman Penn Jillette John Lasseter


In [101]:
train, test = train_test_split(dataset, test_size=0.2)
train, val = train_test_split(train, test_size=0.1)

##Setting-Up TFIDF

In [95]:
# Vektorisasi dokumen dengan TF-IDF
tfidf_vectorizer = TfidfVectorizer(
    min_df=5, max_features=16000, strip_accents='unicode', lowercase=True,
    analyzer='word', token_pattern=r'\w+', ngram_range=(1, 3), max_df=0.7, use_idf=True, 
    smooth_idf=True, sublinear_tf=True, stop_words = 'english'
)

# Hitung fitur
tfidf_vectorizer.fit(dataset['features'])

TfidfVectorizer(max_df=0.7, max_features=16000, min_df=5, ngram_range=(1, 3),
                stop_words='english', strip_accents='unicode',
                sublinear_tf=True, token_pattern='\\w+')

In [102]:
X_train_tfidfmatrix = tfidf_vectorizer.transform(train['features'].values)
X_test_tfidfmatrix = tfidf_vectorizer.transform(test['features'].values)


y_train = train['genres'].apply(lambda x: x[0]).values

In [103]:
X_test_tfidfmatrix.shape

(3989, 16000)

##Evaluation
to choose best model

In [15]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score

In [18]:
def evaluate(X, y, clf=None):
    probas = cross_val_predict(clf, X, y, cv=StratifiedKFold(n_splits=5, random_state=8, shuffle=True), 
                              n_jobs=-1, method='predict_proba', verbose=2)
    pred_indices = np.argmax(probas, axis=1)
    classes = np.unique(y)
    preds = classes[pred_indices]

    return probas, preds;

###LogisticRegression

In [104]:
from sklearn.linear_model import LogisticRegression
probas, preds = evaluate(X_train_tfidfmatrix, y_train, clf=LogisticRegression())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   58.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   58.6s finished


In [107]:
print('Log loss: {}'.format(log_loss(y_train, probas)))
print('Accuracy: {}'.format(accuracy_score(y_train, preds)))

Log loss: 1.0802109126554862
Accuracy: 0.7478061011282908


###XGBClassifier

In [31]:
from xgboost import XGBClassifier
probas, preds = evaluate(X_train_tfidfmatrix, y_train, clf=XGBClassifier())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 12.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 12.2min finished


In [32]:
print('Log loss: {}'.format(log_loss(y_train, probas)))
print('Accuracy: {}'.format(accuracy_score(y_train, preds)))

Log loss: 0.23453618130409049
Accuracy: 0.9306310071040534


###AdaBoostClassifier

In [75]:
from sklearn.ensemble import AdaBoostClassifier
probas, preds = evaluate(X_train_tfidfmatrix, y_train, clf=AdaBoostClassifier())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   45.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   45.2s finished


In [76]:
print('Log loss: {}'.format(log_loss(y_train, probas)))
print('Accuracy: {}'.format(accuracy_score(y_train, preds)))

Log loss: 2.8483854299803135
Accuracy: 0.4165371308228451


###Naive Bayes

In [74]:
from sklearn.naive_bayes import MultinomialNB
probas, preds = evaluate(X_train_tfidfmatrix, y_train, clf=MultinomialNB())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


In [36]:
print('Log loss: {}'.format(log_loss(y_train, probas)))
print('Accuracy: {}'.format(accuracy_score(y_train, preds)))

Log loss: 2.060708670085041
Accuracy: 0.44637136091377627


###SVM

In [37]:
from sklearn import svm
probas, preds = evaluate(X_train_tfidfmatrix, y_train, clf=svm.SVC(probability=True))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 104.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 104.7min finished


In [None]:
print('Log loss: {}'.format(log_loss(y_train, probas)))
print('Accuracy: {}'.format(accuracy_score(y_train, preds)))

##Training

In [139]:
clf = XGBClassifier()
clf.fit(X_train_tfidfmatrix, y_train)

XGBClassifier(objective='multi:softprob')

In [140]:
#save model
from joblib import dump, load
dump(clf, 'recommended_model.joblib')

['recommended_model.joblib']