<a href="https://colab.research.google.com/github/Robby-Akbar/ProjectNLP/blob/main/colab/recommended_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [100]:
import pandas as pd
import ast
import numpy as np

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split

In [58]:
#Load data from link
url = 'https://raw.githubusercontent.com/Robby-Akbar/ProjectNLP/main/output/data/'
dataset = pd.read_csv(url+"dataset_mod.csv")

In [59]:
#format string genres to array
dataset['genres'] = dataset['genres'].apply(lambda x: ast.literal_eval(x))
#format string keywords to array
dataset['keywords'] = dataset['keywords'].apply(lambda x: ast.literal_eval(x))
#format string cast to array
dataset['cast'] = dataset['cast'].apply(lambda x: ast.literal_eval(x))

In [60]:
dataset.head()

Unnamed: 0,genres,id,original_title,overview,tagline,keywords,cast,director
0,"[Adventure, Fantasy, Family]",8844,Jumanji,siblings judy peter discover enchanted board g...,roll the dice and unleash the excitement!,"[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",John Lasseter
1,"[Romance, Comedy]",15602,Grumpier Old Men,family wedding reignites ancient feud nextdoor...,still yelling. still fighting. still ready for...,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",Joe Johnston
2,"[Comedy, Drama, Romance]",31357,Waiting to Exhale,"cheated on, mistreated stepped on, women holdi...",friends are the people who let you be yourself...,"[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",Howard Deutch
3,[Comedy],11862,Father of the Bride Part II,"george banks recovered daughter's wedding, rec...",just when his world is back to normal... he is...,"[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devi...",Forest Whitaker
4,"[Action, Crime, Drama, Thriller]",949,Heat,"obsessive master thief, neil mccauley leads to...",a los angeles crime saga,"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...",Charles Shyer


In [61]:
#mengecek kembali tidak ada data yang NaN, lalu dibuang
dataset.dropna(inplace=True)
dataset.isnull().sum()

genres            0
id                0
original_title    0
overview          0
tagline           0
keywords          0
cast              0
director          0
dtype: int64

In [62]:
dataset['overview'] = dataset['overview'].apply(lambda x:x.split())
dataset['tagline'] = dataset['tagline'].apply(lambda x:x.split())

In [63]:
# Combine all features into new column
dataset['features'] = dataset['overview'] + dataset['genres'] + dataset['tagline'] + dataset['keywords'] + dataset['cast']
dataset['features'] = dataset['features'].apply(lambda x: " ".join(x))
dataset['features'] = dataset['features'] + ' ' + dataset['director']
dataset['features'].head()

0    siblings judy peter discover enchanted board g...
1    family wedding reignites ancient feud nextdoor...
2    cheated on, mistreated stepped on, women holdi...
3    george banks recovered daughter's wedding, rec...
4    obsessive master thief, neil mccauley leads to...
Name: features, dtype: object

In [64]:
print(dataset['features'][0])

siblings judy peter discover enchanted board game opens door magical world, unwittingly invite alan adult trapped inside game 26 years living room. alan's hope freedom finish game, proves risky three find running giant rhinoceroses, evil monkeys terrifying creatures. Adventure Fantasy Family roll the dice and unleash the excitement! jealousy toy boy friendship friends rivalry boy next door new toy toy comes to life Tom Hanks Tim Allen Don Rickles Jim Varney Wallace Shawn John Ratzenberger Annie Potts John Morris Erik von Detten Laurie Metcalf R. Lee Ermey Sarah Freeman Penn Jillette John Lasseter


In [66]:
train, test = train_test_split(dataset, test_size=0.2)
train, val = train_test_split(train, test_size=0.1)

In [23]:
# Vektorisasi dokumen dengan TF-IDF
vektorisasi = TfidfVectorizer(max_features=10000, ngram_range=(1,3), max_df=0.7, smooth_idf=True, use_idf=True, sublinear_tf=True)

# Hitung fitur
X = vektorisasi.fit_transform(dataset['features']).toarray()

In [24]:
X.shape

(19943, 10000)

In [29]:
# Tfidf gives normalized vectors, linear_kernel will give the same result as cosine_similarity
# linear_kernel is computationally faster
similarity = linear_kernel(X, X)

In [25]:
movies_indices = dataset.reset_index()
titles = movies_indices['original_title']
indices = pd.Series(movies_indices.index, index=movies_indices['original_title'])

In [30]:
def get_recommendations(title):
    index = indices[title]
    score = list(enumerate(similarity[index]))
    score = sorted(score, key=lambda x: x[1], reverse=True)
    movies_indices = [i[0] for i in score]
    return titles.iloc[movies_indices[1:11]]

In [31]:
get_recommendations('The Dark Knight')

10847         The Dark Knight Rises
110                  Batman Forever
972                  Batman Returns
12696              Reasonable Doubt
12415                   Just Wright
9134            Law Abiding Citizen
9676     Batman: Under the Red Hood
13007                       Tokarev
3213                      The Score
15108       Kidnapping Mr. Heineken
Name: original_title, dtype: object

In [56]:
get_recommendations('Star Trek')

12361           Scanners III: The Takeover
971          Star Trek IV: The Voyage Home
15978                      Robot Overlords
12849                   Threads of Destiny
19734    Recon 2020:  The Caprini Massacre
1332                        Small Soldiers
12205                              Riddick
2468          Teenage Mutant Ninja Turtles
7023                        In Old Chicago
14054                   Terminator Genisys
Name: original_title, dtype: object

In [67]:
tfidf_vectorizer = TfidfVectorizer(
    min_df=5, max_features=16000, strip_accents='unicode', lowercase=True,
    analyzer='word', token_pattern=r'\w+', ngram_range=(1, 3), use_idf=True, 
    smooth_idf=True, sublinear_tf=True, stop_words = 'english'
)

In [68]:
tfidf_vectorizer.fit(train['features'])

TfidfVectorizer(max_features=16000, min_df=5, ngram_range=(1, 3),
                stop_words='english', strip_accents='unicode',
                sublinear_tf=True, token_pattern='\\w+')

In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score

In [91]:
X_train_tfidfmatrix = tfidf_vectorizer.transform(train['features'].values)
X_test_tfidfmatrix = tfidf_vectorizer.transform(test['features'].values)

y_train = train['genres'].values

##Evaluation

In [94]:
def evaluate(X, y, clf=None):
    probas = cross_val_predict(clf, X, y, cv=StratifiedKFold(n_splits=5, random_state=8), 
                              n_jobs=-1, method='predict_proba', verbose=2)
    pred_indices = np.argmax(probas, axis=1)
    classes = np.unique(y)
    preds = classes[pred_indices]
    print('Log loss: {}'.format(log_loss(y, probas)))
    print('Accuracy: {}'.format(accuracy_score(y, preds)))

In [95]:
evaluate(X_train_tfidfmatrix, y_train, clf=XGBClassifier())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 10.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 10.8min finished


Log loss: 0.2407210799783047
Accuracy: 0.9295862933556206


In [96]:
evaluate(X_train_tfidfmatrix, y_train, clf=LogisticRegression())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.0min finished


Log loss: 1.0803729327237264
Accuracy: 0.7459256163811115


In [97]:
evaluate(X_train_tfidfmatrix, y_train, clf=AdaBoostClassifier())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   36.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   36.1s finished


Log loss: 2.8603300106938545
Accuracy: 0.3448948321493244


In [98]:
evaluate(X_train_tfidfmatrix, y_train, clf=MultinomialNB())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.4s finished


Log loss: 2.066193130352132
Accuracy: 0.44372475275107953


In [99]:
evaluate(X_train_tfidfmatrix, y_train, clf=svm.SVC(probability=True))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 106.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 106.1min finished


Log loss: 0.5402945859776884
Accuracy: 0.8331940381668757


##Training

In [101]:
clf = XGBClassifier()
clf.fit(X_train_tfidfmatrix, y_train)

XGBClassifier(objective='multi:softprob')

In [102]:
y_test_predicted = clf.predict_proba(X_test_tfidfmatrix)

In [104]:
y_test_predicted.shape

(3989, 20)