<a href="https://colab.research.google.com/github/Robby-Akbar/ProjectNLP/blob/main/colab/data_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression

In [None]:
#Load data from link
url = 'https://raw.githubusercontent.com/Robby-Akbar/ProjectNLP/main/output/data/'
dataset = pd.read_csv(url+"dataset_mod.csv")
train = pd.read_csv(url+"train_data.csv")
test = pd.read_csv(url+"test_data.csv")
val = pd.read_csv(url+"val_data.csv")

In [None]:
#format string genres to array
dataset['genres'] = dataset['genres'].apply(lambda x: ast.literal_eval(x))
train['genres'] = train['genres'].apply(lambda x: ast.literal_eval(x))
test['genres'] = test['genres'].apply(lambda x: ast.literal_eval(x))
val['genres'] = val['genres'].apply(lambda x: ast.literal_eval(x))

In [None]:
dataset.head()

Unnamed: 0,genres,id,original_title,overview,tagline,keywords,cast,director
0,"[Adventure, Fantasy, Family]",8844,Jumanji,siblings judy peter discover enchanted board g...,roll the dice and unleash the excitement!,"['jealousy', 'toy', 'boy', 'friendship', 'frie...","['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",John Lasseter
1,"[Romance, Comedy]",15602,Grumpier Old Men,family wedding reignites ancient feud nextdoor...,still yelling. still fighting. still ready for...,"['board game', 'disappearance', ""based on chil...","['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",Joe Johnston
2,"[Comedy, Drama, Romance]",31357,Waiting to Exhale,"cheated on, mistreated stepped on, women holdi...",friends are the people who let you be yourself...,"['fishing', 'best friend', 'duringcreditssting...","['Walter Matthau', 'Jack Lemmon', 'Ann-Margret...",Howard Deutch
3,[Comedy],11862,Father of the Bride Part II,"george banks recovered daughter's wedding, rec...",just when his world is back to normal... he is...,"['based on novel', 'interracial relationship',...","['Whitney Houston', 'Angela Bassett', 'Loretta...",Forest Whitaker
4,"[Action, Crime, Drama, Thriller]",949,Heat,"obsessive master thief, neil mccauley leads to...",a los angeles crime saga,"['baby', 'midlife crisis', 'confidence', 'agin...","['Steve Martin', 'Diane Keaton', 'Martin Short...",Charles Shyer


#Converting Text to Features

In [None]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(dataset['genres'])

# transform target variable
y = multilabel_binarizer.transform(dataset['genres'])

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)

In [None]:
# split dataset into training and validation set
xtrain, xval, ytrain, yval = train_test_split(dataset['overview'], y, test_size=0.2, random_state=9)

In [None]:
# create TF-IDF features
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [None]:
# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

In [None]:
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [None]:
# fit model on train data
clf.fit(xtrain_tfidf, ytrain)

OneVsRestClassifier(estimator=LogisticRegression())

In [None]:
# make predictions for validation set
y_pred = clf.predict(xval_tfidf)

In [None]:
y_pred[3]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
multilabel_binarizer.inverse_transform(y_pred)[3]

('Action',)

##Evaluate Performance

In [None]:
f1_score(yval, y_pred, average="micro")

0.4250940643476925

In [None]:
# predict probabilities
y_pred_prob = clf.predict_proba(xval_tfidf)

t = 0.3 # threshold value
y_pred_new = (y_pred_prob >= t).astype(int)

# evaluate performance
f1_score(yval, y_pred_new, average="micro")

0.5680243834857301

In [None]:
def infer_tags(q):
    q = clean_overview(q)
    q = remove_stopwords(q)
    q_vec = tfidf_vectorizer.transform([q])
    q_pred = clf.predict(q_vec)
    return multilabel_binarizer.inverse_transform(q_pred)

In [32]:
for i in range(5): 
  k = xval.sample(1).index[0] 
  print("Movie: ", dataset['original_title'][k], "\nPredicted genre: ", infer_tags(xval[k])), print("Actual genre: ",dataset['genres'][k], "\n")

NameError: ignored