Question: Given a show's name and synopsis, predict the score of the show

Steps
1. Read in dataset
2. Remove punctuation and common words (stopwords) - Could round shows up or down depending on the rating
3. Convert text to lower and sort words
4. ML models + evaluation

In [None]:
import nltk
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import spacy # python -m spacy download en_core_web_lg
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
file_name = "anime_dataset_2023.csv"
global_dir = "/workspaces/anime-predictor/" # Change this to whatever global directory
anime_dataset = pd.read_csv(global_dir + f"data/{file_name}")
anime_dataset = anime_dataset.replace("UNKNOWN", None).dropna(axis = 0)
anime_dataset.dropna(how="any", inplace=True, axis=1)

anime_dataset.head()

In [None]:
"""
Columns to keep (NLP Project): English name, score, synopsis
Can do other analysis of how different variables affect the score of a show
"""

In [None]:
def label_shows(row):
    if float(row["Score"]) >= 6.5:
        return "recommend"    
    return "don't recommend"

In [None]:
anime_nlp = anime_dataset[['Name', 'Score', 'Synopsis']]
anime_nlp["Label"] = anime_nlp.apply(label_shows, axis = 1)
anime_nlp['label_num'] = anime_nlp["Label"].map({"don't recommend":0, 'recommend':1})
anime_nlp['summary_len'] = anime_nlp["Synopsis"].apply(len)

anime_nlp.head()

## Data Cleaning

In [None]:
# Replacing UNKNOWN with null values (will remove those entries from the dataset)
anime_nlp = anime_nlp.replace("UNKNOWN", None).dropna(axis = 0)
anime_nlp = anime_nlp.sort_values("Score", ascending = False).reset_index().drop("index", axis = 1)

In [None]:
plt.figure(figsize=(12, 8))

anime_nlp[anime_nlp.Label=='recommend'].summary_len.plot(bins=35, kind='hist', color='blue', 
                                       label='recommend', alpha=0.6)
anime_nlp[anime_nlp.Label=="don't recommend"].summary_len.plot(kind='hist', color='red', 
                                       label="don't recommend", alpha=0.6)
plt.legend()
plt.xlabel("Summary Length")

In [None]:
stop_words = set(stopwords.words('english'))
nlp = spacy.load('en_core_web_lg')

def remove_keywords(row):
    word_tokens = word_tokenize(str(row))
    filtered_sentence = [w.lower() for w in word_tokens if w.isalpha() and w.lower() not in stop_words] # Removing stop words from sentence
    doc = nlp(' '.join(filtered_sentence).lower())
    proper_noun = [str(tok) for tok in doc if tok.pos_=='PROPN']
    # Removes better than nltk but removes a couple words that are not a PROPN
    key_words = [word.lemma_ for word in doc if word not in proper_noun]
    key_words = " ".join(key_words)
    # Add into function and .apply for whole DataFrame
    return key_words

In [10]:
anime_nlp["cleaned_synopsis"] = anime_nlp["Synopsis"].apply(remove_keywords)
anime_nlp["cleaned_names"] = anime_nlp["Name"].apply(remove_keywords)
anime_nlp["cleaned_summary"] = anime_nlp["cleaned_names"] + " " + anime_nlp["cleaned_synopsis"]

In [11]:
anime_nlp.head()

Unnamed: 0,Name,Score,Synopsis,Label,label_num,summary_len,cleaned_synopsis,cleaned_names,cleaned_summary
0,Fullmetal Alchemist: Brotherhood,9.1,After a horrific alchemy experiment goes wrong...,recommend,1,1137,horrific alchemy experiment go wrong elric hou...,fullmetal alchemist brotherhood,fullmetal alchemist brotherhood horrific alche...
1,Bleach: Sennen Kessen-hen,9.07,Substitute Soul Reaper Ichigo Kurosaki spends ...,recommend,1,972,substitute soul reaper ichigo kurosaki spend d...,bleach sennen,bleach sennen substitute soul reaper ichigo ku...
2,Steins;Gate,9.07,Eccentric scientist Rintarou Okabe has a never...,recommend,1,1166,eccentric scientist rintarou okabe thirst scie...,steins gate,steins gate eccentric scientist rintarou okabe...
3,Gintama°,9.06,"Gintoki, Shinpachi, and Kagura return as the f...",recommend,1,1173,gintoki shinpachi kagura return break member y...,,gintoki shinpachi kagura return break member ...
4,Kaguya-sama wa Kokurasetai: Ultra Romantic,9.05,The elite members of Shuchiin Academy's studen...,recommend,1,919,elite member shuchiin academy student council ...,wa kokurasetai ultra romantic,wa kokurasetai ultra romantic elite member shu...


In [12]:
from collections import Counter

words = anime_nlp[anime_nlp["Label"]=='recommend']["cleaned_summary"].apply(lambda x: [word.lower() for word in x.split()])
rec_words = Counter()

for msg in words:
    rec_words.update(msg)
    
print(rec_words.most_common(50))

[('school', 897), ('world', 804), ('one', 792), ('life', 717), ('however', 680), ('girl', 680), ('new', 642), ('find', 603), ('friend', 583), ('become', 505), ('student', 494), ('take', 475), ('high', 473), ('know', 464), ('year', 439), ('day', 426), ('begin', 410), ('make', 403), ('live', 385), ('two', 375), ('must', 366), ('young', 352), ('power', 334), ('name', 327), ('time', 317), ('human', 309), ('source', 309), ('love', 303), ('club', 301), ('come', 300), ('mysterious', 290), ('way', 290), ('get', 288), ('meet', 287), ('help', 282), ('work', 279), ('call', 279), ('fight', 277), ('together', 262), ('first', 258), ('force', 256), ('man', 252), ('family', 250), ('soon', 249), ('city', 247), ('people', 247), ('member', 246), ('even', 243), ('boy', 241), ('also', 241)]


In [13]:
from collections import Counter

words = anime_nlp[anime_nlp["Label"]=="don't recommend"]["cleaned_summary"].apply(lambda x: [word.lower() for word in x.split()])
not_rec_words = Counter()

for msg in words:
    not_rec_words.update(msg)
    
print(not_rec_words.most_common(50))

[('world', 175), ('girl', 153), ('school', 126), ('one', 125), ('source', 122), ('new', 107), ('life', 101), ('become', 100), ('however', 94), ('friend', 89), ('find', 80), ('year', 71), ('begin', 66), ('name', 65), ('day', 64), ('know', 63), ('student', 62), ('power', 62), ('young', 61), ('call', 61), ('time', 60), ('meet', 60), ('take', 59), ('must', 59), ('high', 59), ('use', 56), ('human', 55), ('live', 55), ('battle', 54), ('boy', 54), ('two', 54), ('save', 53), ('people', 52), ('mysterious', 51), ('way', 51), ('get', 51), ('ann', 50), ('make', 50), ('group', 49), ('game', 46), ('earth', 46), ('family', 45), ('go', 44), ('team', 44), ('war', 43), ('together', 43), ('fight', 42), ('good', 41), ('soon', 41), ('around', 39)]


## ML Modelling

In [14]:
# split X and y into training and testing sets 
from sklearn.model_selection import train_test_split

# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
X = anime_nlp["cleaned_summary"]
y = anime_nlp["label_num"]
print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1854,)
(1854,)
(1390,)
(464,)
(1390,)
(464,)


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer
vect = CountVectorizer()
vect.fit(X_train)

# learn training data vocabulary, then use it to create a document-term matrix
X_train_dtm = vect.transform(X_train)

# equivalently: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)

# examine the document-term matrix
print(type(X_train_dtm), X_train_dtm.shape)

# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
print(type(X_test_dtm), X_test_dtm.shape)

<class 'scipy.sparse._csr.csr_matrix'> (1390, 14586)
<class 'scipy.sparse._csr.csr_matrix'> (464, 14586)


In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_dtm)
tfidf_transformer.transform(X_train_dtm)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 91875 stored elements and shape (1390, 14586)>

In [17]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
nb = MultinomialNB() # Performed comparatively worse + does not have as many hyperparameters to tune compared to SVC
nb = SVC(gamma = "scale", class_weight = "balanced", C = 0.85)

In [18]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

CPU times: user 940 ms, sys: 0 ns, total: 940 ms
Wall time: 952 ms


In [19]:
from sklearn import metrics

# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

# calculate accuracy of class predictions
print("=======Accuracy Score===========")
print(metrics.accuracy_score(y_pred_class, y_test))

# print the confusion matrix
print("=======Confusion Matrix===========")
metrics.confusion_matrix(y_pred_class, y_test)

0.7607758620689655


array([[ 22,  61],
       [ 50, 331]])

In [20]:
# print message text for false positives (ham incorrectly classifier)
# X_test[(y_pred_class==1) & (y_test==0)]
X_test[y_pred_class > y_test]

1770    isuca poor shinichirou asano bad luck parent a...
1678    jikan shihaisha many class koyuki honda look f...
1579    coppelion first glance ibara naruse friend aoi...
1671    island tv remote island far mainland name uras...
1785    project scard praeter kizu siege foreign organ...
1602    kenka banchou otome girl beat boy kenka bancho...
1683    battle upon creation marble launch machine kno...
1697    togainu chi wake third world war leave japan r...
1628    ikkitousen kanto region seven high school figh...
1696    trickster edogawa ranpo shounen tanteidan yori...
1703    hidan aria aa akari mamiya student tokyo butei...
1700    dance devil ritsuka tachibana always good stud...
1804    koi koi tanaka tetsuro excite get full scholar...
1820    momo kyun sword momoko beautiful young sword f...
1846    bikini warrior darkness threaten world four he...
1829    shoumetsu toshi one day city suddenly disappea...
1748    keishichou tokumubu tokushu kyouakuhan taisaku...
1645    tokyo 

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

pipe = Pipeline([('bow', CountVectorizer()), 
                 ('tfid', TfidfTransformer()),  
                 ('model', SVC(gamma = "scale", class_weight = "balanced", C = 0.5, tol = 0.9))])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# calculate accuracy of class predictions
print("=======Accuracy Score===========")
print(metrics.accuracy_score(y_test, y_pred))

# print the confusion matrix
print("=======Confusion Matrix===========")
metrics.confusion_matrix(y_test, y_pred)

0.7931034482758621


array([[ 11,  61],
       [ 35, 357]])

In [22]:
file_name = "test.csv"
entry = pd.read_csv(global_dir + f"data/{file_name}")
entry["cleaned_summary"] = entry["summary"].apply(remove_keywords)
entry["cleaned_summary"]

0    solo levelling sing dub weak hunter mankind gr...
Name: cleaned_summary, dtype: object

In [23]:
test = entry["cleaned_summary"]
test_output = 1
vect_test = CountVectorizer()
vect_test.fit_transform(X_train)

# learn training data vocabulary, then use it to create a document-term matrix
test_entry = vect_test.transform(test)

# examine the document-term matrix
print(type(X_train_dtm), X_train_dtm.shape)

print(type(test_entry), test_entry.shape)

nb.predict(test_entry) # Output array is prediction, 1 = Recommend, 0 = Not Recommend

<class 'scipy.sparse._csr.csr_matrix'> (1390, 14586)
<class 'scipy.sparse._csr.csr_matrix'> (1, 14586)


array([1])

In [None]:
import mlflow
from sklearn.metrics import mean_squared_error
from mlflow.models import infer_signature
# Start a new MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_param("gamma", "scale") # gamma = "scale", class_weight = "balanced", C = 0.5, tol = 0.9
    mlflow.log_param("class_weight", "balanced")
    mlflow.log_param("C", 0.5)
    mlflow.log_param("tol", 0.9)
    # Train the model on the training data
    nb.fit(X_train_dtm, y_train)
    # Make predictions on the testing data
    predictions = nb.predict(X_test_dtm)
    # Calculate the MSE of the predictions
    mse = mean_squared_error(y_test, predictions)
    # Log the metric
    mlflow.log_metric("mse", mse)
    # # Log the model artifact
    # mlflow.sklearn.log_model(nb, "model")
    # mlflow.sklearn.save_model(nb, "model")
    # # Print the MSE of the predictions
    # print("Mean Squared Error:", mse)
    signature = infer_signature(X_test, y_pred)

    # Log parameters and metrics using the MLflow APIs
    # mlflow.log_params(params)
    mlflow.log_metrics({"mse": mean_squared_error(y_test, y_pred)})

    # Log the sklearn model and register as version 1
    mlflow.sklearn.log_model(
        sk_model=nb,
        artifact_path="sklearn-model",
        signature=signature,
        registered_model_name="sk-learn-svc-model",
    )
    # End the MLflow run
    mlflow.end_run()

Registered model 'sk-learn-svc-model' already exists. Creating a new version of this model...
Created version '2' of model 'sk-learn-svc-model'.


In [None]:
result = mlflow.register_model(
    "runs:/6b93d7ec436a4984b079be44ad0a6b93", "sk-learn-svc"
)

In [1]:
import mlflow
tracking_uri = mlflow.get_tracking_uri()
print(f"Current MLflow tracking URI: {tracking_uri}")

Current MLflow tracking URI: file:///workspaces/anime-predictor/files/mlruns


In [None]:
# from mlflow import MlflowClient

# client = MlflowClient()
# client.create_registered_model("sk-learn-svc-anime_model")