# Anime Genre Classifier
Using the description of the anime, this model will classify it into specific genres. The following dataset has been used:
https://www.kaggle.com/datasets/tarundalal/anime-dataset

The dataset has limited number of data, hence the accuracy score might be extremely poor.

## Dataset Preprocessing

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install nltk
import nltk
nltk.download('punkt')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
!pip install sentencepiece
import sentencepiece as spm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
anime_data = pd.read_csv('drive/MyDrive/Colab_Notebooks/AnimeWorld.csv')

In [23]:
ani = pd.DataFrame(anime_data)
ani.head()

Unnamed: 0,Anime,Genre,Description,Studio,Year,Rating
0,Re:Zero kara Hajimeru Isekai Seikatsu 2nd Season,"['Drama', 'Fantasy', 'Suspense']",A reunion that was supposed to spell the arriv...,White Fox,"Jul 8, 2020",8.37
1,"Kanojo, Okarishimasu","['Comedy', 'Romance']",Kazuya Kinoshita is a 20-year-old college stud...,TMS Entertainment,"Jul 11, 2020",
2,The God of High School,"['Action', 'Fantasy', 'Sci-Fi', 'Supernatural']","The ""God of High School"" tournament has begun,...",MAPPA,"Jul 6, 2020",
3,Maou Gakuin no Futekigousha: Shijou Saikyou no...,"['Action', 'Fantasy']","In the distant past, a war between humans and ...",SILVER LINK.,"Jul 4, 2020",
4,Enen no Shouboutai: Ni no Shou,"['Action', 'Supernatural']",After his confrontation in the Nether with his...,David Production,"Jul 4, 2020",


In [24]:
ani.columns

Index(['Anime', 'Genre', 'Description', 'Studio', 'Year', 'Rating'], dtype='object')

In [25]:
# Removing unnecessary columns, since this is going to be an NLP based project, so only synopsis and genre are required.
ani = ani.drop(['Studio', 'Year', 'Rating'], axis=1)
ani = ani.rename(columns={'Anime':'title', 'Genre':'genre', 'Description':'synopsis'})

In [26]:
ani.head()

Unnamed: 0,title,genre,synopsis
0,Re:Zero kara Hajimeru Isekai Seikatsu 2nd Season,"['Drama', 'Fantasy', 'Suspense']",A reunion that was supposed to spell the arriv...
1,"Kanojo, Okarishimasu","['Comedy', 'Romance']",Kazuya Kinoshita is a 20-year-old college stud...
2,The God of High School,"['Action', 'Fantasy', 'Sci-Fi', 'Supernatural']","The ""God of High School"" tournament has begun,..."
3,Maou Gakuin no Futekigousha: Shijou Saikyou no...,"['Action', 'Fantasy']","In the distant past, a war between humans and ..."
4,Enen no Shouboutai: Ni no Shou,"['Action', 'Supernatural']",After his confrontation in the Nether with his...


In [27]:
# replace the " character with an empty string in the 'genre' column
ani['genre'] = ani['genre'].str.replace('"', '')
ani['genre'] = ani['genre'].str.replace('[', '')
ani['genre'] = ani['genre'].str.replace(']', '')
ani['genre'] = ani['genre'].str.replace("'", '')
ani['genre'] = ani['genre'].str.replace("'", '')

  ani['genre'] = ani['genre'].str.replace('[', '')
  ani['genre'] = ani['genre'].str.replace(']', '')


In [28]:
unique_genres = set(ani['genre'].str.split(', ').sum())

In [29]:
unique_genres

{'',
 'Action',
 'Adventure',
 'Avant Garde',
 'Boys Love',
 'Comedy',
 'Drama',
 'Ecchi',
 'Fantasy',
 'Girls Love',
 'Gourmet',
 'Horror',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Slice of Life',
 'Sports',
 'Supernatural',
 'Suspense'}

In [30]:
# Convert synopsis column to string data type
ani['synopsis'] = ani['synopsis'].astype(str)

In [31]:
# tokenize the genres in the 'genre' column using NLTK
tokenized_genres = []
for genre in unique_genres:
    tokens = nltk.word_tokenize(genre)
    tokenized_genres.append(tokens)

In [32]:
print(tokenized_genres)

[[], ['Action'], ['Boys', 'Love'], ['Comedy'], ['Supernatural'], ['Drama'], ['Adventure'], ['Slice', 'of', 'Life'], ['Suspense'], ['Ecchi'], ['Avant', 'Garde'], ['Fantasy'], ['Mystery'], ['Sports'], ['Romance'], ['Horror'], ['Sci-Fi'], ['Girls', 'Love'], ['Gourmet']]


In [33]:
from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [34]:
# Tokenize the 'title', 'genre', and 'synopsis' columns
title_tokens = []
for title in ani['title']:
    tokens = tokenizer.encode(title, add_special_tokens=True)
    title_tokens.append(tokens)

genre_tokens = []
for genre in ani['genre']:
    tokens = tokenizer.encode(genre, add_special_tokens=True)
    genre_tokens.append(tokens)

synopsis_tokens = []
for synopsis in ani['synopsis']:
    tokens = tokenizer.encode(synopsis, add_special_tokens=True)
    synopsis_tokens.append(tokens)

# Add the tokenized 'title', 'genre', and 'synopsis' columns to the DataFrame
ani['title_tokens'] = title_tokens
ani['genre_tokens'] = genre_tokens
ani['synopsis_tokens'] = synopsis_tokens

# Save the tokenized DataFrame to a new CSV file
ani.to_csv('ani_toke.csv', index=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (646 > 512). Running this sequence through the model will result in indexing errors


## Feature Extraction

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
ani['text'] = ani['title_tokens'] + ani['synopsis_tokens']

In [37]:
# Convert the tokenized text into strings
ani['text'] = ani['text'].apply(lambda x: ' '.join(map(str, x)))

In [38]:
tfidf_vectorizer = TfidfVectorizer()

In [39]:
tfidf_matrix = tfidf_vectorizer.fit_transform(ani['text'])

In [40]:
# Fit and transform the vectorizer on the tokenized data
X = tfidf_vectorizer.fit_transform(ani['text'])

# Get the feature names
feature_names = tfidf_vectorizer.get_feature_names_out()

In [41]:
# Print the feature names
print(feature_names)

['100' '1000' '10000' ... '9995' '9996' '9997']


In [42]:
# Print the shape of the TF-IDF matrix
print(tfidf_matrix.shape)

(2981, 15323)


## Classification Model

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [44]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, ani['genre'], test_size=0.2, random_state=42)

In [45]:
# Create the Random Forest model with default parameters
rfc = RandomForestClassifier()

# Fit the model on the training data
rfc.fit(X_train, y_train)

In [46]:
# Make predictions on the test data
y_pred = rfc.predict(X_test)

# Print the accuracy score
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.17755443886097153


## SVC:

In [47]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, ani['genre'], test_size=0.2, random_state=42)

# Create the SVC model with default parameters
svm = SVC()

# Fit the model on the training data
svm.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm.predict(X_test)

# Print the accuracy score
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.12227805695142378
