In [1]:
import pandas as pd
import numpy as np


In [2]:
comic = pd.read_csv('comics_data.csv')

In [3]:
comic.head()

Unnamed: 0,title,description,rating,year,tags,cover
0,Salad Days (Tang LiuZang) - Part 2,The second season of Salad Days (Tang LiuZang).,4.7,2021.0,"['BL', 'Manhua', 'Romance', 'Shounen-ai', 'Spo...",https://cdn.anime-planet.com/manga/primary/sal...
1,The Master of Diabolism,As the grandmaster who founded the Demonic Sec...,4.7,2017.0,"['Action', 'Adventure', 'BL', 'Comedy', 'Manhu...",https://cdn.anime-planet.com/manga/primary/the...
2,JoJo's Bizarre Adventure Part 7: Steel Ball Run,"Set in 1890, Steel Ball Run spotlights Gyro Ze...",4.7,2004.0,"['Action', 'Adventure', 'Horror', 'Mystery', '...",https://cdn.anime-planet.com/manga/primary/joj...
3,A Sign of Affection,"Yuki is a typical college student, whose world...",4.7,2019.0,"['Romance', 'Shoujo', 'Slice of Life', 'Disabi...",https://cdn.anime-planet.com/manga/primary/a-s...
4,Moriarty the Patriot,"Before he was Sherlock’s rival, Moriarty fough...",4.7,2016.0,"['Mystery', 'Shounen', 'Detectives', 'England'...",https://cdn.anime-planet.com/manga/primary/mor...


In [4]:
comic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70948 entries, 0 to 70947
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        70948 non-null  object 
 1   description  70917 non-null  object 
 2   rating       29871 non-null  float64
 3   year         70124 non-null  float64
 4   tags         70948 non-null  object 
 5   cover        70948 non-null  object 
dtypes: float64(2), object(4)
memory usage: 3.2+ MB


Filtering | Preprocessing

In [5]:
# title - name of comic
# description - data
# year - data
# tags - data
# cover - links to refer
comic = comic[['title', 'description', 'year', 'tags', 'cover']]

In [6]:
comic.isnull().sum()

title            0
description     31
year           824
tags             0
cover            0
dtype: int64

In [7]:
comic.dropna(inplace=True)

In [8]:
comic.duplicated().sum()

0

list can be a string so convert it to list using ast.literal_eval
'[ ]' -> [ ]

In [9]:
type(comic['tags'][0])

str

In [10]:
import ast

In [11]:
comic['tags'] = comic['tags'].apply(lambda x: ast.literal_eval(x))

In [12]:
comic['tags'] = comic['tags'].apply(lambda x: [i.replace(" ", "") for i in x])
comic['tags'][4]

['Mystery',
 'Shounen',
 'Detectives',
 'England',
 'Europe',
 'Historical',
 'SherlockHolmes',
 'AdaptedtoAnime',
 'BasedonaNovel']

In [13]:
comic['overview'] = comic['year'].astype(str) + " " + comic['description'] + " " + comic['tags'].apply(' '.join)
comic['overview'][0]

'2021.0 The second season of\xa0Salad Days (Tang LiuZang). BL Manhua Romance Shounen-ai Sports Webtoons FullColor'

Server crashed duu to insufficient ram so reduce data

In [14]:
new_df = comic[['title', 'cover', 'overview']]
new_df = new_df.head(20000)

In [15]:
new_df.head()

Unnamed: 0,title,cover,overview
0,Salad Days (Tang LiuZang) - Part 2,https://cdn.anime-planet.com/manga/primary/sal...,2021.0 The second season of Salad Days (Tang L...
1,The Master of Diabolism,https://cdn.anime-planet.com/manga/primary/the...,2017.0 As the grandmaster who founded the Demo...
2,JoJo's Bizarre Adventure Part 7: Steel Ball Run,https://cdn.anime-planet.com/manga/primary/joj...,"2004.0 Set in 1890, Steel Ball Run spotlights ..."
3,A Sign of Affection,https://cdn.anime-planet.com/manga/primary/a-s...,"2019.0 Yuki is a typical college student, whos..."
4,Moriarty the Patriot,https://cdn.anime-planet.com/manga/primary/mor...,"2016.0 Before he was Sherlock’s rival, Moriart..."


Vectorize the data

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=5000, stop_words='english')


In [17]:
vectors = vectorizer.fit_transform(new_df['overview']).toarray()
vectors.shape

(20000, 5000)

In [26]:
for i in vectorizer.get_feature_names_out():
  print(i, end=" ")



Stemming to convert multiple forms of verbes into one

In [19]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [20]:
def stemming(text):
  y = []
  for i in text.split():
    y.append(ps.stem(i))
  return " ".join(y)

In [21]:
new_df['overview'] = new_df['overview'].apply(stemming)

Calculate the nearest vectors which are to be recommended as they will contain like wise content - tags

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
similarity = cosine_similarity(vectors)
similarity

array([[1.        , 0.21147747, 0.        , ..., 0.06201737, 0.        ,
        0.19611614],
       [0.21147747, 1.        , 0.10461316, ..., 0.03409972, 0.06745406,
        0.16174916],
       [0.        , 0.10461316, 1.        , ..., 0.        , 0.07585826,
        0.        ],
       ...,
       [0.06201737, 0.03409972, 0.        , ..., 1.        , 0.03296902,
        0.07905694],
       [0.        , 0.06745406, 0.07585826, ..., 0.03296902, 1.        ,
        0.0521286 ],
       [0.19611614, 0.16174916, 0.        , ..., 0.07905694, 0.0521286 ,
        1.        ]])

In [29]:
def recommendation():
  comic_name = 'Level Up with the Gods'  #input('Enter the name of the comic: ')
  comic_id = new_df[new_df['title'] == comic_name].index[0]
  distances = similarity[comic_id]
  comics_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:11]
  for i in comics_list:
    print(new_df.iloc[i[0]].title, new_df.iloc[i[0]].cover)

In [31]:
recommendation()

I Log In Alone - Part 3 https://cdn.anime-planet.com/manga/primary/i-log-in-alone-part-3-1-285x399.webp?t=1641652200
Tale of a Scribe Who Retires to the Countryside - Part 2 https://cdn.anime-planet.com/manga/primary/tale-of-a-scribe-who-retires-to-the-countryside-part-2-1-285x400.jpg?t=1632101617
LV999 no Murabito (Light Novel) https://cdn.anime-planet.com/manga/primary/lv999-no-murabito-light-novel-1-190x270.jpg?t=1625916095
Infectee https://cdn.anime-planet.com/manga/primary/infectee-1-190x297.jpg?t=1625914726
Leveling with the Gods (Novel) https://cdn.anime-planet.com/manga/primary/leveling-with-the-gods-novel-1-285x386.jpg?t=1628029003
The Road of Karma https://cdn.anime-planet.com/manga/primary/the-road-of-karma-1-190x269.jpg?t=1625920635
The Frozen Player Returns https://cdn.anime-planet.com/manga/primary/the-frozen-player-returns-1-285x399.webp?t=1647320859
I Log In Alone - Part 2 https://cdn.anime-planet.com/manga/primary/i-log-in-alone-part-2-1-190x266.jpg?t=1625929751
Tower 