# Importing Libraries

In [3]:
import pandas as pd
import numpy as np

## Importing the dataset

In [4]:
listwith_synopsis = pd.read_csv('anime_with_synopsis.csv')
anime_df = pd.read_csv('anime.csv')

In [5]:
listwith_synopsis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16214 entries, 0 to 16213
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   MAL_ID     16214 non-null  int64 
 1   Name       16214 non-null  object
 2   Score      16214 non-null  object
 3   Genres     16214 non-null  object
 4   sypnopsis  16206 non-null  object
dtypes: int64(1), object(4)
memory usage: 633.5+ KB


In [6]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MAL_ID         17562 non-null  int64 
 1   Name           17562 non-null  object
 2   Score          17562 non-null  object
 3   Genres         17562 non-null  object
 4   English name   17562 non-null  object
 5   Japanese name  17562 non-null  object
 6   Type           17562 non-null  object
 7   Episodes       17562 non-null  object
 8   Aired          17562 non-null  object
 9   Premiered      17562 non-null  object
 10  Producers      17562 non-null  object
 11  Licensors      17562 non-null  object
 12  Studios        17562 non-null  object
 13  Source         17562 non-null  object
 14  Duration       17562 non-null  object
 15  Rating         17562 non-null  object
 16  Ranked         17562 non-null  object
 17  Popularity     17562 non-null  int64 
 18  Members        17562 non-n

## Filtering which columns to keep

In [7]:
# MAL_ID 
# English name
# Genres
# Type
# Producers
# Studios
# sypnopsis

anime_df = anime_df[['MAL_ID', 'Name', 'Genres', 'Type', 'Producers', 'Studios']]

In [8]:
templist = listwith_synopsis['sypnopsis']
anime_df = anime_df.join(templist)

In [9]:
# We have a dataframe with all the content we need.
anime_df.head()

Unnamed: 0,MAL_ID,Name,Genres,Type,Producers,Studios,sypnopsis
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,Bandai Visual,Sunrise,"In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",Movie,"Sunrise, Bandai Visual",Bones,"other day, another bounty—such is the life of ..."
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,Victor Entertainment,Madhouse,"Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",TV,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...",Sunrise,ches are individuals with special powers like ...
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",TV,"TV Tokyo, Dentsu",Toei Animation,It is the dark century and the people are suff...


In [10]:
# The data is still in different format which can hamper the performance of the program

## Data Preprocessing

In [11]:
# Missing data in this data set is filled with the string 'Unknown'
anime_df.replace('Unknown', np.nan, inplace = True)

In [12]:
# Checking for missing data
anime_df.isnull().sum()

MAL_ID          0
Name            0
Genres         63
Type           37
Producers    7794
Studios      7079
sypnopsis    1356
dtype: int64

In [13]:
# Dropping the rows without synopsis
anime_df.dropna(inplace = True)

In [14]:
anime_df.reset_index(inplace = True)

In [15]:
# We want all of our features to be the type list so its easy to manipulate the data when we combine it in the future 

In [16]:
anime_df

Unnamed: 0,index,MAL_ID,Name,Genres,Type,Producers,Studios,sypnopsis
0,0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,Bandai Visual,Sunrise,"In the year 2071, humanity has colonized sever..."
1,1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",Movie,"Sunrise, Bandai Visual",Bones,"other day, another bounty—such is the life of ..."
2,2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,Victor Entertainment,Madhouse,"Vash the Stampede is the man with a $$60,000,0..."
3,3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",TV,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...",Sunrise,ches are individuals with special powers like ...
4,4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",TV,"TV Tokyo, Dentsu",Toei Animation,It is the dark century and the people are suff...
...,...,...,...,...,...,...,...,...
5950,16200,40506,Shadowverse (TV),"Game, Fantasy",TV,"TV Tokyo, Sonilude",Zexcs,usic video for Hoshimi Production's fourth dig...
5951,16204,40513,Nami yo Kiitekure,"Comedy, Drama, Romance, Seinen",TV,"Mainichi Broadcasting System, Kodansha, DMM pi...",Sunrise,cap of the first 9 episodes of SK∞ .
5952,16206,40517,Ano Hi no Kokoro wo Toraete,"Sci-Fi, Drama",ONA,"A-1 Pictures, Omnibus Japan, Tohokushinsha Fil...",Sunrise,The stage is Shibuya. When Ryuuhei Oda was in ...
5953,16211,40526,"Dragon, Ie wo Kau.","Comedy, Fantasy, Shounen",TV,Pony Canyon,Signal.MD,Sequel to Higurashi no Naku Koro ni Gou .


In [17]:
ind = range(0,5955)
for count in ind:
    obj = anime_df['Genres'][count].split(',')
    List = []
    for i in obj:
        List.append(i)
    anime_df['Genres'][count] = List

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime_df['Genres'][count] = List


In [18]:
anime_df['Studios'] = anime_df['Studios'].apply(lambda x:x.split())

In [19]:
anime_df['sypnopsis'] = anime_df['sypnopsis'].apply(lambda x:x.split())

In [20]:
ind = range(0,5955)
for count in ind:
    obj = anime_df['Producers'][count].split(',')
    List = []
    for i in obj:
        List.append(i)
    anime_df['Producers'][count] = List

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime_df['Producers'][count] = List


In [21]:
anime_df['Type'] = anime_df['Type'].apply(lambda x:x.split())

In [22]:
anime_df['Genres'] = anime_df['Genres'].apply(lambda x:[i.replace(" ", "") for i in x])
anime_df['Producers'] = anime_df['Producers'].apply(lambda x:[i.replace(" ", "") for i in x])
anime_df['Studios'] = anime_df['Studios'].apply(lambda x:[i.replace(" ", "") for i in x])
anime_df['sypnopsis'] = anime_df['sypnopsis'].apply(lambda x:[i.replace(" ", "") for i in x])

## Creating the final dataframe

In [23]:
anime_df['Tags'] = anime_df['Genres'] + anime_df['Type'] + anime_df['Producers'] + anime_df['Studios'] + anime_df['sypnopsis']

In [24]:
anidata = anime_df[['MAL_ID', 'Name', 'Tags']]
anidata

Unnamed: 0,MAL_ID,Name,Tags
0,1,Cowboy Bebop,"[Action, Adventure, Comedy, Drama, Sci-Fi, Spa..."
1,5,Cowboy Bebop: Tengoku no Tobira,"[Action, Drama, Mystery, Sci-Fi, Space, Movie,..."
2,6,Trigun,"[Action, Sci-Fi, Adventure, Comedy, Drama, Sho..."
3,7,Witch Hunter Robin,"[Action, Mystery, Police, Supernatural, Drama,..."
4,8,Bouken Ou Beet,"[Adventure, Fantasy, Shounen, Supernatural, TV..."
...,...,...,...
5950,40506,Shadowverse (TV),"[Game, Fantasy, TV, TVTokyo, Sonilude, Zexcs, ..."
5951,40513,Nami yo Kiitekure,"[Comedy, Drama, Romance, Seinen, TV, MainichiB..."
5952,40517,Ano Hi no Kokoro wo Toraete,"[Sci-Fi, Drama, ONA, A-1Pictures, OmnibusJapan..."
5953,40526,"Dragon, Ie wo Kau.","[Comedy, Fantasy, Shounen, TV, PonyCanyon, Sig..."


In [25]:
anidata['Tags'] = anidata['Tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anidata['Tags'] = anidata['Tags'].apply(lambda x:" ".join(x))


In [26]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [27]:
def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
        
    return " ".join(y)

In [28]:
anidata['Tags'] = anidata['Tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anidata['Tags'] = anidata['Tags'].apply(stem)


In [29]:
anidata['Tags'][0]

'action adventur comedi drama sci-fi space tv bandaivisu sunris in the year 2071, human ha colon sever of the planet and moon of the solar system leav the now uninhabit surfac of planet earth behind. the inter solar system polic attempt to keep peac in the galaxy, aid in part by outlaw bounti hunters, refer to as "cowboys." the ragtag team aboard the spaceship bebop are two such individuals. mellow and carefre spike spiegel is balanc by hi boisterous, pragmat partner jet black as the pair make a live chase bounti and collect rewards. thrown off cours by the addit of new member that they meet in their travels—ein, a genet engineered, highli intellig welsh corgi; femm fatal fay valentine, an enigmat trickster with memori loss; and the strang comput whiz kid edward wong—th crew embark on thrill adventur that unravel each member\' dark and mysteri past littl by little. well-balanc with high densiti action and light-heart comedy, cowboy bebop is a space western classic and an homag to the s

In [30]:
anidata['Tags'] = anidata['Tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anidata['Tags'] = anidata['Tags'].apply(lambda x:x.lower())


## Converting this string into vectors

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [32]:
anivectors = cv.fit_transform(anidata['Tags']).toarray()

In [33]:
anivectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
cv.get_feature_names()

['000',
 '10',
 '100',
 '10th',
 '11',
 '12',
 '12th',
 '13',
 '13th',
 '14',
 '14th',
 '15',
 '15th',
 '16',
 '16th',
 '17',
 '18',
 '19',
 '1945',
 '1968',
 '1984',
 '1999',
 '19th',
 '1st',
 '20',
 '200',
 '2002',
 '2004',
 '2005',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2020',
 '20th',
 '21',
 '21st',
 '22',
 '23',
 '24',
 '25',
 '25th',
 '26',
 '27',
 '28',
 '2nd',
 '30',
 '300',
 '3d',
 '3hz',
 '3rd',
 '40',
 '4th',
 '50',
 '500',
 '5pb',
 '5th',
 '60',
 '6th',
 '707',
 '7th',
 '80',
 '81produc',
 '8bit',
 '8th',
 '90',
 '9th',
 'abandon',
 'abcanim',
 'abduct',
 'abe',
 'abh',
 'abil',
 'abilities',
 'ability',
 'abl',
 'abnorm',
 'aboard',
 'abov',
 'absolut',
 'absorb',
 'absurd',
 'abus',
 'academ',
 'academi',
 'academy',
 'accept',
 'access',
 'accid',
 'accident',
 'acclaim',
 'accompani',
 'accomplish',
 'accord',
 'account',
 'accustom',
 'ace',
 'achiev',
 'acquaint',
 'acquir',
 'act',
 'acta',
 'actas',
 'ac

## Applying cosine similarity to find Distances between movie vectors

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
anisimilarity = cosine_similarity(anivectors)

In [37]:
sorted(list(enumerate(anisimilarity[100])),reverse=True, key = lambda x:x[1])[1:6]

[(1374, 0.5309323916898032),
 (984, 0.48186234149032653),
 (4875, 0.4150286783196449),
 (38, 0.39640834702882205),
 (190, 0.39135119960978293)]

In [38]:
def recommend(anime):
    anime_index = anidata[anidata['Name'] == anime].index[0]
    distances = anisimilarity[anime_index]
    recomend_list = sorted(list(enumerate(distances)),reverse=True, key = lambda x:x[1])[1:6]
    
    for i in recomend_list:
        print(anidata.iloc[i[0]].Name)
    return

In [39]:
recommend('Grappler Baki (TV)')

Love Live! School Idol Project OVA
Kenkou Zenrakei Suieibu Umishou
Skull Man
Cleavage
Reideen


In [40]:
anidata.iloc[970].Name

'Ginga Tetsudou Monogatari'

In [41]:
import pickle

In [42]:
pickle.dump(anidata,open('animes.pkl', 'wb'))

In [44]:
anidata['Name'].values

array(['Cowboy Bebop', 'Cowboy Bebop: Tengoku no Tobira', 'Trigun', ...,
       'Ano Hi no Kokoro wo Toraete', 'Dragon, Ie wo Kau.',
       'No Guns Life 2nd Season'], dtype=object)

In [45]:
pickle.dump(anisimilarity, open('anisimilarity.pkl', 'wb'))