In [36]:
import pandas as pd
import scipy as sc
import numpy as np
import math
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import linear_model, metrics, preprocessing, model_selection
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, f1_score
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from collections import Counter

In [37]:
# Read both CSV exports, stack them into one frame, and discard the
# 'Unnamed: 0' column that comes in with the files.
df = pd.read_csv("raw_data.csv")
tempdf = pd.read_csv("data_firsts.csv")
df = (
    pd.concat([df, tempdf])
    .copy()
    .drop(columns='Unnamed: 0')  #7510 rows × 42 columns
)

In [38]:
#Drop duplicated links (the last occurrence of each Link is kept)
df = df.drop_duplicates(subset='Link', keep='last')

#Drop un-needed rows: rows with fewer than 10 non-null fields,
#then rows missing any of the required columns
df = df.dropna(thresh=10)
df = df.dropna(subset=['Name Romaji', 'Start Year', 'Status'])

#Fixing index col after rows delete
df = df.reset_index(drop=True)

#NOTE: every fill below assigns its result back to df. The chained form
#df[col].fillna(..., inplace=True) operates on a temporary Series; pandas
#warns about it and it stops updating df once Copy-on-Write is enabled.

#Filling NaN values before merging
rating_columns = [f'Rating {score}' for score in range(10, 101, 10)]
df[rating_columns] = df[rating_columns].fillna(0)

#Filling missing numeric values
zero_filled_columns = ['Favorites', 'Rating Rank', 'Popularity Rank']
df[zero_filled_columns] = df[zero_filled_columns].fillna(0)
for median_filled_column in ['Volume Count', 'Popularity']:
    df[median_filled_column] = df[median_filled_column].fillna(
        df[median_filled_column].median())
#9999 is the placeholder written where 'Anime Release Date' is missing
df['Anime Release Date'] = df['Anime Release Date'].fillna(9999)

In [39]:
#Filling missing non-numeric values
df['Source'] = df['Source'].fillna(df['Source'].mode()[0])

#Missing alternative names fall back to the romaji name.
#fillna is used rather than `value is np.nan`: that identity test only matches
#the np.nan singleton and skips other missing markers such as None.
for name_column in ['Name English', 'Name Native', 'Synonyms']:
    df[name_column] = df[name_column].fillna(df['Name Romaji'])

tag_columns = ['Tag_One', 'Tag_Two', 'Tag_Three']
genre_columns = ['Genre_One', 'Genre_Two', 'Genre_Three']
df[tag_columns] = df[tag_columns].fillna('No Tag')
df[genre_columns] = df[genre_columns].fillna('No Genre')

#Adding average: vote-weighted mean over the 'Rating 10' .. 'Rating 100' buckets.
#Computed once for the whole frame instead of once per row.
rating_weights = {f'Rating {score}': score for score in range(10, 101, 10)}
weighted_votes = sum(df[column] * weight for column, weight in rating_weights.items())
total_votes = sum(df[column] for column in rating_weights)
#Rows whose buckets sum to zero come out as NaN here and are filled further down
df['Average Rating'] = weighted_votes / total_votes

#New Column: does the title have an adaptation?
#1 when either 'Related N Type' value is one of the strings below, otherwise 0.
adaptation_types = [
    'TV · Releasing', 'TV · Finished', 'TV Short · Finished',
    'Movie · Finished', 'OVA · Finished', 'ONA · Finished',
    'ONA · Releasing', 'Special · Finished',
]
is_adapted = df[['Related 1 Type', 'Related 2 Type']].isin(adaptation_types).any(axis=1)
#Stored as float 0.0/1.0, which is what assigning 0/1 row by row into a new
#column yields - TODO confirm whether an integer flag is preferred downstream
df['Adaptation'] = is_adapted.astype(float)

#Filling missing rating score: fall back to 'Mean Score', then to the 999 placeholder
df['Average Rating'] = df['Average Rating'].fillna(df['Mean Score']).fillna(999)

In [40]:
#Dropping unnecessary columns in one call.
#'Related 1' and 'Related 2' are not dropped here; only their '... Type' companions are.
columns_to_drop = [
    'Rating 10', 'Rating 20', 'Rating 30', 'Rating 40', 'Rating 50',
    'Rating 60', 'Rating 70', 'Rating 80', 'Rating 90', 'Rating 100',
    'Chapter Count',
    'End Date',
    'Average Score',
    'Mean Score',
    'Related 1 Type',
    'Related 2 Type',
]
df = df.drop(columns=columns_to_drop)

In [41]:
#Change type to save space
category_columns = [
    "Name Romaji", "Name Native", "Name English", "Synonyms",
    "Format", "Status", "Source",
    "Genre_One", "Genre_Two", "Genre_Three",
    "Tag_One", "Tag_Two", "Tag_Three",
]
df = df.astype({column: 'category' for column in category_columns})

#Integer code for each genre label; 999 marks the 'No Genre' placeholder
replace_map_Genre = {'Drama':1, 'Adventure':2, 'Action':3, 'Comedy':4, 'Mystery':5, 'Romance':6, 'Fantasy':7,
 'Horror':8, 'Sci-Fi':9, 'Psychological':10, 'Ecchi':11, 'Supernatural':12, 'Music':13,
 'Thriller':14, 'Slice of Life':15, 'Sports':16, 'Mahou Shoujo':17, 'Mecha':18, 'No Genre':999}

#Apply the genre codes to the genre columns only. A frame-wide df.replace would
#also rewrite any other cell (a title, a tag, ...) that happens to equal a genre
#word. rename_categories leaves labels that are absent from the map unchanged.
for genre_column in ['Genre_One', 'Genre_Two', 'Genre_Three']:
    df[genre_column] = df[genre_column].cat.rename_categories(replace_map_Genre)

#Status and Source are one-hot encoded; the dummy columns are appended
#in this order (Status_* first, then Source_*)
df = pd.get_dummies(df, columns=['Status', 'Source'], prefix=['Status', 'Source'])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4078 entries, 0 to 4077
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   Name Romaji                4078 non-null   category
 1   Name Native                4078 non-null   category
 2   Name English               4078 non-null   category
 3   Synonyms                   4078 non-null   category
 4   Format                     4078 non-null   category
 5   Volume Count               4078 non-null   float64 
 6   Start Date                 4078 non-null   object  
 7   Start Year                 4078 non-null   float64 
 8   Popularity                 4078 non-null   float64 
 9   Favorites                  4078 non-null   float64 
 10  Genre_One                  4078 non-null   category
 11  Genre_Two                  4078 non-null   category
 12  Genre_Three                4078 non-null   category
 13  Tag_One                    4078 n

count             4078
unique            2151
top       Apr 10, 2020
freq                12
Name: Start Date, dtype: object