In [1]:
%run manual_integration.py
import matplotlib as mpl
DF_out = pd.read_csv('./../final_data/integrated_entity.csv')
pd.set_option('display.max_rows', 10)

*Load the integrated entity (before cleaning):*

In [2]:
DF_out

Unnamed: 0.1,Unnamed: 0,name,type,kind,development_stage,vitality,attack,harmful,universe
0,0,Limit Reverse,Trap Card,Continuous,,,,False,yugioh
1,1,The 13th Grave,Normal Monster,Zombie,0.250000,0.240000,0.240000,True,yugioh
2,2,Gem-Enhancement,Trap Card,Normal,,,,False,yugioh
3,3,Magician's Circle,Trap Card,Normal,,,,False,yugioh
4,4,Castle of Dark Illusions,Flip Effect Monster,Fiend,0.333333,0.386000,0.184000,True,yugioh
...,...,...,...,...,...,...,...,...,...
7862,7862,Young Green Dragon,Lawful Evil,dragon,0.250000,0.460592,0.693333,True,dd5
7863,7863,Young Red Dragon,Chaotic Evil,dragon,0.250000,0.491657,0.693333,True,dd5
7864,7864,Young Silver Dragon,Lawful Good,dragon,0.250000,0.484260,0.693333,True,dd5
7865,7865,Young White Dragon,Chaotic Evil,dragon,0.250000,0.438373,0.673333,True,dd5


In [3]:
DF_out.describe(include='all')

Unnamed: 0.1,Unnamed: 0,name,type,kind,development_stage,vitality,attack,harmful,universe
count,7867.0,7867,7867,7867,5608.000000,5593.000000,5571.000000,7867,7867
unique,,7769,114,83,,,,2,3
top,,Dark Magician,Effect Monster,Normal,,,,True,yugioh
freq,,9,2494,1126,,,,4647,6534
mean,3933.0,,,,0.348070,0.281558,0.269544,,
...,...,...,...,...,...,...,...,...,...
min,0.0,,,,0.001232,0.000000,0.000000,,
25%,1966.5,,,,0.166667,0.100000,0.093028,,
50%,3933.0,,,,0.333333,0.300000,0.245000,,
75%,5899.5,,,,0.500000,0.400000,0.400000,,


*set all string values in "kind" column to lowercase:*

In [4]:
dfc = DF_out
dfc['kind'] = dfc['kind'].str.lower()

*check for duplicates: Entities with the same name, type, kind and development stage from the same universe are very likely to be unwanted duplicates*

In [5]:
dfc.duplicated(subset=['name', 'type', 'kind', 'development_stage', 'universe']).sum()

88

In [6]:
dfc.loc[dfc.duplicated(keep=False, subset=['name', 'type', 'kind', 'development_stage', 'universe']), :]

Unnamed: 0.1,Unnamed: 0,name,type,kind,development_stage,vitality,attack,harmful,universe
126,126,Cyber End Dragon,Fusion Monster,machine,0.833333,0.800000,0.800000,True,yugioh
127,127,Cyber End Dragon,Fusion Monster,machine,0.833333,0.800000,0.800000,True,yugioh
517,517,Arcana Knight Joker,Fusion Monster,warrior,0.750000,0.760000,0.760000,True,yugioh
518,518,Arcana Knight Joker,Fusion Monster,warrior,0.750000,0.760000,0.760000,True,yugioh
534,534,Gaia The Fierce Knight,Normal Monster,warrior,0.583333,0.460000,0.460000,True,yugioh
...,...,...,...,...,...,...,...,...,...
6866,6866,Emperor Titus Mede II,CombatAssassin,imperial,0.001232,0.000483,0.069424,False,skyrim
7149,7149,Barknar,,nord,0.001232,0.024143,0.069424,False,skyrim
7390,7390,Barknar,,nord,0.001232,0.024143,0.069424,False,skyrim
7412,7412,Emperor Titus Mede II,CombatAssassin,imperial,0.001232,0.000483,0.069424,False,skyrim


*Sort by name for better readability, then drop the duplicates and check the dataframe for duplicates inside universes:*

In [7]:
dfc.sort_values(by=['name'], ascending=True, inplace=True)

In [8]:
dfc.drop_duplicates(subset=['name', 'type', 'kind', 'development_stage', 'universe'], keep='first', inplace=True)

In [9]:
dfc.duplicated(subset=['name', 'type', 'kind', 'development_stage', 'universe']).sum()

0

In [10]:
dfc.loc[dfc.duplicated(keep=False, subset=['name', 'type', 'kind', 'development_stage']), :]

Unnamed: 0.1,Unnamed: 0,name,type,kind,development_stage,vitality,attack,harmful,universe


*Check for similar entities across different universes:*

In [11]:
dfc.duplicated(subset=['name']).sum()
dfc.loc[dfc.duplicated(keep=False, subset=['name']), :]

Unnamed: 0.1,Unnamed: 0,name,type,kind,development_stage,vitality,attack,harmful,universe
5785,5785,Bat,Normal Monster,machine,0.083333,0.070000,0.060000,True,yugioh
7583,7583,Bat,Unaligned,beast,0.031250,0.240740,0.281667,True,dd5
5331,5331,Doppelganger,Trap Card,continuous,,,,False,yugioh
7626,7626,Doppelganger,Unaligned,monstrosity (shapechanger),0.062500,0.318462,0.530000,True,dd5
7396,7396,Eydis,EncClassBanditMelee,nord,0.030788,0.000000,0.045455,True,skyrim
...,...,...,...,...,...,...,...,...,...
7193,7193,Nikulas,,nord,0.001232,0.024143,0.069424,False,skyrim
7851,7851,Wolf,Unaligned,beast,0.062500,0.268136,0.593333,True,dd5
4035,4035,Wolf,Normal Monster,beast,0.250000,0.240000,0.240000,True,yugioh
7038,7038,Ysgramor,,animals,0.002463,0.010140,0.011505,True,skyrim


*clean up attack values:*

In [12]:
dfc['attack'].fillna(value=0)

3620    0.00
963     0.25
1958    0.36
1166    0.06
1994    0.10
        ... 
2573    0.24
4647    0.36
2565    0.40
609     0.36
5427    0.00
Name: attack, Length: 7779, dtype: float64

*save clean integrated entities into .csv:*

In [13]:
dfc.to_csv('./../final_data/clean_entity.csv')