## Data cleaning
---
**Data cleaning** is the process of fixing or removing incomplete, incorrect, or irrelevant data.

Roughly 75% of the work done with pandas is data cleaning.

In [64]:
import pandas as pd
# Load the pokemon CSV into a DataFrame. The path is relative, so it is
# resolved against the current working directory of the kernel.
df = pd.read_csv("../data/pokemon.csv")
# print(df)

In [65]:
# 1. Drop irrelevant columns
# DataFrame.drop() returns a new DataFrame instead of modifying df in place,
# so the result is assigned back to df.
# errors="ignore" keeps this cell safe to re-run: once "No" has been removed,
# a second drop(columns=["No"]) would otherwise raise KeyError.
df = df.drop(columns=["No"], errors="ignore")
# Several columns can be dropped at once by listing them all, e.g.:
# df = df.drop(columns=["Legendary", "No"])
print(df)

           Name    Type1   Type2  Height  Weight  Legendary
0     Bulbasaur    Grass  Poison     0.7     6.9          0
1       Ivysaur    Grass  Poison     1.0    13.0          0
2      Venusaur    Grass  Poison     2.0   100.0          0
3    Charmander     Fire     NaN     0.6     8.5          0
4    Charmeleon     Fire     NaN     1.1    19.0          0
..          ...      ...     ...     ...     ...        ...
145     Moltres     Fire  Flying     2.0    60.0          1
146     Dratini   Dragon     NaN     1.8     3.3          0
147   Dragonair   Dragon     NaN     4.0    16.5          0
148   Dragonite   Dragon  Flying     2.2   210.0          0
149      Mewtwo  Psychic     NaN     2.0   122.0          1

[150 rows x 6 columns]


In [66]:
# 2. Handle missing data
# One option: drop every pokemon that has no second type.
# dropna = "drop not available"; subset=["Type2"] limits the check to that column.
# The drop is left commented out here, so no rows are removed and the
# printout below still contains the NaN entries in Type2.
# df = df.dropna(subset=["Type2"])
# to_string() renders the frame as one string, so the rows are printed
# without the ".." truncation seen with a plain print(df).
print(df.to_string())

           Name     Type1     Type2  Height  Weight  Legendary
0     Bulbasaur     Grass    Poison     0.7     6.9          0
1       Ivysaur     Grass    Poison     1.0    13.0          0
2      Venusaur     Grass    Poison     2.0   100.0          0
3    Charmander      Fire       NaN     0.6     8.5          0
4    Charmeleon      Fire       NaN     1.1    19.0          0
5     Charizard      Fire    Flying     1.7    90.5          0
6      Squirtle     Water       NaN     0.5     9.0          0
7     Wartortle     Water       NaN     1.0    22.5          0
8     Blastoise     Water       NaN     1.6    85.5          0
9      Caterpie       Bug       NaN     0.3     2.9          0
10      Metapod       Bug       NaN     0.7     9.9          0
11   Butterfree       Bug    Flying     1.1    32.0          0
12       Weedle       Bug    Poison     0.3     3.2          0
13       Kakuna       Bug    Poison     0.6    10.0          0
14     Beedrill       Bug    Poison     1.0    29.5    

In [67]:
# Replace the missing (NaN) entries in Type2 with the literal string "None".
# Only the Type2 column is filled; every other column is carried over as-is.
df = df.assign(Type2=df["Type2"].fillna("None"))
print(df)

           Name    Type1   Type2  Height  Weight  Legendary
0     Bulbasaur    Grass  Poison     0.7     6.9          0
1       Ivysaur    Grass  Poison     1.0    13.0          0
2      Venusaur    Grass  Poison     2.0   100.0          0
3    Charmander     Fire    None     0.6     8.5          0
4    Charmeleon     Fire    None     1.1    19.0          0
..          ...      ...     ...     ...     ...        ...
145     Moltres     Fire  Flying     2.0    60.0          1
146     Dratini   Dragon    None     1.8     3.3          0
147   Dragonair   Dragon    None     4.0    16.5          0
148   Dragonite   Dragon  Flying     2.2   210.0          0
149      Mewtwo  Psychic    None     2.0   122.0          1

[150 rows x 6 columns]


In [68]:
# 3. Fix inconsistent values
# Series.replace() with a dict swaps each key for its value; the disabled
# example below would rewrite "Grass" -> "GRASS" and "Fire" -> "FIRE" in Type1.
# It is left commented out, so the frame is printed unchanged.
# df["Type1"] = df["Type1"].replace({"Grass" : "GRASS",
#                                    "Fire" : "FIRE"})
print(df)

           Name    Type1   Type2  Height  Weight  Legendary
0     Bulbasaur    Grass  Poison     0.7     6.9          0
1       Ivysaur    Grass  Poison     1.0    13.0          0
2      Venusaur    Grass  Poison     2.0   100.0          0
3    Charmander     Fire    None     0.6     8.5          0
4    Charmeleon     Fire    None     1.1    19.0          0
..          ...      ...     ...     ...     ...        ...
145     Moltres     Fire  Flying     2.0    60.0          1
146     Dratini   Dragon    None     1.8     3.3          0
147   Dragonair   Dragon    None     4.0    16.5          0
148   Dragonite   Dragon  Flying     2.2   210.0          0
149      Mewtwo  Psychic    None     2.0   122.0          1

[150 rows x 6 columns]


In [69]:
# 4. Standardize text
# .str.lower() would lowercase every value in the Name column. The example is
# left commented out, so the names are printed with their original casing.
# df["Name"] = df["Name"].str.lower()
print(df)

           Name    Type1   Type2  Height  Weight  Legendary
0     Bulbasaur    Grass  Poison     0.7     6.9          0
1       Ivysaur    Grass  Poison     1.0    13.0          0
2      Venusaur    Grass  Poison     2.0   100.0          0
3    Charmander     Fire    None     0.6     8.5          0
4    Charmeleon     Fire    None     1.1    19.0          0
..          ...      ...     ...     ...     ...        ...
145     Moltres     Fire  Flying     2.0    60.0          1
146     Dratini   Dragon    None     1.8     3.3          0
147   Dragonair   Dragon    None     4.0    16.5          0
148   Dragonite   Dragon  Flying     2.2   210.0          0
149      Mewtwo  Psychic    None     2.0   122.0          1

[150 rows x 6 columns]


In [70]:
# 5. Fix data types
# Legendary is stored as 0/1 flags; cast just that column to bool so it
# prints as False/True. The dict form of astype() leaves other columns alone.
df = df.astype({"Legendary": bool})
print(df)

           Name    Type1   Type2  Height  Weight  Legendary
0     Bulbasaur    Grass  Poison     0.7     6.9      False
1       Ivysaur    Grass  Poison     1.0    13.0      False
2      Venusaur    Grass  Poison     2.0   100.0      False
3    Charmander     Fire    None     0.6     8.5      False
4    Charmeleon     Fire    None     1.1    19.0      False
..          ...      ...     ...     ...     ...        ...
145     Moltres     Fire  Flying     2.0    60.0       True
146     Dratini   Dragon    None     1.8     3.3      False
147   Dragonair   Dragon    None     4.0    16.5      False
148   Dragonite   Dragon  Flying     2.2   210.0      False
149      Mewtwo  Psychic    None     2.0   122.0       True

[150 rows x 6 columns]


In [None]:
# 6. Remove duplicate values
# This dataset has no duplicate rows, so drop_duplicates() would not remove anything.
# df = df.drop_duplicates()
# print(df)