# Clean dataset of detailed mammal information

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw mammal table from the sibling csv_files directory.
# Forward slashes are used because "\c" and "\m" in the old backslash path are
# invalid escape sequences (newer Python versions warn about them), and forward
# slashes work on Windows as well as on POSIX systems.
Animal_df = pd.read_csv("../csv_files/msw3_all.csv", engine="python")

In [3]:
# Preview the first five rows of the raw table.
Animal_df.head()

Unnamed: 0,ID,Order,Suborder,Infraorder,Superfamily,Family,Subfamily,Tribe,Genus,Subgenus,...,TypeSpecies,CommonName,TypeLocality,Distribution,Status,Synonyms,Comments,File,SortOrder,DisplayOrder
0,10300001,MONOTREMATA,,,,,,,,,...,,,,,,,Reviewed by Griffiths (1978). The order is the...,3,03-00001,03-0001
1,10300002,MONOTREMATA,,,,Tachyglossidae,,,,,...,,,,,,,,3,03-00002,03-0001-0000-0000-0000-0002
2,10300003,MONOTREMATA,,,,Tachyglossidae,,,Tachyglossus,,...,"<i>Echidna novaehollandiae</i> Lacépède, 1799 ...",,,,,"<i>Acanthonotus</i> Goldfuss, 1809; <i>Echidn...",,3,03-00003,03-0001-0000-0000-0000-0002-0000-0000-0003
3,10300004,MONOTREMATA,,,,Tachyglossidae,,,Tachyglossus,,...,,Short-beaked Echidna,"Australia, New South Wales, New Holland (= Syd...","S and E New Guinea; Australia, including Kanga...",IUCN ­– Lower Risk (nt) as <i>T. a.</i> <i>mul...,"<i>australiensis</i> (Lesson, 1827);<i> austra...","Includes <i>lawesii</i> and <i>setosus, </i>se...",3,03-00004,03-0001-0000-0000-0000-0002-0000-0000-0003-000...
4,10300005,MONOTREMATA,,,,Tachyglossidae,,,Tachyglossus,,...,,,"Australia, New South Wales, New Holland (= Syd...",,,,,3,03-00005,03-0001-0000-0000-0000-0002-0000-0000-0003-000...


In [4]:
# (rows, columns) of the raw table.
Animal_df.shape

(13582, 34)

In [5]:
# Peek at how the taxonomic level relates to the family and common name columns.
Animal_df.loc[:, ["TaxonLevel", "Family", "CommonName"]].head()

Unnamed: 0,TaxonLevel,Family,CommonName
0,ORDER,,
1,FAMILY,Tachyglossidae,
2,GENUS,Tachyglossidae,
3,SPECIES,Tachyglossidae,Short-beaked Echidna
4,SUBSPECIES,Tachyglossidae,


In [6]:
# Summary statistics for the numeric columns of the raw table.
Animal_df.describe()

Unnamed: 0,ID,ActualDate,File
count,13582.0,263.0,13582.0
mean,13180160.0,1862.079848,31.794655
std,910760.7,32.809497,9.105451
min,10300000.0,1780.0,3.0
25%,12700120.0,1838.0,27.0
50%,13500100.0,1858.0,35.0
75%,13802290.0,1878.0,38.0
max,14300180.0,2002.0,43.0


In [7]:
# List every column name available in the raw table.
Animal_df.columns

Index(['ID', 'Order', 'Suborder', 'Infraorder', 'Superfamily', 'Family',
       'Subfamily', 'Tribe', 'Genus', 'Subgenus', 'Species', 'Subspecies',
       'TaxonLevel', 'Extinct?', 'OriginalName', 'ValidName', 'Author', 'Date',
       'ActualDate', 'CitationName', 'CitationVolume', 'CitationIssue',
       'CitationPages', 'CitationType', 'TypeSpecies', 'CommonName',
       'TypeLocality', 'Distribution', 'Status', 'Synonyms', 'Comments',
       'File', 'SortOrder', 'DisplayOrder'],
      dtype='object')

In [8]:
# Count the missing values in each column of the raw table.
Animal_df.isna().sum()

ID                    0
Order                 0
Suborder           5705
Infraorder        12154
Superfamily       10950
Family               62
Subfamily          4082
Tribe             11196
Genus               392
Subgenus          10964
Species            1814
Subspecies         7230
TaxonLevel            0
Extinct?              0
OriginalName      13510
ValidName             1
Author                0
Date                  2
ActualDate        13319
CitationName       5198
CitationVolume     6152
CitationIssue     12927
CitationPages      5229
CitationType      13577
TypeSpecies       12274
CommonName         8132
TypeLocality       6855
Distribution       8137
Status             8589
Synonyms           9467
Comments           6860
File                  0
SortOrder             0
DisplayOrder          0
dtype: int64

We will be working with specific columns for this analysis, so we can drop the unnecessary ones.

In [9]:
# Columns kept for the analysis; every other column of the raw table is left out.
Relevant_columns = [
    "Species",
    "TypeSpecies",
    "Family",
    "CommonName",
    "TypeLocality",
    "Status",
    "Extinct?",
]

In [10]:
# Work on an independent copy so the raw table stays untouched.
Df_copy = Animal_df.loc[:, Relevant_columns].copy()

In [11]:
# Preview the first five rows of the reduced table.
Df_copy.head()

Unnamed: 0,Species,TypeSpecies,Family,CommonName,TypeLocality,Status,Extinct?
0,,,,,,,False
1,,,Tachyglossidae,,,,False
2,,"<i>Echidna novaehollandiae</i> Lac�p�de, 1799 ...",Tachyglossidae,,,,False
3,aculeatus,,Tachyglossidae,Short-beaked Echidna,"Australia, New South Wales, New Holland (= Syd...",IUCN �� Lower Risk (nt) as <i>T. a.</i> <i>mul...,False
4,aculeatus,,Tachyglossidae,,"Australia, New South Wales, New Holland (= Syd...",,False


## Inspecting the new dataframe

In [12]:
# Count the missing values in each of the selected columns.
Df_copy.isna().sum()

Species          1814
TypeSpecies     12274
Family             62
CommonName       8132
TypeLocality     6855
Status           8589
Extinct?            0
dtype: int64

In [13]:
# Show only the columns that have at least one missing value.
Df_copy.isnull().sum().loc[lambda null_counts: null_counts.gt(0)]

Species          1814
TypeSpecies     12274
Family             62
CommonName       8132
TypeLocality     6855
Status           8589
dtype: int64

In [14]:
# Fraction of missing values per column.
Df_copy.isna().sum().div(len(Df_copy))

Species         0.133559
TypeSpecies     0.903696
Family          0.004565
CommonName      0.598734
TypeLocality    0.504712
Status          0.632381
Extinct?        0.000000
dtype: float64

From the cell above, we can see that about 90% of `TypeSpecies` is missing, and there is little that can be done to recover these values. So we will drop the column because of its large share of missing values.

In [15]:
# Drop the mostly-empty TypeSpecies column.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second run no longer raises KeyError once the column
# is already gone.
Df_copy = Df_copy.drop(columns="TypeSpecies", errors="ignore")

In [16]:
# Confirm that TypeSpecies is no longer among the columns.
Df_copy.head()

Unnamed: 0,Species,Family,CommonName,TypeLocality,Status,Extinct?
0,,,,,,False
1,,Tachyglossidae,,,,False
2,,Tachyglossidae,,,,False
3,aculeatus,Tachyglossidae,Short-beaked Echidna,"Australia, New South Wales, New Holland (= Syd...",IUCN �� Lower Risk (nt) as <i>T. a.</i> <i>mul...,False
4,aculeatus,Tachyglossidae,,"Australia, New South Wales, New Holland (= Syd...",,False


In [17]:
# Fraction of missing values per remaining column.
Df_copy.isna().sum() / Df_copy.shape[0]

Species         0.133559
Family          0.004565
CommonName      0.598734
TypeLocality    0.504712
Status          0.632381
Extinct?        0.000000
dtype: float64

Rename the columns to more suitable names to make selecting them easier.

In [18]:
# Map the original column names to the shorter names used from here on.
column_renames = {
    'CommonName': 'Common_name',
    'TypeLocality': 'Locality',
    'Extinct?': 'Extinct',
}
Df_copy = Df_copy.rename(columns=column_renames)

In [19]:
# Check the renamed columns.
Df_copy.head()

Unnamed: 0,Species,Family,Common_name,Locality,Status,Extinct
0,,,,,,False
1,,Tachyglossidae,,,,False
2,,Tachyglossidae,,,,False
3,aculeatus,Tachyglossidae,Short-beaked Echidna,"Australia, New South Wales, New Holland (= Syd...",IUCN �� Lower Risk (nt) as <i>T. a.</i> <i>mul...,False
4,aculeatus,Tachyglossidae,,"Australia, New South Wales, New Holland (= Syd...",,False


Investigate each column to work out how to deal with its missing values.

In [20]:
# Frequency of each species epithet (missing values are not counted).
Df_copy["Species"].value_counts()

bottae          142
capensis         64
talpoides        61
vulpes           46
brasiliensis     43
               ... 
anakuma           1
contractus        1
vosmaeri          1
mixtus            1
puhoatensis       1
Name: Species, Length: 3802, dtype: int64

In [21]:
# Number of rows with a non-missing Species value.
Df_copy["Species"].count()

11768

In [22]:
# Inspect the rows that have no Species value.
Df_copy.loc[Df_copy["Species"].isna(), ["Family", "Species", "Common_name"]]

Unnamed: 0,Family,Species,Common_name
0,,,
1,Tachyglossidae,,
2,Tachyglossidae,,
9,Tachyglossidae,,
17,Ornithorhynchidae,,
...,...,...,...
13558,Ziphiidae,,
13561,Ziphiidae,,
13563,Ziphiidae,,
13578,Ziphiidae,,


In [23]:
# (rows, columns) before dropping incomplete rows.
Df_copy.shape

(13582, 6)

It is difficult to recover the missing species of the animals from the other columns of the dataframe, so we will drop the rows where `Species` is missing. The next cell also drops the rows where `Status` is missing.

In [24]:
# Keep only the rows where both Species and Status are present
# (dropna removes a row when any of the listed columns is missing).
Df_copy = Df_copy.dropna(subset=["Species", "Status"])

In [25]:
# (rows, columns) after dropping rows without Species or Status.
Df_copy.shape

(4993, 6)

In [26]:
# Show the last ten rows with a fresh 0-based index; the result is not assigned,
# so Df_copy itself keeps its original index here.
Df_copy.reset_index(drop = True).tail(10)

Unnamed: 0,Species,Family,Common_name,Locality,Status,Extinct
4983,grayi,Ziphiidae,Gray�s Beaked Whale,"New Zealand, ""the Chatham Islands. . . from sp...",CITES � Appendix II; IUCN � Data Deficient.,False
4984,hectori,Ziphiidae,Hector�s Beaked Whale,"New Zealand, Wellington, ""killed in Tatai [sic...",CITES � Appendix II; IUCN � Data Deficient.,False
4985,layardii,Ziphiidae,Strap-toothed Whale,"None given, probably South Africa.",CITES � Appendix II; IUCN � Data Deficient.,False
4986,mirus,Ziphiidae,True�s Beaked Whale,"USA, ""stranded in the outer bank of Bird Islan...",CITES � Appendix II; IUCN � Data Deficient.,False
4987,perrini,Ziphiidae,Perrin�s Beaked Whale,"U.S.A., Carlsbad, California.",CITES � Appendix II.,False
4988,peruvianus,Ziphiidae,Pygmy Beaked Whale,"""Playa Paraiso (11�12' S), Huacho, Lima, Peru.""",CITES � Appendix II; IUCN � Data Deficient.,False
4989,stejnegeri,Ziphiidae,Stejneger�s Beaked Whale,"Russia, Commander Isls, ""Bering Island"".",CITES � Appendix II; IUCN � Data Deficient.,False
4990,traversii,Ziphiidae,Spade-toothed Whale,"Chatham Isl, New Zealand.",CITES � Appendix II.,False
4991,shepherdi,Ziphiidae,Shepherd�s Beaked Whale,"New Zealand, North Isl, ""cast upon the beach a...",CITES � Appendix II; IUCN � Data Deficient.,False
4992,cavirostris,Ziphiidae,Cuvier�s Beaked Whale,"France, ""dans le d�partement des Bouches-du-Rh...",CITES � Appendix II; IUCN � Data Deficient.,False


In [27]:
# Fraction of missing values per column after dropping incomplete rows.
Df_copy.isna().sum().div(len(Df_copy))

Species        0.0000
Family         0.0000
Common_name    0.0002
Locality       0.0000
Status         0.0000
Extinct        0.0000
dtype: float64

In [42]:
# Fraction of rows whose Common_name is missing.
Df_copy["Common_name"].isna().sum() / len(Df_copy)

0.0

In [28]:
# Preview the first rows with a fresh 0-based index (display only; not assigned).
Df_copy.reset_index(drop = True).head()

Unnamed: 0,Species,Family,Common_name,Locality,Status,Extinct
0,aculeatus,Tachyglossidae,Short-beaked Echidna,"Australia, New South Wales, New Holland (= Syd...",IUCN �� Lower Risk (nt) as <i>T. a.</i> <i>mul...,False
1,attenboroughi,Tachyglossidae,Sir David�s Long-beaked Echidna.,"Indonesia, Prov. of Papua (= Irian Jaya), Cycl...",CITES - Appendix II.,False
2,bartoni,Tachyglossidae,Eastern Long-beaked Echidna,"Papua New Guinea, Albert Edward Range, Mount V...",CITES - Appendix II; IUCN � Endangered as <i>Z...,False
3,bruijni,Tachyglossidae,Western Long-beaked Echidna.,"Indonesia, Prov. of Papua (= Irian Jaya), Voge...",CITES - Appendix II; IUCN � Endangered.,False
4,anatinus,Ornithorhynchidae,Platypus.,"Australia, New South Wales, New Holland (= Syd...",IUCN � Lower Risk (lc); common but vulnerable ...,False


In [29]:
# (rows, columns) of the filtered table.
Df_copy.shape

(4993, 6)

Remove the non-ASCII (mis-encoded) characters from the text columns of the dataframe.

In [43]:
def remove_non_ascii(text):
    """Return ``text`` with every non-ASCII character removed.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str()`` first.
        Missing values (``None`` or a float NaN) are returned unchanged so that
        they stay detectable as missing instead of becoming the literal strings
        ``"None"`` / ``"nan"``.

    Returns
    -------
    str, or the original missing value
        The characters of ``str(text)`` whose code point is below 128.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return text
    return ''.join(ch for ch in str(text) if ord(ch) < 128)
# Strip non-ASCII characters from every Status value.
Df_copy["Status"] = Df_copy["Status"].map(remove_non_ascii)

In [44]:
# Strip non-ASCII characters from every Locality value.
Df_copy["Locality"] = Df_copy["Locality"].map(remove_non_ascii)

In [45]:
# Strip non-ASCII characters from every Common_name value.
Df_copy["Common_name"] = Df_copy["Common_name"].map(remove_non_ascii)

In [46]:
# Preview the cleaned text columns with a fresh 0-based index (display only).
Df_copy.reset_index(drop = True).head()

Unnamed: 0,Species,Family,Common_name,Locality,Status,Extinct
0,aculeatus,Tachyglossidae,Short-beaked Echidna,"Australia, New South Wales, New Holland (= Syd...",Lower risk,False
1,attenboroughi,Tachyglossidae,Sir Davids Long-beaked Echidna.,"Indonesia, Prov. of Papua (= Irian Jaya), Cycl...",Check Appendix,False
2,bartoni,Tachyglossidae,Eastern Long-beaked Echidna,"Papua New Guinea, Albert Edward Range, Mount V...",Endangered,False
3,bruijni,Tachyglossidae,Western Long-beaked Echidna.,"Indonesia, Prov. of Papua (= Irian Jaya), Voge...",Endangered,False
4,anatinus,Ornithorhynchidae,Platypus.,"Australia, New South Wales, New Holland (= Syd...",Lower risk,False


Assign a short status label to each animal, replacing the long free-text status in the column.

In [47]:
# Boolean mask of the rows whose Status mentions "Lower"; the index is reset for
# display only and nothing is assigned.
Df_copy["Status"].str.contains(r"Lower").reset_index(drop = True)

0        True
1       False
2       False
3       False
4        True
        ...  
4988    False
4989    False
4990    False
4991    False
4992    False
Name: Status, Length: 4993, dtype: bool

In [35]:
# Collapse the free-text "Status" notes into a small set of labels.
#
# Each rule is (label, [substring patterns]). Rules are tried top to bottom and
# the FIRST rule with a matching pattern wins, which is the same priority the
# previous nested np.where chain had. Rows that match no rule keep their
# original Status text.
#
# Fix: the old code called str.contains(pat_a, pat_b). The second positional
# argument of str.contains is `case`, not another pattern, so every `pat_b` was
# silently ignored. The alternatives are now joined with "|" (regex OR) so both
# patterns are really searched for.
STATUS_RULES = [
    ("Lower risk", ["Lower"]),
    ("Endangered", ["Endanger"]),
    ("Vulnerable", ["Vulnerable"]),
    ("Data Deficiency", ["eficient"]),
    ("Not evaluated", ["valuated"]),
    ("Threatened", ["Threatened"]),
    ("Dead but not forgotten", ["forgotten", "holotype"]),
    ("Extinct", ["xtinct"]),
    ("Least concern", ["oncern", "algoae"]),
    ("Not listed", ["isted", "Uncommon"]),
    ("Check Appendix", [
        "albiventer", "Merriam, 1900",
        "Appendix", "Stewart et al",
        "Baryshnikov", "Ellerman",
        "Roberts", "Manning and Macpherson",
    ]),
    # Kept between the two "Check Appendix" groups to preserve the original order.
    ("Exclusion", ["excluded from protection"]),
    ("Check Appendix", [
        "Robinson and Kloss",
        "Nehring", "Thomas",
        "Lichtenstein",
        "Meyer and Malikov",
        "Higgins and Petterd",
        "Lataste",
        "Bonhote",
        "Querouil et al",
        # Presumably an author name whose non-ASCII letter was stripped by the
        # earlier remove_non_ascii step; kept byte-for-byte from the old code.
        "Rmmler",
    ]),
    ("Data Deficiency", ["Action Plan"]),
]

raw_status = Df_copy["Status"]
# na=False makes a missing Status count as "no match" instead of producing a
# non-boolean mask.
status_conditions = [
    raw_status.str.contains("|".join(patterns), na=False)
    for _, patterns in STATUS_RULES
]
status_labels = [label for label, _ in STATUS_RULES]
# np.select returns, per row, the label of the first true condition and falls
# back to the untouched Status text when no condition is true.
Df_copy["Status"] = np.select(status_conditions, status_labels, default=raw_status)

In [36]:
# Check the simplified Status labels.
Df_copy.head()

Unnamed: 0,Species,Family,Common_name,Locality,Status,Extinct
3,aculeatus,Tachyglossidae,Short-beaked Echidna,"Australia, New South Wales, New Holland (= Syd...",Lower risk,False
10,attenboroughi,Tachyglossidae,Sir Davids Long-beaked Echidna.,"Indonesia, Prov. of Papua (= Irian Jaya), Cycl...",Check Appendix,False
11,bartoni,Tachyglossidae,Eastern Long-beaked Echidna,"Papua New Guinea, Albert Edward Range, Mount V...",Endangered,False
16,bruijni,Tachyglossidae,Western Long-beaked Echidna.,"Indonesia, Prov. of Papua (= Irian Jaya), Voge...",Endangered,False
19,anatinus,Ornithorhynchidae,Platypus.,"Australia, New South Wales, New Holland (= Syd...",Lower risk,False


In [49]:
# (rows, columns) of the table after the Status relabelling.
Df_copy.shape

(4993, 6)

Reset the index

In [50]:
# Replace the index left over from the dropped rows with a fresh 0-based one;
# drop=True discards the old index instead of keeping it as a column.
Df_copy = Df_copy.reset_index(drop = True)

In [51]:
# Confirm the index now starts at 0.
Df_copy.head()

Unnamed: 0,Species,Family,Common_name,Locality,Status,Extinct
0,aculeatus,Tachyglossidae,Short-beaked Echidna,"Australia, New South Wales, New Holland (= Syd...",Lower risk,False
1,attenboroughi,Tachyglossidae,Sir Davids Long-beaked Echidna.,"Indonesia, Prov. of Papua (= Irian Jaya), Cycl...",Check Appendix,False
2,bartoni,Tachyglossidae,Eastern Long-beaked Echidna,"Papua New Guinea, Albert Edward Range, Mount V...",Endangered,False
3,bruijni,Tachyglossidae,Western Long-beaked Echidna.,"Indonesia, Prov. of Papua (= Irian Jaya), Voge...",Endangered,False
4,anatinus,Ornithorhynchidae,Platypus.,"Australia, New South Wales, New Holland (= Syd...",Lower risk,False


In [52]:
# Save the cleaned table next to the raw data.
# Forward slashes are used on purpose: in an ordinary string literal "\a" is the
# BEL control character, so the old "..\csv_files\animals_project.csv" did not
# spell the intended file name. Forward slashes also work on Windows.
# The index is still written as the first column, as before.
Df_copy.to_csv("../csv_files/animals_project.csv")