In [31]:
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from sklearn.datasets import load_boston

In [32]:
data = pd.read_csv("species.csv")
data.head()

Unnamed: 0,Species_ID,Park_Name,Category,Order,Family,Scientific_Name,Common_Names,Record_Status,Occurrence,Nativeness,Abundance,Seasonality,Conservation_Status
0,ACAD-1000,Acadia National Park,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,Approved,Present,Native,Rare,Resident,
1,ACAD-1001,Acadia National Park,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",Approved,Present,Native,Abundant,,
2,ACAD-1002,Acadia National Park,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Approved,Present,Not Native,Common,,Species of Concern
3,ACAD-1003,Acadia National Park,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Approved,Not Confirmed,Native,,,Endangered
4,ACAD-1004,Acadia National Park,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",Approved,Present,Unknown,Common,Breeder,


In [33]:
data.shape

(119190, 13)

In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119190 entries, 0 to 119189
Data columns (total 13 columns):
Species_ID             119190 non-null object
Park_Name              119190 non-null object
Category               119190 non-null object
Order                  117718 non-null object
Family                 117678 non-null object
Scientific_Name        119190 non-null object
Common_Names           119190 non-null object
Record_Status          119190 non-null object
Occurrence             99048 non-null object
Nativeness             94145 non-null object
Abundance              76254 non-null object
Seasonality            20110 non-null object
Conservation_Status    4704 non-null object
dtypes: object(13)
memory usage: 11.8+ MB


###### Exclude differences 

In [35]:
data['Category'].value_counts()

Vascular Plant         65184
Bird                   14593
Insect                 14349
Fungi                   6197
Nonvascular Plant       4278
Fish                    3955
Mammal                  3863
Invertebrate            1566
Reptile                 1341
Algae                    976
Slug/Snail               787
Spider/Scorpion          776
Amphibian                743
Crab/Lobster/Shrimp      582
Name: Category, dtype: int64

In [36]:
data['Record_Status'].value_counts()

Approved     86254
In Review    32936
Name: Record_Status, dtype: int64

In [37]:
data['Occurrence'].value_counts()

Present                            83277
Not Confirmed                      11958
Not Present (False Report)          2072
Not Present (Historical Report)     1310
Not Present                          431
Name: Occurrence, dtype: int64

In [38]:
data['Nativeness'].value_counts()

Native        75949
Not Native    11095
Unknown        7101
Name: Nativeness, dtype: int64

In [39]:
data['Abundance'].value_counts()

Unknown       28118
Uncommon      18778
Common        12913
Rare          10768
Occasional     3312
Abundant       2365
Name: Abundance, dtype: int64

In [40]:
data['Seasonality'].value_counts()

Breeder                                 12214
Resident                                 3215
Migratory                                2825
Vagrant                                  1660
Breeder, Winter                            64
Summer                                     26
Migratory, Winter                          24
Winter                                     20
Breeder, Resident                          20
Breeder, Migratory                         13
Resident, Winter                            9
Migratory, Vagrant                          4
Breeder, Resident, Summer                   3
Migratory, Summer                           3
Summer, Vagrant                             3
Breeder, Summer                             2
Resident, Summer                            2
Breeder, Resident, Summer, Winter           1
Winter, Vagrant                             1
Breeder, Resident, Migratory, Summer        1
Name: Seasonality, dtype: int64

In [41]:
data.isnull().sum()

Species_ID                  0
Park_Name                   0
Category                    0
Order                    1472
Family                   1512
Scientific_Name             0
Common_Names                0
Record_Status               0
Occurrence              20142
Nativeness              25045
Abundance               42936
Seasonality             99080
Conservation_Status    114486
dtype: int64

###### Identify missing data

In [42]:
data['Occurrence']=data['Occurrence'].fillna('Not confirmed')

In [43]:
data.isnull().sum()

Species_ID                  0
Park_Name                   0
Category                    0
Order                    1472
Family                   1512
Scientific_Name             0
Common_Names                0
Record_Status               0
Occurrence                  0
Nativeness              25045
Abundance               42936
Seasonality             99080
Conservation_Status    114486
dtype: int64

In [44]:
data['Conservation_Status'].value_counts()

Species of Concern     3843
Endangered              374
Under Review            194
Threatened              184
In Recovery              77
Proposed Endangered      24
Proposed Threatened       7
Extinct                   1
Name: Conservation_Status, dtype: int64

In [45]:
data['Conservation_Status']=data['Conservation_Status'].fillna('Under Review')

In [46]:
data.isnull().sum()

Species_ID                 0
Park_Name                  0
Category                   0
Order                   1472
Family                  1512
Scientific_Name            0
Common_Names               0
Record_Status              0
Occurrence                 0
Nativeness             25045
Abundance              42936
Seasonality            99080
Conservation_Status        0
dtype: int64

In [47]:
data['Nativeness']=data['Nativeness'].fillna('Under Review')

In [48]:
data.isnull().sum()

Species_ID                 0
Park_Name                  0
Category                   0
Order                   1472
Family                  1512
Scientific_Name            0
Common_Names               0
Record_Status              0
Occurrence                 0
Nativeness                 0
Abundance              42936
Seasonality            99080
Conservation_Status        0
dtype: int64

In [49]:
data['Seasonality']=data['Seasonality'].fillna('Unknown')

In [50]:
data.isnull().sum()

Species_ID                 0
Park_Name                  0
Category                   0
Order                   1472
Family                  1512
Scientific_Name            0
Common_Names               0
Record_Status              0
Occurrence                 0
Nativeness                 0
Abundance              42936
Seasonality                0
Conservation_Status        0
dtype: int64

In [51]:
data['Family']=data['Family'].fillna('Unknown')

In [52]:
data.isnull().sum()

Species_ID                 0
Park_Name                  0
Category                   0
Order                   1472
Family                     0
Scientific_Name            0
Common_Names               0
Record_Status              0
Occurrence                 0
Nativeness                 0
Abundance              42936
Seasonality                0
Conservation_Status        0
dtype: int64

In [53]:
# drop rows with missing values
data.dropna(inplace=True)

In [54]:
data.isnull().sum()

Species_ID             0
Park_Name              0
Category               0
Order                  0
Family                 0
Scientific_Name        0
Common_Names           0
Record_Status          0
Occurrence             0
Nativeness             0
Abundance              0
Seasonality            0
Conservation_Status    0
dtype: int64

In [55]:
data = pd.DataFrame(data)
data.to_csv('species1.csv',index = False, header =("Species_ID","Park_Name","Category","Order","Family","Scientific_Name","Common_Names","Record_Status","Occurrence","Nativeness","Abundance","Seasonality","Conservation_Status"))
print(data)

       Species_ID             Park_Name        Category           Order  \
0       ACAD-1000  Acadia National Park          Mammal    Artiodactyla   
1       ACAD-1001  Acadia National Park          Mammal    Artiodactyla   
2       ACAD-1002  Acadia National Park          Mammal       Carnivora   
4       ACAD-1004  Acadia National Park          Mammal       Carnivora   
8       ACAD-1008  Acadia National Park          Mammal       Carnivora   
9       ACAD-1009  Acadia National Park          Mammal       Carnivora   
11      ACAD-1011  Acadia National Park          Mammal       Carnivora   
12      ACAD-1012  Acadia National Park          Mammal       Carnivora   
14      ACAD-1014  Acadia National Park          Mammal       Carnivora   
18      ACAD-1018  Acadia National Park          Mammal       Carnivora   
19      ACAD-1019  Acadia National Park          Mammal       Carnivora   
20      ACAD-1020  Acadia National Park          Mammal      Chiroptera   
21      ACAD-1021  Acadia

In [56]:
data.shape

(75935, 13)

In [57]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75935 entries, 0 to 119189
Data columns (total 13 columns):
Species_ID             75935 non-null object
Park_Name              75935 non-null object
Category               75935 non-null object
Order                  75935 non-null object
Family                 75935 non-null object
Scientific_Name        75935 non-null object
Common_Names           75935 non-null object
Record_Status          75935 non-null object
Occurrence             75935 non-null object
Nativeness             75935 non-null object
Abundance              75935 non-null object
Seasonality            75935 non-null object
Conservation_Status    75935 non-null object
dtypes: object(13)
memory usage: 8.1+ MB


In [58]:
data.duplicated()

0         False
1         False
2         False
4         False
8         False
9         False
11        False
12        False
14        False
18        False
19        False
20        False
21        False
22        False
23        False
24        False
26        False
28        False
29        False
30        False
33        False
34        False
35        False
36        False
37        False
38        False
42        False
44        False
45        False
46        False
          ...  
119153    False
119155    False
119156    False
119158    False
119159    False
119160    False
119161    False
119162    False
119163    False
119164    False
119167    False
119169    False
119170    False
119171    False
119172    False
119173    False
119174    False
119175    False
119177    False
119178    False
119179    False
119180    False
119181    False
119182    False
119183    False
119185    False
119186    False
119187    False
119188    False
119189    False
Length: 75935, dtype: bo