## Import

In [28]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Loading data

In [14]:
df = pd.read_csv("crocodile_dataset.csv")
df.head(5)

Unnamed: 0,Observation ID,Common Name,Scientific Name,Family,Genus,Observed Length (m),Observed Weight (kg),Age Class,Sex,Date of Observation,Country/Region,Habitat Type,Conservation Status,Observer Name,Notes
0,1,Morelet's Crocodile,Crocodylus moreletii,Crocodylidae,Crocodylus,1.9,62.0,Adult,Male,31-03-2018,Belize,Swamps,Least Concern,Allison Hill,Cause bill scientist nation opportunity.
1,2,American Crocodile,Crocodylus acutus,Crocodylidae,Crocodylus,4.09,334.5,Adult,Male,28-01-2015,Venezuela,Mangroves,Vulnerable,Brandon Hall,Ago current practice nation determine operatio...
2,3,Orinoco Crocodile,Crocodylus intermedius,Crocodylidae,Crocodylus,1.08,118.2,Juvenile,Unknown,07-12-2010,Venezuela,Flooded Savannas,Critically Endangered,Melissa Peterson,Democratic shake bill here grow gas enough ana...
3,4,Morelet's Crocodile,Crocodylus moreletii,Crocodylidae,Crocodylus,2.42,90.4,Adult,Male,01-11-2019,Mexico,Rivers,Least Concern,Edward Fuller,Officer relate animal direction eye bag do.
4,5,Mugger Crocodile (Marsh Crocodile),Crocodylus palustris,Crocodylidae,Crocodylus,3.75,269.4,Adult,Unknown,15-07-2019,India,Rivers,Vulnerable,Donald Reid,Class great prove reduce raise author play mov...


## Data analysis

In [15]:
#print(df.astype)
print(df.dtypes)
#print(df.shape)
print(df.isna().sum())

Observation ID            int64
Common Name              object
Scientific Name          object
Family                   object
Genus                    object
Observed Length (m)     float64
Observed Weight (kg)    float64
Age Class                object
Sex                      object
Date of Observation      object
Country/Region           object
Habitat Type             object
Conservation Status      object
Observer Name            object
Notes                    object
dtype: object
Observation ID          0
Common Name             0
Scientific Name         0
Family                  0
Genus                   0
Observed Length (m)     0
Observed Weight (kg)    0
Age Class               0
Sex                     0
Date of Observation     0
Country/Region          0
Habitat Type            0
Conservation Status     0
Observer Name           0
Notes                   0
dtype: int64


## Data cleaning

In [16]:
id = df["Observation ID"].copy()
y_train = df["Scientific Name"].copy()
df = df.drop(['Observation ID'], axis=1)

df.head(5)
df.columns

Index(['Common Name', 'Scientific Name', 'Family', 'Genus',
       'Observed Length (m)', 'Observed Weight (kg)', 'Age Class', 'Sex',
       'Date of Observation', 'Country/Region', 'Habitat Type',
       'Conservation Status', 'Observer Name', 'Notes'],
      dtype='object')

In [17]:
catList = ['Family', 'Genus', 'Age Class', 'Sex', 'Country/Region', 'Habitat Type', 'Conservation Status']
floatList = ['Observed Length (m)', 'Observed Weight (kg)']

In [18]:
dfFloat = df[floatList].copy()
dfFloat = dfFloat.astype(int)

In [19]:
df = df.astype({'Family': 'category', 'Genus': 'category', 'Age Class': 'category', 'Sex': 'category', 'Country/Region': 'category', 'Habitat Type': 'category', 'Conservation Status': 'category'})

dfCat = df[catList].copy()

dfCat = pd.get_dummies(dfCat, columns=catList, drop_first=True).astype(int)

dfCat.head(5)

Unnamed: 0,Genus_Mecistops,Genus_Osteolaemus,Age Class_Hatchling,Age Class_Juvenile,Age Class_Subadult,Sex_Male,Sex_Unknown,Country/Region_Belize,Country/Region_Cambodia,Country/Region_Cameroon,...,Habitat Type_Shaded Forest Rivers,Habitat Type_Slow Rivers,Habitat Type_Slow Streams,Habitat Type_Small Streams,Habitat Type_Swamps,Habitat Type_Tidal Rivers,Conservation Status_Data Deficient,Conservation Status_Endangered,Conservation Status_Least Concern,Conservation Status_Vulnerable
0,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [20]:
print(dfFloat.shape)
print(dfCat.shape)

(1000, 2)
(1000, 85)


In [21]:
dfFinal = pd.concat([dfFloat, dfCat], axis=1, ignore_index=True)

print(dfFinal.shape)

dfFinal.head(5)

(1000, 87)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,77,78,79,80,81,82,83,84,85,86
0,1,62,0,0,0,0,0,1,0,1,...,0,0,0,0,1,0,0,0,1,0
1,4,334,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,118,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2,90,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,269,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [22]:
y_train

0        Crocodylus moreletii
1           Crocodylus acutus
2      Crocodylus intermedius
3        Crocodylus moreletii
4        Crocodylus palustris
                ...          
995         Crocodylus suchus
996          Crocodylus halli
997    Mecistops cataphractus
998    Mecistops cataphractus
999          Crocodylus halli
Name: Scientific Name, Length: 1000, dtype: object

In [23]:
y_train = pd.DataFrame(y_train)

y_train["Scientific Name"] = y_train["Scientific Name"].astype("category").cat.codes + 1

y_train

Unnamed: 0,Scientific Name
0,6
1,1
2,3
3,6
4,9
...,...
995,14
996,2
997,15
998,15


## Scaler

In [24]:
scaler = StandardScaler()
scaler.fit(dfFinal)
X_train = pd.DataFrame(scaler.transform(dfFinal))

X_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,77,78,79,80,81,82,83,84,85,86
0,-0.822717,-0.532965,-0.353354,-0.342518,-0.226991,-0.490607,-0.572731,1.441153,-0.740262,7.0,...,-0.169725,-0.123404,-0.142857,-0.169725,2.325996,-0.100504,-0.360477,-0.243561,1.266557,-0.45257
1,1.854237,1.020487,-0.353354,-0.342518,-0.226991,-0.490607,-0.572731,1.441153,-0.740262,-0.142857,...,-0.169725,-0.123404,-0.142857,-0.169725,-0.429923,-0.100504,-0.360477,-0.243561,-0.789542,2.209605
2,-0.822717,-0.213137,-0.353354,-0.342518,-0.226991,2.038293,-0.572731,-0.693889,1.350873,-0.142857,...,-0.169725,-0.123404,-0.142857,-0.169725,-0.429923,-0.100504,-0.360477,-0.243561,-0.789542,-0.45257
3,0.069601,-0.373051,-0.353354,-0.342518,-0.226991,-0.490607,-0.572731,1.441153,-0.740262,-0.142857,...,-0.169725,-0.123404,-0.142857,-0.169725,-0.429923,-0.100504,-0.360477,-0.243561,1.266557,-0.45257
4,0.961919,0.649257,-0.353354,-0.342518,-0.226991,-0.490607,-0.572731,-0.693889,1.350873,-0.142857,...,-0.169725,-0.123404,-0.142857,-0.169725,-0.429923,-0.100504,-0.360477,-0.243561,-0.789542,2.209605


## Model fitting

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

y_train

Unnamed: 0,Scientific Name
864,10
838,1
686,18
582,6
530,14
...,...
919,15
939,10
915,6
239,6


In [30]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

accuracy

  y = column_or_1d(y, warn=True)


0.99375