In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw penguin measurements from the CSV in the working directory.
data = pd.read_csv('penguins_size.csv')

# Preview the first ten rows.
data.head(10)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,FEMALE
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,MALE
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,


In [3]:
# (rows, columns) of the raw frame
data.shape

(344, 7)

In [4]:
# Number of missing entries in each column of the raw frame
missing = data.isna().sum()
missing

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [5]:
# Share of all cells in the raw frame that are missing.
# np.prod is used because np.product was deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = np.prod(data.shape)
total_missing = missing.sum()

percent_missing = "{:.3%}".format(total_missing / total_cells)

print(percent_missing)

0.748%


In [6]:
# Categorical variables: the columns whose dtype is 'object'
s = (data.dtypes == 'object')
object_cols = s.index[s.values].tolist()
print(object_cols)

['species', 'island', 'sex']


In [7]:
# Show the distinct values of every categorical column to spot odd entries
for col in object_cols:
    print(data[col].unique())

['Adelie' 'Chinstrap' 'Gentoo']
['Torgersen' 'Biscoe' 'Dream']
['MALE' 'FEMALE' nan '.']


In [8]:
# Index labels of the rows whose 'sex' is the placeholder '.'.
# Taking labels from data.index (rather than positions from np.where) keeps the
# label-based .loc lookup below correct even if the frame is not on a default RangeIndex.
indices = data.index[data['sex'] == '.']

data.loc[indices]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.


In [9]:
# Rows at positions 330-339, i.e. the neighbourhood of the '.' entry
data.iloc[330:340]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
330,Gentoo,Biscoe,50.5,15.2,216.0,5000.0,FEMALE
331,Gentoo,Biscoe,49.8,15.9,229.0,5950.0,MALE
332,Gentoo,Biscoe,43.5,15.2,213.0,4650.0,FEMALE
333,Gentoo,Biscoe,51.5,16.3,230.0,5500.0,MALE
334,Gentoo,Biscoe,46.2,14.1,217.0,4375.0,FEMALE
335,Gentoo,Biscoe,55.1,16.0,230.0,5850.0,MALE
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.
337,Gentoo,Biscoe,48.8,16.2,222.0,6000.0,MALE
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,FEMALE
339,Gentoo,Biscoe,,,,,


The 'sex' column contains the placeholder value '.' (row 336), so this needs handling as well as the 'NaN' values. Treating it as missing and applying a bfill would change it to MALE, whereas the values in the other columns suggest FEMALE is more appropriate. Instead of updating it with a guess, I will remove this row during the imputation stage.

## Imputation
### Method 1: Fill missing values automatically

In [10]:
# Method 1: back-fill each missing value from the next row down, then set anything
# still missing (no later non-missing value to copy) to 0.
# DataFrame.bfill replaces fillna(method='bfill'), which is deprecated in pandas 2.1
# and removed in pandas 3.0.
# Rows 0 and 336 are then dropped by label; 336 is the row whose 'sex' is '.'.
# NOTE(review): the notebook does not explain why row 0 is dropped - confirm it is intended.
data2 = data.bfill(axis=0).fillna(0).drop([0, 336])

data2[330:340]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
331,Gentoo,Biscoe,49.8,15.9,229.0,5950.0,MALE
332,Gentoo,Biscoe,43.5,15.2,213.0,4650.0,FEMALE
333,Gentoo,Biscoe,51.5,16.3,230.0,5500.0,MALE
334,Gentoo,Biscoe,46.2,14.1,217.0,4375.0,FEMALE
335,Gentoo,Biscoe,55.1,16.0,230.0,5850.0,MALE
337,Gentoo,Biscoe,48.8,16.2,222.0,6000.0,MALE
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,FEMALE
339,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE


In [11]:
# Re-check the share of missing cells after imputation (expected to be zero).
# np.prod is used because np.product was deprecated in NumPy 1.25 and removed in NumPy 2.0.
missing = data2.isnull().sum()
total_cells = np.prod(data2.shape)
total_missing = missing.sum()
percent_missing = "{:.3%}".format(total_missing / total_cells)
print(percent_missing)

0.000%


In [28]:
# One-hot encode the nominal (unordered) categorical features
def one_hot(df):
    """Return a copy of ``df`` with 'sex' and 'island' replaced by one-hot columns.

    The indicator columns are appended after the remaining columns, share the
    row index of ``df`` and are named ``<column>_<category>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the 'sex' and 'island' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    encode_cols = ['sex', 'island']
    try:
        # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
        # the old `sparse` keyword was removed in 1.4.
        OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older releases only accept the original `sparse` keyword.
        OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    encoded = OH_encoder.fit_transform(df[encode_cols])
    # Give the indicator columns string names. Default integer labels would mix
    # with the string names of the other columns, and recent scikit-learn
    # estimators refuse frames whose column names are of mixed types.
    names = [
        f"{col}_{cat}"
        for col, cats in zip(encode_cols, OH_encoder.categories_)
        for cat in cats
    ]
    df_cols = pd.DataFrame(encoded, columns=names, index=df.index)
    return pd.concat([df.drop(encode_cols, axis=1), df_cols], axis=1)

# Separate the target from the predictors and hold out a validation set
def split(data):
    """Split ``data`` into train/validation predictors and targets.

    The 'species' column is the target; every other column is a predictor.
    80% of the rows go to training and 20% to validation, with a fixed
    random_state of 0.

    Returns
    -------
    tuple
        ``(X_train, X_valid, y_train, y_valid)``.
    """
    target = data['species']
    predictors = data.drop(columns=['species'])
    parts = train_test_split(
        predictors, target, train_size=0.8, test_size=0.2, random_state=0
    )
    X_train, X_valid, y_train, y_valid = parts
    return X_train, X_valid, y_train, y_valid

# Scoring
def regression_score(X_train, X_valid, y_train, y_valid):
    """Fit a 100-tree random-forest regressor and return its validation MAE.

    The model is trained on ``(X_train, y_train)`` with random_state 0 and
    scored with mean absolute error between ``y_valid`` and its predictions
    for ``X_valid``.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

def classification_score(X_train, X_valid, y_train, y_valid):
    """Fit a random-forest classifier and return its validation accuracy.

    The model is trained on ``(X_train, y_train)`` with random_state 0 and
    scored with the accuracy of its predictions for ``X_valid`` against
    ``y_valid``.
    """
    forest = RandomForestClassifier(random_state=0)
    forest.fit(X_train, y_train)
    return accuracy_score(y_valid, forest.predict(X_valid))
    

In [38]:
# One-hot encode the imputed (Method 1) frame.
data2_OH = one_hot(data2)

# Train/validation split; y_train and y_valid are reassigned later by the Method 2 cell.
X_train_1, X_valid_1, y_train, y_valid = split(data2_OH)

# Validation accuracy of the random-forest classifier on the imputed data.
classification_score(X_train_1, X_valid_1, y_train, y_valid)


0.9855072463768116

In [39]:
# Refit the classifier on the imputed training split and classify one hand-written sample.
model = RandomForestClassifier(random_state=0).fit(X_train_1, y_train)
# A single row of nine values in the column order of X_train_1
# (presumably the four measurements followed by the one-hot columns - verify against X_train_1.columns).
test = np.array([[50.6, 19.4, 193.0, 3800.0, 0, 1, 0, 1, 0]])
model.predict(test)

array(['Chinstrap'], dtype=object)

### Method 2: Drop Missing Values

In [14]:
# Method 2: discard every row that has a missing value, then drop rows 0 and 336 by label
# (the same two labels removed in Method 1; 336 is the row whose 'sex' is '.').
data3 = data.dropna().drop([0, 336])
data3.head(10)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,FEMALE
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,MALE
12,Adelie,Torgersen,41.1,17.6,182.0,3200.0,FEMALE
13,Adelie,Torgersen,38.6,21.2,191.0,3800.0,MALE
14,Adelie,Torgersen,34.6,21.1,198.0,4400.0,MALE
15,Adelie,Torgersen,36.6,17.8,185.0,3700.0,FEMALE


In [15]:
data3.shape
# 12 rows fewer than the raw 344: 10 rows with NaN values plus rows 0 and 336

(332, 7)

In [16]:
# One-hot encode the frame with missing rows dropped (Method 2).
data3_OH = one_hot(data3)

# Note: this overwrites the y_train / y_valid produced for Method 1.
X_train_2, X_valid_2, y_train, y_valid = split(data3_OH)

# Validation accuracy of the random-forest classifier on the dropped-rows data.
classification_score(X_train_2, X_valid_2, y_train, y_valid)

0.9701492537313433

Imputing is slightly better, giving 98.6% accuracy, compared to 97.0% when dropping all rows with NaN values.

Now I will export the imputed data and recreate the chosen model in clf_model.py, to be used for the Web App.

In [40]:
# Export the imputed (Method 1) frame for the web-app model.
# With its defaults, to_csv also writes the row index as an unnamed first column.
data2.to_csv('penguins_imputed.csv')