In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



В этом семинаре мы будем работать с данными Kaggle-соревнования Possum Regression.

В этой задаче нужно предсказать возраст опоссума по имеющимся о нём данным:

    Таргет: age - Возраст
    case - Номер наблюдения
    site - Номер участка, на котором был отловлен опоссум
    Pop - Популяция: Vic (Виктория) либо other (Новый Южный Уэльс или Квинсленд)
    sex - Пол
    hdlngth - Длина головы, в мм.
    skullw - Ширина черепа, в мм.
    totlngth - Общая длина, в см.
    taill - Длина хвоста, в см.
    footlgth - Длина стопы
    earconch - Длина ушной раковины
    eye - Расстояние от медиального канта до латерального канта правого глаза
    chest - Обхват груди (в см)
    belly - Обхват живота (в см)



In [5]:
# Read the dataset from a CSV file resolved relative to the working directory.
df = pd.read_csv('possum.csv')
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,100,7,other,m,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0
100,101,7,other,m,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0
101,102,7,other,f,6.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0
102,103,7,other,m,4.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0


## Exploratory Data Analysis

In [6]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      104 non-null    int64  
 1   site      104 non-null    int64  
 2   Pop       104 non-null    object 
 3   sex       104 non-null    object 
 4   age       102 non-null    float64
 5   hdlngth   104 non-null    float64
 6   skullw    104 non-null    float64
 7   totlngth  104 non-null    float64
 8   taill     104 non-null    float64
 9   footlgth  103 non-null    float64
 10  earconch  104 non-null    float64
 11  eye       104 non-null    float64
 12  chest     104 non-null    float64
 13  belly     104 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.5+ KB


In [None]:
# Drop the features that are not needed.
# errors='ignore' keeps this cell idempotent: because it reassigns `df` from
# itself, a second run would otherwise raise KeyError for the columns that are
# already gone.
df = df.drop(columns=['case', 'site', 'Pop', 'sex'], errors='ignore')

In [21]:
# Show the frame again to confirm which columns remain.
df

Unnamed: 0,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0
...,...,...,...,...,...,...,...,...,...,...
99,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0
100,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0
101,6.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0
102,4.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0


In [22]:
# Count missing values per column.
df.isna().sum()

age         2
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [23]:
# Show the rows where either of the two incomplete columns is missing.
df[df[['age', 'footlgth']].isna().any(axis=1)]

Unnamed: 0,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
40,5.0,88.4,57.0,83.0,36.5,,40.3,15.9,27.0,30.5
43,,85.1,51.5,76.0,35.5,70.3,52.6,14.4,23.0,27.0
45,,91.4,54.4,84.0,35.0,72.8,51.2,14.4,24.5,35.0


In [25]:
# Only a handful of rows contain a missing value, so they are dropped rather
# than imputed: remove every row that has at least one NaN.
df = df.dropna(axis=0, how='any')

## Preparing data for `ML`

In [26]:
# Separate the target column (age) from the predictor columns.
y = df['age']
X = df.drop('age', axis=1)

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=44,
)