In [5]:
import numpy as np
import pandas as pd # Для работы с данными
import matplotlib.pyplot as plt  # Библиотека для визуализации результатов 

In [6]:
data = pd.read_csv('https://raw.githubusercontent.com/a-milenkin/datasets_for_t-tests/main/athletes.csv')
data.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0


### Анализ данных

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11538 entries, 0 to 11537
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           11538 non-null  int64  
 1   name         11538 non-null  object 
 2   nationality  11538 non-null  object 
 3   sex          11538 non-null  object 
 4   dob          11537 non-null  object 
 5   height       11208 non-null  float64
 6   weight       10879 non-null  float64
 7   sport        11538 non-null  object 
 8   gold         11538 non-null  int64  
 9   silver       11538 non-null  int64  
 10  bronze       11538 non-null  int64  
dtypes: float64(2), int64(4), object(5)
memory usage: 991.7+ KB


#### Просмотр пропусков

In [10]:
data.isna().sum()

id               0
name             0
nationality      0
sex              0
dob              1
height         330
weight         659
sport            0
gold             0
silver           0
bronze           0
dtype: int64

In [11]:
data[data['height'].isna()].head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze
12,258556239,Abbas Qali,IOA,male,10/11/92,,,aquatics,0,0,0
47,469953606,Abdoullah Bamoussa,ITA,male,6/8/86,,,athletics,0,0,0
50,325809293,Abdul Omar,GHA,male,10/3/93,,,boxing,0,0,0
52,262868423,Abdulaziz Alshatti,IOA,male,10/30/90,,,fencing,0,0,0
56,897549624,Abdullah Hel Baki,BAN,male,8/1/89,,,shooting,0,0,0


### Заменяем пропуски 0

In [12]:
data = data.fillna(0)
data.isna().sum()

id             0
name           0
nationality    0
sex            0
dob            0
height         0
weight         0
sport          0
gold           0
silver         0
bronze         0
dtype: int64

### Предобработка данных

In [13]:
selectedColumns = data[['height', 'sport', 'sex']]

#### Каждой категории даем цифру

In [18]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(selectedColumns['sport'])

le.classes_

array(['aquatics', 'archery', 'athletics', 'badminton', 'basketball',
       'boxing', 'canoe', 'cycling', 'equestrian', 'fencing', 'football',
       'golf', 'gymnastics', 'handball', 'hockey', 'judo',
       'modern pentathlon', 'rowing', 'rugby sevens', 'sailing',
       'shooting', 'table tennis', 'taekwondo', 'tennis', 'triathlon',
       'volleyball', 'weightlifting', 'wrestling'], dtype=object)

In [19]:
le.transform(selectedColumns['sport'])

array([ 2,  9,  2, ..., 27, 26,  2])

#### Каждая категория это столбец

In [24]:
pd.get_dummies(selectedColumns, columns=['sport'], dtype='int', drop_first=True)

Unnamed: 0,height,sex,sport_archery,sport_athletics,sport_badminton,sport_basketball,sport_boxing,sport_canoe,sport_cycling,sport_equestrian,...,sport_rugby sevens,sport_sailing,sport_shooting,sport_table tennis,sport_taekwondo,sport_tennis,sport_triathlon,sport_volleyball,sport_weightlifting,sport_wrestling
0,1.72,male,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.68,female,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.98,male,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.83,male,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1.81,male,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11533,1.64,female,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11534,1.73,female,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11535,1.85,male,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11536,1.60,male,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [25]:
# столбец sport - категориальная переменная (не путать с континуальными)
# переведем ее в значения 0 и 1, добавив столбцы с соответствующими названиями
X = pd.get_dummies(selectedColumns, columns=['sport'], dtype='int', drop_first=True)

# столбец sex является целевой переменной, удаляем его из Х
del X['sex']

print('Пометили вид спорта спортсмена единичкой.')
X.head()

Пометили вид спорта спортсмена единичкой.


Unnamed: 0,height,sport_archery,sport_athletics,sport_badminton,sport_basketball,sport_boxing,sport_canoe,sport_cycling,sport_equestrian,sport_fencing,...,sport_rugby sevens,sport_sailing,sport_shooting,sport_table tennis,sport_taekwondo,sport_tennis,sport_triathlon,sport_volleyball,sport_weightlifting,sport_wrestling
0,1.72,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.68,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1.98,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.83,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1.81,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Переведем значения столбца sex в числа

In [36]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [37]:
le.fit(data['sex'])
le.classes_

array(['female', 'male'], dtype=object)

##### Проверяем преобразование

In [38]:
le.transform(['male', 'female', 'male'])

array([1, 0, 1])

In [39]:
le.inverse_transform([1, 0, 1])

array(['male', 'female', 'male'], dtype=object)

In [40]:
y = pd.Series(le.transform(data['sex']))
y.head()

0    1
1    0
2    1
3    1
4    1
dtype: int64

### Обучение

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [41]:
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000)
)

#### Делаем выборку

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_test.index

Index([ 5187,  6260,  2069,  7259,  4995,  7070,  6535,  5609,  5340,  1941,
       ...
        8977,  6929,  6870,   266,  1108,  8844, 10448,  3229,  4513,  9586],
      dtype='int64', length=3462)

#### Обучение

In [43]:
model.fit(X_train, y_train)
prediction = model.predict(X_test)

In [45]:
prediction[:5]

array([1, 1, 1, 1, 1])

In [44]:
model.predict_proba(X_test)

array([[0.08649093, 0.91350907],
       [0.40949037, 0.59050963],
       [0.37160317, 0.62839683],
       ...,
       [0.0747077 , 0.9252923 ],
       [0.93055115, 0.06944885],
       [0.10137369, 0.89862631]])

### Получаем точность предсказания

In [46]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, prediction)

0.6554015020219526

In [47]:
model.score(X_train, y_train)

0.6679049034175334

## Обучаем методом SVC

### Обучение

In [48]:
from sklearn.svm import SVC

clf = make_pipeline(StandardScaler(), SVC())
clf.fit(X_train, y_train)

In [49]:
clf.score(X_train, y_train)

0.7743932639920753

In [50]:
clf.score(X_test, y_test)

0.7726747544771808