In [1]:
#метода - https://alexbmstu.github.io/2019/

#pandas - работка с датасетами
#numpy - работа с матрицами и др.
#на локалке необходимо установить библиотеки "pip3 install pandas" и тд
import pandas as pd
import numpy as np
import random as rnd
import math as math

#seaborn , matplotlib - графическое представление данных
#import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.mode.chained_assignment = None  # default='warn'

# Освоение библиотек

In [2]:
#загрузка данных из csv
train_df = pd.read_csv('train_longevity.csv')
test_df = pd.read_csv('test_longevity.csv')
combine = [train_df, test_df]


In [3]:
train_df

Unnamed: 0,Id,Longevity,Education,Sex,Age,Pet,Children,Region,Activity,MedExam,Sport
0,1,0,3,female,73,1,0,21,725,,
1,2,1,1,male,75,1,0,17,7128,+,+
2,3,1,3,male,73,0,0,31,793,,
3,4,1,1,male,74,1,0,11,5310,+,
4,5,0,3,female,74,0,0,37,805,,
...,...,...,...,...,...,...,...,...,...,...,...
852,853,0,3,male,71,1,1,26,1525,,+
853,854,1,1,male,72,0,1,17,3940,+,
854,855,0,2,male,76,1,0,24,2600,,
855,856,1,3,male,72,0,1,39,935,,


### описание данных

- Longevity - Класс активного долголетия: 1 - человек доживет до 90 лет; 0 - нет
- Id - Идентификатор пожилого человека;
- Education - Образование: 1 - высшее; 2 - среднее; 3 без образования;
- Sex - Пол;
- Age - Возраст;
- Pet - Пожилой человек ухаживает за домашними животными: указано количество;
- Children - Пожилой человек проживают с детьми/внуками/близкими родственниками: указано - количество проживающих совместно в пожилым человеком;
- Region - Регион проживания;
- Activity - Уровень физической активности (количество шагов в день): данные получены от специального приложения;
- MedExam - Посещение поликлиники (за послений год): кодирование посещений на основе заполненной медицинской карточки;
- Sport - Физические упражнения: '+' пожилой человек занимается спортом (ходьба, бег, плаванье); '-' не занимается.

In [4]:
print(train_df.columns.values) #названия столбцов для тренировочных данных

['Id' 'Longevity' 'Education' 'Sex' 'Age' 'Pet' 'Children' 'Region'
 'Activity' 'MedExam' 'Sport']


In [5]:
print(test_df.columns.values) #назания столбцов для тестовых данных

['Id' 'Education' 'Sex' 'Age' 'Pet' 'Children' 'Region' 'Activity'
 'MedExam' 'Sport' 'Longevity']


In [6]:
train_df.shape #размерность данных

(857, 11)

In [7]:
train_df.info() #информация о полях

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 857 entries, 0 to 856
Data columns (total 11 columns):
Id           857 non-null int64
Longevity    857 non-null int64
Education    857 non-null int64
Sex          857 non-null object
Age          680 non-null object
Pet          857 non-null int64
Children     857 non-null int64
Region       857 non-null object
Activity     857 non-null int64
MedExam      196 non-null object
Sport        236 non-null object
dtypes: int64(6), object(5)
memory usage: 73.8+ KB


In [8]:
train_df.describe() #доп данные по столбцам

Unnamed: 0,Id,Longevity,Education,Pet,Children,Activity
count,857.0,857.0,857.0,857.0,857.0,857.0
mean,429.0,0.383897,2.309218,0.525088,0.378063,3267.183197
std,247.538886,0.486617,0.83656,1.089302,0.793893,5048.489704
min,1.0,0.0,1.0,0.0,0.0,0.0
25%,215.0,0.0,2.0,0.0,0.0,793.0
50%,429.0,0.0,3.0,0.0,0.0,1446.0
75%,643.0,1.0,3.0,1.0,0.0,3128.0
max,857.0,1.0,3.0,8.0,6.0,51233.0


In [9]:
train_df.nunique() #количество уникальных значений для каждого столбца (не подробно)

Id           857
Longevity      2
Education      3
Sex            2
Age           12
Pet            7
Children       7
Region        51
Activity     229
MedExam        5
Sport          2
dtype: int64

In [10]:
#вывод количеств уникальных значений по столбцам (подробно)
feature_names = train_df.columns.tolist()
for column in feature_names:
    print (column)
    print (train_df[column].value_counts(dropna=False))

Id
857    1
294    1
292    1
291    1
290    1
      ..
568    1
567    1
566    1
565    1
1      1
Name: Id, Length: 857, dtype: int64
Longevity
0    528
1    329
Name: Longevity, dtype: int64
Education
3    473
1    208
2    176
Name: Education, dtype: int64
Sex
female    557
male      300
Name: Sex, dtype: int64
Age
NaN       177
73        162
74        154
72         93
75         90
76         63
71         37
77         30
70         24
78         19
79          6
#ЗНАЧ!      1
80          1
Name: Age, dtype: int64
Pet
0    582
1    203
2     27
4     18
3     16
8      6
5      5
Name: Pet, dtype: int64
Children
0    652
1    114
2     78
5      4
4      4
3      4
6      1
Name: Children, dtype: int64
Region
34             131
11              85
17              74
31              69
26              60
36              46
23              43
24              32
35              29
37              26
33              24
21              23
29              18
13              17
19    

In [11]:
#сводная таблица для Education и Longevity
train_df[['Education', 'Longevity']].groupby(['Education'], as_index=False).mean().sort_values(by='Longevity', ascending = False)

Unnamed: 0,Education,Longevity
0,1,0.625
1,2,0.471591
2,3,0.245243


In [12]:
#сводная таблица для Sex и longevity
train_df[['Sex', 'Longevity']].groupby(['Sex'], as_index=False).mean().sort_values(by='Longevity', ascending = False)

Unnamed: 0,Sex,Longevity
1,male,0.743333
0,female,0.190305


In [13]:
#сводная таблица для Pet и longevity
train_df[['Pet', 'Longevity']].groupby(['Pet'], as_index=False).mean().sort_values(by='Longevity', ascending = False)

Unnamed: 0,Pet,Longevity
1,1,0.53202
2,2,0.481481
0,0,0.345361
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [14]:
#сводная таблица для Children и longevity
train_df[['Children', 'Longevity']].groupby(['Children'], as_index=False).mean().sort_values(by='Longevity', ascending = False)

Unnamed: 0,Children,Longevity
1,1,0.535088
2,2,0.512821
3,3,0.5
0,0,0.345092
5,5,0.25
4,4,0.0
6,6,0.0


In [15]:
# отрисовка зависимости возраста от долголетия 
# (см. описание параметра долголетие (Longevity) в методе)

# g = sns.FacetGrid(train_df, col = 'Longevity')
# g.map(plt.hist, 'Age', bins = 20)

In [16]:
#Вывод уникальных значений
print(train_df['Age'].unique())
print(test_df['Age'].unique())

['73' '75' '74' nan '77' '70' '72' '71' '78' '76' '79' '#ЗНАЧ!' '80']
['76' '73' nan '75' '74' '71' '72' '77' '78' '70' '80' '#ЗНАЧ!']


In [17]:
train_df['Age'].unique()

array(['73', '75', '74', nan, '77', '70', '72', '71', '78', '76', '79',
       '#ЗНАЧ!', '80'], dtype=object)

# предобработка датасета

In [18]:
#1) ОБработка признака age
#удалим некорректные значения
print(train_df['Age'].value_counts())
idmax = train_df['Age'].value_counts().idxmax() #медианное значение age - 73

idmax = 73
#изменение train_df и test_df с заменой некорректного зачения на медианное
# v!=v  <=>   math.isnan(v) - проверка на NaN.
train_df['Age'] = train_df['Age'].map(lambda v: idmax if (v == '#ЗНАЧ!' or v!= v) else v).astype(np.float32)
test_df['Age'] = test_df['Age'].map(lambda v: idmax if (v == '#ЗНАЧ!' or v != v) else v).astype(np.float32)

train_df['Age'].value_counts() #проверка

73        162
74        154
72         93
75         90
76         63
71         37
77         30
70         24
78         19
79          6
#ЗНАЧ!      1
80          1
Name: Age, dtype: int64


73.0    340
74.0    154
72.0     93
75.0     90
76.0     63
71.0     37
77.0     30
70.0     24
78.0     19
79.0      6
80.0      1
Name: Age, dtype: int64

In [19]:
#2) отбрасывание признака MedExam
train_df = train_df.drop(['MedExam'], axis = 1)
test_df = test_df.drop(['MedExam'], axis = 1)

In [20]:
train_df['Sport'].value_counts()

+    161
-     75
Name: Sport, dtype: int64

In [21]:
#3) дополнение sport '+' (с вероятностью, соответствующей 
#часоте встречания + в исходном датасете) или '-'
# '+' - шифруется единицей
# '-' - шифруется нулем
plus_count = train_df['Sport'].value_counts()['+']
minus_count = train_df['Sport'].value_counts()['-']
all_count = plus_count + minus_count
train_df['Sport'] = train_df['Sport'].map(lambda v: 0 if (rnd.random()>(plus_count/all_count)) else 1).astype(np.float32)

plus_count = test_df['Sport'].value_counts()['+']
minus_count = test_df['Sport'].value_counts()['-']
all_count = plus_count + minus_count
test_df['Sport'] = test_df['Sport'].map(lambda v: 0 if (rnd.random()>(plus_count/all_count)) else 1).astype(np.float32)

train_df['Sport'].value_counts()

1.0    554
0.0    303
Name: Sport, dtype: int64

In [22]:
#4) Исключение region из анализа
train_df = train_df.drop(['Region'], axis = 1)
test_df = test_df.drop(['Region'], axis = 1)
train_df.head()

Unnamed: 0,Id,Longevity,Education,Sex,Age,Pet,Children,Activity,Sport
0,1,0,3,female,73.0,1,0,725,1.0
1,2,1,1,male,75.0,1,0,7128,1.0
2,3,1,3,male,73.0,0,0,793,0.0
3,4,1,1,male,74.0,1,0,5310,0.0
4,5,0,3,female,74.0,0,0,805,1.0


In [23]:
#5) Удаление ID
train_df = train_df.drop(['Id'], axis = 1)
test_df = test_df.drop(['Id'], axis = 1)
train_df.head()

Unnamed: 0,Longevity,Education,Sex,Age,Pet,Children,Activity,Sport
0,0,3,female,73.0,1,0,725,1.0
1,1,1,male,75.0,1,0,7128,1.0
2,1,3,male,73.0,0,0,793,0.0
3,1,1,male,74.0,1,0,5310,0.0
4,0,3,female,74.0,0,0,805,1.0


In [24]:
#6,7) Создание нового признака - family и isAlone на основе старых Children и Pet
pets = train_df['Pet'].copy()
children = train_df['Children'].copy()

family = pets.copy() #просто инициализиация
for i in range(len(pets)):
    family[i] = pets[i] + children[i]
    
isalone = family.map(lambda v: 1 if (v == 0) else 0).astype(np.float32)

#добавление новых полей
train_df['Family'] = family
train_df['IsAlone'] = isalone

#удаление children и pet
train_df = train_df.drop(['Children', 'Pet'], axis=1)

#для test_df
pets = test_df['Pet'].copy()
children = test_df['Children'].copy()

family = pets.copy() #просто инициализиация
for i in range(len(pets)):
    family[i] = pets[i] + children[i]
    
isalone = family.map(lambda v: 1 if (v == 0) else 0).astype(np.float32)

#добавление новых полей
test_df['Family'] = family
test_df['IsAlone'] = isalone

#удаление children и pet
test_df = test_df.drop(['Children', 'Pet'], axis=1)

train_df.head()

Unnamed: 0,Longevity,Education,Sex,Age,Activity,Sport,Family,IsAlone
0,0,3,female,73.0,725,1.0,1,0.0
1,1,1,male,75.0,7128,1.0,1,0.0
2,1,3,male,73.0,793,0.0,0,1.0
3,1,1,male,74.0,5310,0.0,1,0.0
4,0,3,female,74.0,805,1.0,0,1.0


In [25]:
#8) Добавление признака социального статуса пожилого человека (пусть будет SocialStatus = Age*Education)
socialStatus = train_df['Age'].copy() #инициализация
for i in range(len(train_df['Age'])):
    socialStatus[i] = train_df['Age'][i] * train_df['Education'][i]
    
#Применим к SocialStatus min-max нормализацию
min = socialStatus.min()
max = socialStatus.max()
socialStatus = socialStatus.map(lambda v: (v-min)/(max-min)).astype(np.float32)

#добавление признака в датасет train_df
train_df['SocialStatus'] = socialStatus

socialStatus = test_df['Age'].copy() #инициализация
for i in range(len(test_df['Age'])):
    socialStatus[i] = test_df['Age'][i] * test_df['Education'][i]
    
#Применим к SocialStatus min-max нормализацию
min = socialStatus.min()
max = socialStatus.max()
socialStatus = socialStatus.map(lambda v: (v-min)/(max-min)).astype(np.float32)

#добавление признака в датасет test_df
test_df['SocialStatus'] = socialStatus

#проверка
train_df

Unnamed: 0,Longevity,Education,Sex,Age,Activity,Sport,Family,IsAlone,SocialStatus
0,0,3,female,73.0,725,1.0,1,0.0,0.892216
1,1,1,male,75.0,7128,1.0,1,0.0,0.029940
2,1,3,male,73.0,793,0.0,0,1.0,0.892216
3,1,1,male,74.0,5310,0.0,1,0.0,0.023952
4,0,3,female,74.0,805,1.0,0,1.0,0.910180
...,...,...,...,...,...,...,...,...,...
852,0,3,male,71.0,1525,1.0,2,0.0,0.856287
853,1,1,male,72.0,3940,1.0,1,0.0,0.011976
854,0,2,male,76.0,2600,1.0,1,0.0,0.491018
855,1,3,male,72.0,935,0.0,1,0.0,0.874251


In [26]:
#9) Cоздать новый признак диапазонов возрасного равновесия на основе признака Age,
#т.к. это поможет разделить пожилых людей на группы условно равновесного состояния 
#(смертность повышается в определенные периоды времени между 70 и 80 годами, и в 
#другие моменты резко снижается) для следующих интервалов:
#{ (...,70](70,72],(72,74],(74,76],(76,78], (78, 80],(80,..)}
#      1     2        3     4        5         6         7

def calcRange(age):
    if age in range(71):
        return 1
    if age in range(71,73):
        return 2
    if age in range(73, 75):
        return 3
    if age in range(75, 77):
        return 4
    if age in range(77,79):
        return 5
    if age in range(79,80):
        return 6
    if age > 80:
        return 7
    
    
ageRanges = train_df['Age'].map(lambda v: calcRange(v)).astype(np.float32)
train_df['AgeRanges'] = ageRanges


ageRanges = test_df['Age'].map(lambda v: calcRange(v)).astype(np.float32)
test_df['AgeRanges'] = ageRanges

print (ageRanges.value_counts())

3.0    279
4.0     82
2.0     54
5.0     30
1.0      6
Name: Age, dtype: int64


In [27]:
#Замена male и female на 1 и 0
sexes = train_df['Sex'].copy()
sexes_b = sexes.map(lambda v: 1 if (v == 'male') else 0).astype(float)
train_df['Sex'] = sexes_b

sexes = test_df['Sex'].copy()
sexes_b = sexes.map(lambda v: 1 if (v == 'male') else 0).astype(float)
test_df['Sex'] = sexes_b

train_df.head()

Unnamed: 0,Longevity,Education,Sex,Age,Activity,Sport,Family,IsAlone,SocialStatus,AgeRanges
0,0,3,0.0,73.0,725,1.0,1,0.0,0.892216,3.0
1,1,1,1.0,75.0,7128,1.0,1,0.0,0.02994,4.0
2,1,3,1.0,73.0,793,0.0,0,1.0,0.892216,3.0
3,1,1,1.0,74.0,5310,0.0,1,0.0,0.023952,3.0
4,0,3,0.0,74.0,805,1.0,0,1.0,0.91018,3.0


In [28]:
age = train_df['Age'].copy()
minage = age.min()
maxage = age.max()
max_min = maxage - minage

for i in range(len(age)):    
    age[i] = ((age[i]-minage)/(max_min)).astype(np.float32)
train_df['Age'] = age
print (train_df)

age = test_df['Age'].copy()
minage = age.min()
maxage = age.max()
max_min = maxage - minage
for i in range(len(age)):
    age[i] = ((age[i]-minage)/(max_min)).astype(np.float32)
test_df['Age'] = age

train_df.head()

     Longevity  Education  Sex  Age  Activity  Sport  Family  IsAlone  \
0            0          3  0.0  0.3       725    1.0       1      0.0   
1            1          1  1.0  0.5      7128    1.0       1      0.0   
2            1          3  1.0  0.3       793    0.0       0      1.0   
3            1          1  1.0  0.4      5310    0.0       1      0.0   
4            0          3  0.0  0.4       805    1.0       0      1.0   
..         ...        ...  ...  ...       ...    ...     ...      ...   
852          0          3  1.0  0.1      1525    1.0       2      0.0   
853          1          1  1.0  0.2      3940    1.0       1      0.0   
854          0          2  1.0  0.6      2600    1.0       1      0.0   
855          1          3  1.0  0.2       935    0.0       1      0.0   
856          1          1  1.0  0.6     16487    1.0       2      0.0   

     SocialStatus  AgeRanges  
0        0.892216        3.0  
1        0.029940        4.0  
2        0.892216        3.0  

Unnamed: 0,Longevity,Education,Sex,Age,Activity,Sport,Family,IsAlone,SocialStatus,AgeRanges
0,0,3,0.0,0.3,725,1.0,1,0.0,0.892216,3.0
1,1,1,1.0,0.5,7128,1.0,1,0.0,0.02994,4.0
2,1,3,1.0,0.3,793,0.0,0,1.0,0.892216,3.0
3,1,1,1.0,0.4,5310,0.0,1,0.0,0.023952,3.0
4,0,3,0.0,0.4,805,1.0,0,1.0,0.91018,3.0


In [29]:
#Нормализация для train_df - Activity
activity = train_df['Activity'].copy()
min = activity.min().astype(float)
max = activity.max().astype(float)

new_activity = activity.map(lambda v: (v - min)/(max - min)).astype(np.float32)
train_df['Activity'] = new_activity

#Нормализация для test_df - Activity
activity = test_df['Activity'].copy()
min = activity.min().astype(float)
max = activity.max().astype(float)
new_activity = activity.map(lambda v: (v - min)/(max - min)).astype(np.float32)
test_df['Activity'] = new_activity

#Проверка
train_df.head()

Unnamed: 0,Longevity,Education,Sex,Age,Activity,Sport,Family,IsAlone,SocialStatus,AgeRanges
0,0,3,0.0,0.3,0.014151,1.0,1,0.0,0.892216,3.0
1,1,1,1.0,0.5,0.139129,1.0,1,0.0,0.02994,4.0
2,1,3,1.0,0.3,0.015478,0.0,0,1.0,0.892216,3.0
3,1,1,1.0,0.4,0.103644,0.0,1,0.0,0.023952,3.0
4,0,3,0.0,0.4,0.015713,1.0,0,1.0,0.91018,3.0


In [30]:
def to_float32(string, data_frame):
    col = data_frame[string].copy()
    col = col.map(lambda v: v).astype(np.float32)
    data_frame[string] = col
    return data_frame
    
train_df = to_float32('Longevity', train_df)
train_df = to_float32('Education', train_df)
train_df = to_float32('Family', train_df)
train_df = to_float32('Sex', train_df)

test_df = to_float32('Longevity', test_df)
test_df = to_float32('Education', test_df)
test_df = to_float32('Family', test_df)
test_df = to_float32('Sex', test_df)


In [31]:
#Сохранение исправленных датасетов в отдельные файлы
#train_df.to_csv('train_df_done.csv' ,index = False )
#test_df.to_csv('test_df_done.csv', index = False)

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 857 entries, 0 to 856
Data columns (total 10 columns):
Longevity       857 non-null float32
Education       857 non-null float32
Sex             857 non-null float32
Age             857 non-null float32
Activity        857 non-null float32
Sport           857 non-null float32
Family          857 non-null float32
IsAlone         857 non-null float32
SocialStatus    857 non-null float32
AgeRanges       856 non-null float32
dtypes: float32(10)
memory usage: 33.6 KB
