### Решаем задачу классификации физических лиц по уровню дохода

https://www.cs.toronto.edu/~delve/data/adult/desc.html

### Подготовка данных

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Adult census extract; cells holding '?' are parsed as missing (NaN).
data = pd.read_csv('adult.csv', na_values=['?'])
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# Column dtypes and non-null counts; columns with gaps show fewer non-null
# entries than the total number of rows.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        46043 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       46033 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   47985 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


Посмотрим, много ли в наших признаках пустых значений

In [4]:
# Preview the first rows whose workclass value is missing.
data.loc[data['workclass'].isna()].head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K
6,29,,227026,HS-grad,9,Never-married,,Unmarried,Black,Male,0,0,40,United-States,<=50K
13,58,,299831,HS-grad,9,Married-civ-spouse,,Husband,White,Male,0,0,35,United-States,<=50K
22,72,,132015,7th-8th,4,Divorced,,Not-in-family,White,Female,0,0,6,United-States,<=50K
35,65,,191846,HS-grad,9,Married-civ-spouse,,Husband,White,Male,0,0,40,United-States,<=50K


In [21]:
# Report, for each column that contains gaps, how many rows lack a value,
# followed by the total row count for scale. The missing rows are counted
# directly from the boolean mask instead of building a filtered DataFrame.
for column in ('workclass', 'occupation', 'native-country'):
    print('Для {} пустых строк {}'.format(column, data[column].isna().sum()))
print('Всего строк в наборе {}'.format(len(data)))

Для workclass пустых строк 2799
Для occupation пустых строк 2809
Для native-country пустых строк 857
Всего строк в наборе 48842


In [5]:
# Distinct workclass values; NaN is listed because rows with missing
# values have not been removed yet at this point.
data['workclass'].unique()

array(['Private', 'Local-gov', nan, 'Self-emp-not-inc', 'Federal-gov',
       'State-gov', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

In [6]:
# Drop every row that lacks a value in any of the three columns with gaps.
# The surviving rows keep their original labels, so the index is no longer
# contiguous after this step.
data = data.dropna(subset=['workclass', 'occupation', 'native-country'])

In [7]:
# Re-check the frame after filtering: every column should now report the
# same non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              45222 non-null  int64 
 1   workclass        45222 non-null  object
 2   fnlwgt           45222 non-null  int64 
 3   education        45222 non-null  object
 4   educational-num  45222 non-null  int64 
 5   marital-status   45222 non-null  object
 6   occupation       45222 non-null  object
 7   relationship     45222 non-null  object
 8   race             45222 non-null  object
 9   gender           45222 non-null  object
 10  capital-gain     45222 non-null  int64 
 11  capital-loss     45222 non-null  int64 
 12  hours-per-week   45222 non-null  int64 
 13  native-country   45222 non-null  object
 14  income           45222 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.5+ MB


In [9]:
# Work with a small subset of columns: three candidate features plus the target.
feature_and_target_names = ['age', 'education', 'gender', 'income']
selectedColumns = data[feature_and_target_names]

# gender and education are categorical: expand each into indicator columns
# named <column>_<value>.
encoded = pd.get_dummies(selectedColumns, columns=['gender', 'education'])

# income is the target variable, so it must not remain among the features.
X = encoded.drop(columns='income')
X.head()

Unnamed: 0,age,gender_Female,gender_Male,education_10th,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college
0,25,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,38,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,28,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,44,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5,34,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# The target variable (the income column) is categorical as well.
# Convert its values to numbers, keeping it as a single column.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [11]:
# Let the encoder learn the distinct income labels.
le.fit( data['income'] )

LabelEncoder()

In [12]:
# Labels known to the encoder after fitting.
le.classes_

array(['<=50K', '>50K'], dtype=object)

In [13]:
# Example: encoding income labels as integer codes

le.transform( [ '<=50K', '>50K', '<=50K' ] )

array([0, 1, 0])

In [14]:
# Store the encoded income column in y.
# Reuse data's index so that y carries the same row labels as X (both are
# derived from the filtered data, whose index has gaps). Without it y would
# get a fresh 0..n-1 index and any label-based operation combining X and y
# would pair up the wrong rows.
y = pd.Series(le.transform(data['income']), index=data.index)
y.head()

0    0
1    0
2    1
3    1
4    0
dtype: int32

Поделим данные на обучающую выборку и тестовую. Указываем в test_size долю датасета, которая пойдет на тестовую выборку

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,age,gender_Female,gender_Male,education_10th,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college
8605,29,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
28547,39,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
33927,41,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
14437,21,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
41839,58,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12183,39,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48310,36,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
41203,53,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
930,57,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


### Решаем задачу методом логистической регрессии

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier with the solver iteration limit set to 1000.
model = LogisticRegression(max_iter=1000)

In [19]:
# Train the model on the training split

model.fit( X_train, y_train )

LogisticRegression(max_iter=1000)

In [20]:
# Predicted class probabilities for each test row (one column per class).
predictions = model.predict_proba( X_test )

In [21]:
# First five rows of the predicted probabilities.
predictions[:5]

array([[0.99157141, 0.00842859],
       [0.7606725 , 0.2393275 ],
       [0.6040294 , 0.3959706 ],
       [0.84319756, 0.15680244],
       [0.68912704, 0.31087296]])

Получаем скор (точность предсказания) на обучающей и тестовой выборках для логистической регрессии

In [22]:
# Logistic regression score on the training split.
model.score(X_train, y_train)

0.7894242198081655

In [23]:
# Logistic regression score on the held-out test split.
model.score(X_test,y_test)

0.7920398009950249

### Решим эту же задачу методом опорных векторов (SVM)

In [24]:
from sklearn.svm import SVC

In [25]:
# Support vector classifier; only gamma is overridden, everything else is
# left at the library defaults.
model_svm = SVC(gamma='auto')

In [26]:
# Train the SVM on the same training split used for logistic regression.
model_svm.fit( X_train, y_train )

SVC(gamma='auto')

Получаем скор (точность предсказания) на обучающей и тестовой выборках для SVM

In [27]:
# SVM score on the training split.
model_svm.score(X_train, y_train)

0.7983525444343091

In [28]:
# SVM score on the held-out test split.
model_svm.score(X_test,y_test)

0.8021006080707573

Вывод: на тестовых данных точность предсказания методом SVM выше, чем у логистической регрессии (0.802 против 0.792). Дополнительно повысить точность можно попробовать за счёт масштабирования признаков с помощью StandardScaler.