In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import train_test_split, cross_val_score

# Input data file
filename = 'income_data.txt'

# Read the data, keeping at most `max_records` rows for each income class
features_data = []
labels_data = []  # not populated in this script; kept so the name stays defined
category_one_counter = 0  # rows accepted with label '<=50K'
category_two_counter = 0  # rows accepted with label '>50K'
max_records = 25000

with open(filename, 'r') as file_handle:
    # Iterate over the file lazily: the loop may stop early, so there is no
    # reason to load every line up front with readlines().
    for line_content in file_handle:
        if category_one_counter >= max_records and category_two_counter >= max_records:
            break

        # Rows containing '?' are skipped (presumably missing values)
        if '?' in line_content:
            continue

        # Strip only the line terminator. Slicing with [:-1] would cut the
        # last character of the label when the final line has no newline.
        row_values = line_content.rstrip('\n').split(', ')

        # The two labels are mutually exclusive, hence if/elif.
        if row_values[-1] == '<=50K' and category_one_counter < max_records:
            features_data.append(row_values)
            category_one_counter += 1
        elif row_values[-1] == '>50K' and category_two_counter < max_records:
            features_data.append(row_values)
            category_two_counter += 1

# Convert the collected rows to a numpy array
features_array = np.array(features_data)

# Convert string data to numeric form.
# encoders_list receives one LabelEncoder per non-numeric column, in column
# order, so the last entry is the encoder of the label column.
encoders_list = []
transformed_array = np.empty(features_array.shape)

# The first record decides how a column is treated: if its value consists of
# digits the column is copied as-is, otherwise the column is label-encoded.
first_record = features_array[0]
for column_index in range(len(first_record)):
    raw_column = features_array[:, column_index]
    if first_record[column_index].isdigit():
        transformed_array[:, column_index] = raw_column
        continue

    column_encoder = preprocessing.LabelEncoder()
    encoders_list.append(column_encoder)
    transformed_array[:, column_index] = column_encoder.fit_transform(raw_column)

# Every column except the last is a feature; the last column is the label
input_features = transformed_array[:, :-1].astype(int)
output_labels = transformed_array[:, -1].astype(int)

# Split the data: 25% is held out for validation
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    input_features, output_labels, test_size=0.25, random_state=7
)

# Create the SVM classifier and train it on the training split.
# The model used to be fitted on the full data set first and then immediately
# re-created; that extra fit was discarded work and has been removed.
svm_model = OneVsOneClassifier(LinearSVC(random_state=42))
svm_model.fit(X_train_split, y_train_split)

# Predictions for the held-out split (not used further in this script)
predictions_test = svm_model.predict(X_test_split)

# Compute the weighted F1 score with 4-fold cross-validation over all data.
# cross_val_score fits clones of the estimator, so svm_model itself stays
# trained on the training split only.
f1_scores = cross_val_score(svm_model, input_features, output_labels, scoring='f1_weighted', cv=4)
print("F1 оценка модели: " + str(round(100*f1_scores.mean(), 2)) + "%")

# Sample record to classify (features only, no label)
sample_input = ['37', 'Private', '215646', 'HS-grad', '9', 'Never-married',
               'Handlers-cleaners', 'Not-in-family', 'White', 'Male',
               '0', '0', '40', 'United-States']

# Encode the sample: digit strings become ints, every other value is passed
# through the next encoder in encoders_list (encoders are consumed in order).
encoded_sample = []
encoder_counter = 0

for raw_value in sample_input:
    if raw_value.isdigit():
        encoded_sample.append(int(raw_value))
        continue

    # Categorical feature: use the encoder at the current position
    current_encoder = encoders_list[encoder_counter]
    encoded_sample.append(int(current_encoder.transform([raw_value])[0]))
    encoder_counter += 1

# Shape the sample as a single-row 2D array for the classifier
final_input = np.array(encoded_sample).reshape(1, -1)

# Predict, then map the numeric class back to its text label using the last
# encoder in encoders_list
predicted_label = svm_model.predict(final_input)
label_encoder = encoders_list[-1]
predicted_name = label_encoder.inverse_transform(predicted_label)[0]
print("Результат предсказания: " + predicted_name)

F1 оценка модели: 75.95%
Результат предсказания: <=50K
