In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the data.
# The file has no header row, so the column names are supplied explicitly.
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
           'hours-per-week', 'native-country', 'income']
# skipinitialspace=True drops blanks that directly follow a delimiter.
# NOTE(review): the file is presumably the UCI Adult data, whose fields are
# separated by ", " - confirm. Without the flag every text value would keep a
# leading space, which ends up in the one-hot column names and makes checks
# such as data['sex'] == 'Male' silently match nothing. The flag changes
# nothing if the file has no such blanks.
data = pd.read_csv('income_data.txt', header=None, names=columns, skipinitialspace=True)

# Convert the target variable 'income' to integer codes.
# LabelEncoder numbers the classes in sorted order.
label_encoder = LabelEncoder()
data['income'] = label_encoder.fit_transform(data['income'])  # <=50K -> 0, >50K -> 1

In [18]:
# Nominal (text-valued) features that are expanded into indicator columns.
categorical_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
# One-hot encode the listed columns; all other columns are carried over unchanged.
# Note: this rebinds `data`, so the cell cannot be re-run without reloading the file.
data = pd.get_dummies(data=data, columns=categorical_columns)

In [19]:
# Separate the target column from the feature matrix.
y = data['income']
X = data.drop(columns='income')

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Standardise the features. The scaler is fitted on the training rows only and
# the same fitted transformation is then applied to both subsets.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Candidate classifiers, keyed by the display name used in the printed report.
# All use scikit-learn defaults except CART, which gets a fixed seed (see below).
models = {
    'Logistic Regression (LR)': LogisticRegression(),
    'Linear Discriminant Analysis (LDA)': LinearDiscriminantAnalysis(),
    'K-Nearest Neighbors (KNN)': KNeighborsClassifier(),
    # The tree builder permutes features at every split, so ties between equally
    # good splits are broken at random; a fixed random_state makes the CART
    # scores repeatable across runs.
    'Classification and Regression Tree (CART)': DecisionTreeClassifier(random_state=42),
    'Naive Bayes (NB)': GaussianNB(),
    'Support Vector Machine (SVM)': SVC()
}

# Compare the models: each one is fitted on the scaled training features and
# scored on the scaled test features.
for name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)

    # Predict labels for the test rows
    y_pred = model.predict(X_test_scaled)

    # Report overall accuracy followed by the per-class precision/recall/F1 table
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} - Accuracy: {accuracy:.4f}")
    print(f"Classification Report for {name}:\n{classification_report(y_test, y_pred)}\n")

Logistic Regression (LR) - Accuracy: 0.8549
Classification Report for Logistic Regression (LR):
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      7455
           1       0.74      0.61      0.66      2314

    accuracy                           0.85      9769
   macro avg       0.81      0.77      0.79      9769
weighted avg       0.85      0.85      0.85      9769


Linear Discriminant Analysis (LDA) - Accuracy: 0.8432
Classification Report for Linear Discriminant Analysis (LDA):
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      7455
           1       0.71      0.57      0.63      2314

    accuracy                           0.84      9769
   macro avg       0.79      0.75      0.77      9769
weighted avg       0.84      0.84      0.84      9769


K-Nearest Neighbors (KNN) - Accuracy: 0.8211
Classification Report for K-Nearest Neighbors (KNN):
              precision    recall 