In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time

train_data = pd.read_csv('Desktop/Homework/data mining/as1/Census Income Data Set/adult.data', header=None)
test_data = pd.read_csv('Desktop/Homework/data mining/as1/Census Income Data Set/adult.test', header=None, skiprows=1)
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 
           'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

train_data.columns = columns
test_data.columns = columns

# Data Preparation
def preprocess_data(data):
    # Handle Missing Values
    data = data.replace(' ?', None).dropna()
    
    # Categorical variables
    le = LabelEncoder()
    for column in data.select_dtypes(include=['object']).columns:
        data[column] = le.fit_transform(data[column])
    
    return data

train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Split features and labels
X_train = train_data.drop('income', axis=1)
y_train = train_data['income']
X_test = test_data.drop('income', axis=1)
y_test = test_data['income']

# Normalizing Numerical Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

def evaluate_model(model, X_train, y_train, X_test, y_test):
    # reocrd time
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    # record test time
    start_time = time.time()
    y_pred = model.predict(X_test)
    testing_time = time.time() - start_time
    
    # Calculation performance indicators
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return accuracy, precision, recall, f1, conf_matrix, training_time, testing_time

# Decision Tree
decision_tree = DecisionTreeClassifier()
dt_results = evaluate_model(decision_tree, X_train, y_train, X_test, y_test)

# Support Vector Machines (SVM)
svm = SVC()
svm_results = evaluate_model(svm, X_train, y_train, X_test, y_test)

# comparing the results
print("Decision Tree Results:", dt_results)
print("SVM Results:", svm_results)

Decision Tree Results: (0.799136786188579, 0.5875940825330911, 0.6118918918918919, 0.5994968886535151, array([[9771, 1589],
       [1436, 2264]]), 0.10006523132324219, 0.0015299320220947266)
SVM Results: (0.846347941567065, 0.7572383073496659, 0.5513513513513514, 0.6380982170785111, array([[10706,   654],
       [ 1660,  2040]]), 5.7301270961761475, 4.05870795249939)
