# Seyed Mohammad Amin Atyabi - 830402014 - HW 7

In [1]:
import json
import pandas
from math import log
from scipy import io as sio


def split_data(data: pandas.DataFrame):
    splits = []
    for i in range(5):
        splits.append(data.iloc[i:i + 65, :])
    return splits


def train_model(data: pandas.DataFrame):
    model = {}
    P = data.loc[data['DEMOCRAT/LIBERAL'] == 1]
    len_p = len(P) + 2
    N = data.loc[data['DEMOCRAT/LIBERAL'] == 0]
    len_n = len(N) + 2
    for column in data.columns:
        if column == 'DEMOCRAT/LIBERAL': continue
        p_p = P.loc[P[column] == 1]
        p_n = N.loc[N[column] == 0]
        model[column] = {
            '11': (len(p_p) + 1) / len_p,
            '01': 1 - ((len(p_p) + 1) / len_p),
            '10': (len(p_n) + 1) / len_n,
            '00': 1 - ((len(p_n) + 1) / len_n)
        }
        desire = log(model[column]['11'] / (1 - model[column]['11'])) + log(
            model[column]['10'] / (1 - model[column]['10']))
        model[column]['bias'] = 1 if desire > 0 else 0
        model[column]['weight'] = desire
    return model


def evaluate_model(model, data: pandas.DataFrame):
    error = 0
    for _, row in data.iterrows():
        predict_1 = 1
        predict_0 = 1
        for column in data.columns:
            if column == 'DEMOCRAT/LIBERAL': continue
            predict_1 *= model[column][f'{row[column]}1']
            predict_0 *= model[column][f'{row[column]}0']
        prediction = 1 if predict_1 > predict_0 else 0
        if prediction != row['DEMOCRAT/LIBERAL']: error += 1
    return error


def print_model(model):
    print(json.dumps(model, indent=4))


columns = [
    'HANDICAPPED-INFANTS',
    'WATER-PROJECT-COST-SHARING',
    'ADOPTION-OF-THE-BUDGET-RESOLUTION',
    'PHYSICIAN-FEE-FREEZE',
    'EL-SALVADOR-AID',
    'RELIGIOUS-GROUPS-IN-SCHOOL',
    'ANTI-SATELLITE-TEST-BAN',
    'AID-TO-NICARAGUAN-CONTRAS',
    'MX-MISSILE',
    'IMMIGRATION',
    'SYNFUELS-CORP-CUTBACK',
    'EDUCATION-SPENDING',
    'SUPERFUND-RIGHT-TO-SUE',
    'CRIME',
    'DUTY-FREE-EXPORTS',
    'EXPORT-ADMINISTRATION-ACT-SOUTH-AFRICA',
    'DEMOCRAT/LIBERAL'
]

data = pandas.DataFrame(sio.loadmat('Data/vote.mat').get('Data'), columns=columns, dtype=int)

splits = split_data(data)

best_model = None
best_model_error = 0

for i in range(len(splits)):
    model = train_model(splits[i])
    avg_error = 0
    for j in range(len(splits)):
        if j == i: continue
        avg_error += evaluate_model(model, splits[j])
    avg_error /= (len(splits) - 1)
    if best_model is None or avg_error < best_model_error:
        best_model = model
        best_model_error = avg_error
    print(avg_error)

print_model(best_model)

34.0
34.0
35.75
35.75
34.75
{
    "HANDICAPPED-INFANTS": {
        "11": 0.5641025641025641,
        "01": 0.4358974358974359,
        "10": 0.7333333333333333,
        "00": 0.2666666666666667,
        "bias": 1,
        "weight": 1.2694300209805796
    },
    "WATER-PROJECT-COST-SHARING": {
        "11": 0.5384615384615384,
        "01": 0.46153846153846156,
        "10": 0.4666666666666667,
        "00": 0.5333333333333333,
        "bias": 1,
        "weight": 0.020619287202735537
    },
    "ADOPTION-OF-THE-BUDGET-RESOLUTION": {
        "11": 0.8461538461538461,
        "01": 0.15384615384615385,
        "10": 0.8,
        "00": 0.19999999999999996,
        "bias": 1,
        "weight": 3.091042453358316
    },
    "PHYSICIAN-FEE-FREEZE": {
        "11": 0.10256410256410256,
        "01": 0.8974358974358975,
        "10": 0.06666666666666667,
        "00": 0.9333333333333333,
        "bias": 0,
        "weight": -4.808111029984782
    },
    "EL-SALVADOR-AID": {
        "11": 0.3333