## Multi-class classification

In [None]:
import numpy as np
import pandas as pd

# Read the votes table from disk
votes = pd.read_csv('votes.csv')

# Keep only the rows (counties) that belong to one of the four states
# of interest: California, Texas, New York and Pennsylvania
selected_states = ['CA', 'TX', 'NY', 'PA']
votes = votes.loc[votes['state_abbr'].isin(selected_states)]

## Count how many counties remain for each state
votes['state_abbr'].value_counts()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictor columns taken from the votes table
features = ["white", "hispanic", "poverty", "bachelor", "highschool",
            "age18under", "female", "landarea"]

# Design matrix (predictors) and class labels (state abbreviation per row)
X = votes[features]
Y = votes['state_abbr']

# Hold out 20% of the rows as a test set; the fixed random_state makes
# the split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Standardize the predictors. The scaler is fitted on the training rows
# only, and the same transformation is then applied to the test rows, so
# no statistics of the test set leak into the fitted model.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a multinomial (softmax) logistic regression.
# NOTE: the `multi_class` argument is deliberately not passed. It is
# deprecated since scikit-learn 1.5 and scheduled for removal; with the
# 'newton-cg' solver and more than two classes, the default behaviour
# (scikit-learn >= 0.22) is already the multinomial formulation.
model = LogisticRegression(solver='newton-cg')
model.fit(X_train, y_train)



In [None]:


# Class labels in the order used by the rows of model.coef_ and
# model.intercept_
print(model.classes_)

# Predict a state for every held-out row
y_pred = model.predict(X_test)

# Misclassification error: share of test rows whose predicted state
# differs from the true one
err = np.mean(y_pred != y_test)
print(f'misclassification error: {err:.3f}')

# Baseline: error obtained by predicting 'TX' for every test row
baseline_err = np.mean(y_test != 'TX')
print(f'baseline error: {baseline_err:.3f}')

## Fitted parameters: one row of coefficients (and one intercept) per class
coefficients = model.coef_
intercept = model.intercept_

## Compare the NY and TX coefficients
# Look the rows up by class label instead of hard-coding positions 1 and 3,
# so the comparison does not silently depend on the ordering of
# model.classes_. In a multinomial model the difference between two
# classes' coefficient vectors gives the per-feature change in the
# log-odds of NY versus TX (per unit of the standardized feature).
print("")
class_labels = list(model.classes_)
ny_row = class_labels.index('NY')
tx_row = class_labels.index('TX')
diff_coef = coefficients[ny_row] - coefficients[tx_row]
for feature_name, coef_gap in zip(features, diff_coef):
    print(f'{feature_name}: {coef_gap}')

