#### Model Controls

In [1]:
# Toggle for the GridSearchCV cell further down: the search only runs when this is True.
RUN_GRID_SEARCH_CV = False

#### Base Library Imports

In [2]:
import pandas as pd
import numpy as np 
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
%matplotlib inline



#### Load Data

In [3]:
# Read the semicolon-delimited file and use its "id" column as the row index
df = pd.read_csv('../../../data/cardio_train.csv', delimiter=';').set_index("id")
# Work on a de-duplicated deep copy so the raw frame `df` stays untouched
df_clean = df.drop_duplicates().copy(deep=True)



#### Transformations

In [4]:
# %%time

# NOTE: every statement below overwrites columns of df_clean in place, so running
# this cell twice without re-running the load cell converts age and gender twice.

# Age in years: divide by 365 and round to a whole number
df_clean['age'] = df_clean['age'].div(365).round().astype('int')

# Gender: value 2 becomes 1 (male), every other value becomes 0 (female)
df_clean['gender'] = np.where(df_clean['gender'].eq(2), 1, 0)

# Body mass index: weight divided by the square of (height / 100)
df_clean['bmi'] = df_clean['weight'] / df_clean['height'].div(100).pow(2)

# BMI group: 1 below 18.5, 2 in [18.5, 25), 3 in [25, 30), 4 from 30 upwards;
# rows matching none of the rules get 0
df_clean['bmiGrp'] = np.select(
    [
        df_clean['bmi'].lt(18.5),
        df_clean['bmi'].ge(18.5) & df_clean['bmi'].lt(25),
        df_clean['bmi'].ge(25) & df_clean['bmi'].lt(30),
        df_clean['bmi'].ge(30),
    ],
    [1, 2, 3, 4],
    default=0,
)

# Blood-pressure group from ap_hi / ap_lo. np.select takes the first matching
# condition, so the rules are listed from highest group to lowest; this gives the
# same precedence as applying them in ascending order and letting later ones win.
df_clean['bp'] = np.select(
    [
        # group 5
        df_clean['ap_hi'].gt(180) | df_clean['ap_lo'].gt(120),
        # group 4
        df_clean['ap_hi'].ge(140) | df_clean['ap_lo'].ge(90),
        # group 3
        (df_clean['ap_hi'].ge(130) & df_clean['ap_hi'].lt(140))
        | (df_clean['ap_lo'].ge(80) & df_clean['ap_lo'].lt(90)),
        # group 2
        df_clean['ap_hi'].ge(120) & df_clean['ap_hi'].lt(130) & df_clean['ap_lo'].lt(80),
        # group 1
        df_clean['ap_hi'].lt(120) & df_clean['ap_lo'].lt(80),
    ],
    [5, 4, 3, 2, 1],
    default=0,
)



In [5]:
# Full Model (kept for reference)
# X_cols = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

# New Feature Model: the engineered bmiGrp / bp columns stand in for
# height, weight, ap_hi and ap_lo
X_cols = [
    'age', 'gender', 'bmiGrp', 'bp', 'cholesterol',
    'gluc', 'smoke', 'alco', 'active',
]

# Feature matrix
X = df_clean.loc[:, X_cols]
# Response vector
y = df_clean.loc[:, 'cardio']



# Create Models (50 points)

Create a logistic regression model and a support vector machine model for the classification task involved with your dataset. Assess how well each model performs (use 80/20 training/testing split for your data). Adjust parameters of the models to make them more accurate. If your dataset size requires the use of stochastic gradient descent, then linear kernel only is fine to use. That is, the SGDClassifier is fine to use for optimizing logistic regression and linear support vector machines. For many problems, SGD will be required in order to train the SVM model in a reasonable timeframe. 



In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Estimators
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC



## Logistic Regression

Model consisting of all original and new features with standardized values. The RobustScaler used below scales features using statistics that are robust to outliers.

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html

In [None]:


# Stratified 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1
)

# Fit the scaler on the training split only, then apply it to both splits
rs = RobustScaler()
rs.fit(X_train)
X_train_std = rs.transform(X_train)
X_test_std = rs.transform(X_test)

# Alternatives tried during development:
# sc = StandardScaler()
# X_train_std = sc.fit_transform(X_train)
# X_test_std = sc.transform(X_test)

# si = SimpleImputer(strategy="median")
# X_train_std = si.fit_transform(X_train_std)
# X_test_std = si.transform(X_test_std)

# Logistic regression on the scaled training data
logreg = LogisticRegression(n_jobs=-1, random_state=1).fit(X_train_std, y_train)

# Two side-by-side panels: confusion matrix (left) and ROC curve (right)
fig = plt.figure(num=1, figsize=(20, 5))
chart_1, chart_2 = (fig.add_subplot(1, 2, position) for position in (1, 2))

# Evaluate the fitted model on the held-out test split
plot_confusion_matrix(logreg, X_test_std, y_test, normalize='true', ax=chart_1)
chart_1.set_title('Confusion Matrix')

plot_roc_curve(logreg, X_test_std, y_test, ax=chart_2)
chart_2.set_title('ROC Curve')

plt.show()


# Interpret Feature Importance (30)

Use the weights from logistic regression to interpret the importance of different features for the classification task. Explain your interpretation in detail. Why do you think some variables are more important?

In [None]:
# Pair each coefficient with its feature name and order the pairs by weight
# (ties fall back to the feature name), then print them
zip_vars = sorted(
    zip(logreg.coef_.T, X.columns),
    key=lambda pair: (pair[0][0], pair[1]),
)
for coef, name in zip_vars:
    print(name, 'has weight of', round(coef[0], 3))

In [None]:
# Bar chart of the logistic-regression weights, largest first.

plt.style.use('ggplot')

# Build the Series straight from the coefficient row and sort it once, descending.
# (A dict -> OrderedDict -> Series round trip with two sorts yields the same Series.)
weights = pd.Series(logreg.coef_[0, :], index=X.columns).sort_values(ascending=False)

# Title and axis labels so the figure can be read on its own
weights.plot(kind='bar', title='Logistic regression weights')
plt.xlabel('Feature')
plt.ylabel('Weight')
plt.show()


In [None]:
# Reduced model: age, BMI group, blood-pressure group and one-hot encoded cholesterol.
# No split is made before the feature set is chosen: a split taken from the previous X
# would be overwritten below before anything used it.

# *dcrouthamel - Begin Section
# New Feature Model
# NOTE: X_cols and X are rebound here to the reduced feature set, so later cells that
# need other columns must select them from df_clean rather than from X.
X_cols = ['age', 'bmiGrp', 'bp', 'cholesterol']

# Store feature matrix
X = df_clean[X_cols]
# Store response vector
y = df_clean['cardio']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)

# One-hot encode cholesterol; dtype=int keeps the dummy columns numeric
X_train = pd.get_dummies(X_train, columns=['cholesterol'], dtype=int)
X_test = pd.get_dummies(X_test, columns=['cholesterol'], dtype=int)
# Encoding the two splits separately can produce different dummy columns if a level is
# absent from one split, so force the test frame onto the training columns (missing -> 0).
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
# *dcrouthamel - End Section

# Scale with statistics learned from the training split only
rs = RobustScaler()
X_train_std = rs.fit_transform(X_train)
X_test_std = rs.transform(X_test)

# get_dummies is used above instead of OneHotEncoder:
# ohe = OneHotEncoder()
# X_train_std = ohe.fit_transform(X_train_std)
# X_test_std = ohe.transform(X_test_std)

logreg = LogisticRegression(n_jobs=-1, random_state=1)
logreg.fit(X_train_std, y_train)




In [None]:
# Print each weight next to its dummy-encoded column name, in column order (not sorted)
zip_vars = zip(logreg.coef_.T, X_train.columns)  # pair coefficients with column names
for coef, name in zip_vars:
    print(name, 'has weight of', round(coef[0], 3))

    

In [None]:

# Bar chart of the weights, labelled with the dummy-encoded training columns
weights = pd.Series(data=logreg.coef_[0], index=X_train.columns)
weights.plot.bar()
plt.show()



In [None]:
# Predict labels for the scaled test split with the logistic regression fitted above
y_pred = logreg.predict(X_test_std)

In [None]:
# Classification report of the predictions against the true test labels
print(classification_report(y_test, y_pred))

In [None]:
# 3-fold cross-validated ROC AUC of the logistic regression on the scaled training split
scores = cross_val_score(logreg, X_train_std, y_train,
                         scoring="roc_auc", cv=3)

def display_scores(scores):
    """Print the scores, their mean and their standard deviation, rounded to 3 decimals.

    `scores` must provide `.round()`, `.mean()` and `.std()` (e.g. a NumPy array).
    Nothing is returned.
    """
    summary = (
        ("Scores:", scores.round(3)),
        ("Mean:", scores.mean().round(3)),
        ("Standard deviation:", scores.std().round(3)),
    )
    for label, value in summary:
        print(label, value)

# Summarise the cross-validation scores computed above
display_scores(scores)

In [None]:
# Column lists for the preprocessing pipeline.
#
# Full Model alternative (not active):
#     num_attribs = ["age", "height", "weight", "ap_hi", "ap_lo"]
#     cat_attribs = ["gender", "cholesterol", "gluc", "smoke", "alco", "active"]
#
# New Features model (active): columns sent through the numeric pipeline ...
num_attribs = [
    "age",
    "bmiGrp",
    "bp",
]
# ... and columns that are one-hot encoded
cat_attribs = [
    "gender",
    "cholesterol",
    "gluc",
    "smoke",
    "alco",
    "active",
]



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report


# Numeric preprocessing: median imputation, then RobustScaler
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('rbs_scaler', RobustScaler()),
    # ('std_scaler', StandardScaler()),
])


In [None]:
from sklearn.compose import ColumnTransformer

# Column-wise preprocessing: the numeric pipeline for num_attribs and one-hot
# encoding (first level dropped) for cat_attribs.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(drop="first"), cat_attribs),
    ])

# Take the input columns straight from df_clean instead of from X: an earlier cell
# rebinds X to only ['age', 'bmiGrp', 'bp', 'cholesterol'], which lacks most of
# cat_attribs and would make fit_transform fail on a top-to-bottom run.
# NOTE(review): the transformer is fitted on all rows before the train/test split,
# so the imputation/scaling statistics also see the rows that end up in the test set.
X_prepared = full_pipeline.fit_transform(df_clean[num_attribs + cat_attribs])

# Model Advantages (10)

Does one type of model offer superior performance over another in terms of prediction accuracy? In terms of training time or efficiency? Explain in detail.


In [None]:
# Stratified 80/20 split of the pipeline-prepared feature matrix
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y, stratify=y, test_size=0.2, random_state=1)


In [None]:
# Logistic regression with C=2, fitted on the training split of the prepared features
logreg = LogisticRegression(n_jobs=-1, C=2, random_state=1)
logreg.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split
y_pred = logreg.predict(X_test)


In [None]:
# Classification report for the logistic regression predictions
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix
# Keep the result under its own name: binding it to `confusion_matrix` would shadow the
# imported sklearn function and make this cell fail when it is run a second time.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve

# Two side-by-side panels: confusion matrix (left) and ROC curve (right)
fig = plt.figure(num=1, figsize=(20, 5))
chart_1, chart_2 = (fig.add_subplot(1, 2, position) for position in (1, 2))

# Row-normalised confusion matrix of the logistic regression on the test split
plot_confusion_matrix(logreg, X_test, y_test, normalize='true', ax=chart_1)
chart_1.set_title('Confusion Matrix')

# ROC curve of the same model on the same split
plot_roc_curve(logreg, X_test, y_test, ax=chart_2)
chart_2.set_title('ROC Curve')

plt.show()

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.linear_model import SGDClassifier
# from sklearn.svm import LinearSVC
# from sklearn.svm import SVC

# Candidate estimators and the hyper-parameter grids explored by the grid-search cell.
# Each entry maps a label to {"model": estimator, "params": grid}.
# Every estimator gets random_state=1, matching the other models in this notebook,
# so that repeated searches give the same scores.
model_params = {
    "sgd": {
        "model": SGDClassifier(random_state=1),
        "params": {
            "alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1],
            "class_weight": ["balanced", None]
        }
    },
    "logistic_regression": {
        "model": LogisticRegression(random_state=1),
        "params": {
            "C": [1, 2, 5, 10, 15, 20]
        }
    },
    "linear_svc": {
        "model": LinearSVC(random_state=1),
        "params": {
            "C": [1, 2, 5, 10, 15, 20],
            "class_weight": ["balanced", None]
        }
    },
    # Kernel SVC grid, currently disabled:
    # "svc": {
    #     "model": SVC(),
    #     "params": {
    #         "C": [1, 10, 100, 1000],
    #         "kernel": ["rbf"],
    #         "gamma": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    #     }
    # },
}



In [None]:
# Optional hyper-parameter search, run only when RUN_GRID_SEARCH_CV is True.
if RUN_GRID_SEARCH_CV:
    from sklearn.model_selection import GridSearchCV

    scores = []
    for model_name, mp in model_params.items():
        # 10-fold grid search scored by ROC AUC, fitted on the training split
        clf = GridSearchCV(
            estimator=mp["model"],
            param_grid=mp["params"],
            scoring="roc_auc",
            cv=10,
            n_jobs=-1,
        ).fit(X_train, y_train)
        scores.append(dict(
            model=model_name,
            best_score=clf.best_score_,  # mean cross-validated score of the best estimator
            best_params=clf.best_params_,
        ))

    # One row per model with its best score and parameters
    df_grid_search_scores = pd.DataFrame(scores, columns=["model", "best_score", "best_params"])
    print(df_grid_search_scores)


## SGDClassifier

In [None]:

# SGD-trained classifier with alpha=0.001 and balanced class weights; the loss is left at the SGDClassifier default
svm_sgd = SGDClassifier(alpha=0.001, class_weight="balanced", n_jobs=-1, random_state=1) # get object

In [None]:
# Fit the SGD classifier on the training split
svm_sgd.fit(X_train, y_train)  # train object

In [None]:
# Predict test labels with the SGD classifier
y_pred = svm_sgd.predict(X_test)

In [None]:
# Classification report for the SGD classifier predictions
print(classification_report(y_test, y_pred))

## LinearSVC

In [None]:

# Linear SVM with C=15 and no class weighting. random_state=1 matches the other
# estimators in this notebook so that repeated runs give the same fit.
svm_lin = LinearSVC(C=15, class_weight=None, random_state=1)
svm_lin.fit(X_train, y_train)  # train on the training split


In [None]:
# Predict test labels with the linear SVM
y_pred = svm_lin.predict(X_test)

In [None]:
# Classification report for the linear SVM predictions
print(classification_report(y_test, y_pred))

## SVC

In [None]:

# RBF-kernel SVC with explicit C, gamma, balanced class weights and a fixed seed.
# Do not rebind svm_best to a bare SVC() after this line: that would silently discard
# every setting above and leave this assignment as dead code.
svm_best = SVC(kernel='rbf', C=1, gamma=0.1, class_weight="balanced", random_state=1)


In [None]:
# Fit the SVC on the training split
svm_best.fit(X_train, y_train)  # train object

In [None]:
# Predict test labels with the SVC
y_pred = svm_best.predict(X_test)

In [None]:
# Classification report for the SVC predictions
print(classification_report(y_test, y_pred))

In [None]:
# Two side-by-side panels: confusion matrix (left) and ROC curve (right)
fig = plt.figure(num=1, figsize=(20, 5))
chart_1, chart_2 = (fig.add_subplot(1, 2, position) for position in (1, 2))

# Row-normalised confusion matrix of the SVC on the test split
plot_confusion_matrix(svm_best, X_test, y_test, normalize='true', ax=chart_1)
chart_1.set_title('Confusion Matrix')

# ROC curve of the same model on the same split
plot_roc_curve(svm_best, X_test, y_test, ax=chart_2)
chart_2.set_title('ROC Curve')

plt.show()

# Interpret Support Vectors (10)

Look at the chosen support vectors for the classification task. Do these provide any insight into the data? Explain. If you used stochastic gradient descent (and therefore did not explicitly solve for support vectors), try subsampling your data to train the SVC model— then analyze the support vectors from the subsampled dataset.