## Notebook Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from lightgbm import LGBMRegressor
from lofo import LOFOImportance, Dataset, plot_importance

import xgboost as xgb

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras import layers

import warnings

In [None]:
# Silence every warning for the rest of the session. No category or module
# filter is given, so this also hides deprecation and convergence warnings
# raised by the libraries used in this notebook.
warnings.filterwarnings("ignore")

#### Loading Dataset

In [None]:
# Read the training table from the CSV file (path is relative to the
# notebook's working directory) into a DataFrame.
train = pd.read_csv('./data/breastCancerTraining.csv')

# Preview the first rows.
train.head()

In [None]:
# Column names, non-null counts, dtypes and memory usage.
train.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments pandas reports the numeric columns.
train.describe()

In [None]:
# Number of rows in the training table.
len(train)

There do not seem to be any missing or 'na' values in this dataset.

In [None]:
# Count of missing (null/NaN) entries in each column.
train.isnull().sum()

In [None]:
# Q. How many features are there? What are they?
# Note: the total below is the number of columns in `train`, so it includes
# every column currently in the frame, not only the predictors.

print(f'Total Features: {len(train.columns)}')
# Display the column labels themselves.
train.columns

In [None]:
# Remove the 'id' and 'count' columns before any analysis.
# NOTE(review): presumably these are bookkeeping columns with no predictive
# meaning — confirm against the data dictionary.
# errors='ignore' keeps this cell re-runnable: `train` is overwritten by its own
# output, so a second execution would otherwise raise KeyError because the
# columns are already gone.
train = train.drop(columns=['id', 'count'], errors='ignore')

In [None]:
# Correlation Heatmap

# Pairwise column correlations, rounded to two decimals for the annotations.
corr = train.corr().round(2)

# Boolean mask that hides the upper triangle (diagonal included), so each
# pair of columns is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(20, 15))
heatmap = sns.heatmap(
    corr,
    mask=mask,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap=sns.cubehelix_palette(as_cmap=True),
    ax=ax,
)
heatmap.set_title('Heatmap', fontdict={'fontsize':10}, pad=18);

Importance Plot

In [None]:
# Set up a LOFO (leave-one-feature-out) importance run for the 'cancer' target.

# Model and scoring: LightGBM regressor scored by negated mean squared error
# (greater_is_better=False flips the sign so that higher is better).
lgbm = LGBMRegressor(random_state=0, n_jobs=1)
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 8-fold shuffled cross-validation with a fixed seed.
cv = KFold(n_splits=8, shuffle=True, random_state=1)

# Every column except the target is a candidate feature; the evaluation runs
# on a seeded 40% sample of the rows.
feature_cols = [name for name in train.columns if name != "cancer"]
train_subset = train.sample(frac=0.4, random_state=1)
dataset = Dataset(df=train_subset, target="cancer", features=feature_cols)

lofo_imp = LOFOImportance(dataset, cv=cv, scoring=scorer, model=lgbm)

In [None]:
# Run the leave-one-feature-out evaluation configured in `lofo_imp` and keep
# the resulting importance table.
importance_df = lofo_imp.get_importance()

In [None]:
# Plot the importance table with lofo's plotting helper.
plot_importance(importance_df, figsize=(10, 5))

In [None]:
# Column subset used as model inputs in the cells below.
# NOTE(review): presumably chosen from the LOFO importance results — confirm.
feat = ['invasive', 'race', 'bmi', 'Hispanic']

In [None]:
# Build the model inputs: select features/target, split, then standardize.
#
# The split happens BEFORE scaling so that the scaler's mean/std are learned
# from the training rows only; fitting it on the full table would leak
# test-set statistics into training.
X = train[feat]
y = train['cancer']

# Fixed seed so the split (and every score below) is reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=1
)

scaler = StandardScaler()

# Keep the scaled data as DataFrames (same index and column names) so the
# downstream cells receive the same kind of object as before. The transformed
# values are assigned back to the splits so the models actually use them.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

In [None]:
def rmse(a, b):
    """Return the root-mean-squared error between ``a`` and ``b``.

    The difference is taken with ``np.subtract``, so the usual NumPy
    broadcasting rules apply: the two inputs should have matching shapes,
    otherwise the difference is broadcast before averaging.
    """
    residuals = np.subtract(a, b)
    mean_squared = np.square(residuals).mean()
    return np.sqrt(mean_squared)

#### Model1: LogisticRegression

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the
# training split (fit() returns the estimator itself).
logreg = LogisticRegression().fit(X_train, y_train)

# Hard class-label predictions for the held-out rows.
y_preds_logreg = logreg.predict(X_test)

In [None]:
# RMSE between the true labels and the predicted class labels.
rmse(y_test, y_preds_logreg)

#### Model2: RandomForestClassifier

In [None]:
# A deliberately tiny forest: two trees, each limited to a single split,
# with a fixed seed. fit() returns the estimator itself.
rf = RandomForestClassifier(
    n_estimators=2,
    max_depth=1,
    random_state=24,
).fit(X_train, y_train)

# Hard class-label predictions for the held-out rows.
y_preds_rf = rf.predict(X_test)

In [None]:
# RMSE between the true labels and the forest's predicted class labels.
rmse(y_test, y_preds_rf)

#### Model3: XGBoost (squared-error regression objective)

In [None]:
# Wrap each split (features plus labels) in XGBoost's DMatrix container.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [None]:
# Booster settings: squared-error regression objective with histogram-based
# tree construction.
# "hist" runs on CPU, so the notebook executes on machines without a CUDA
# device; the former "gpu_hist" value needs a GPU build and is deprecated
# since XGBoost 2.0. To train on a GPU with XGBoost >= 2.0, keep "hist" and
# add "device": "cuda".
params = {"objective": "reg:squarederror", "tree_method": "hist"}

In [None]:
# Number of boosting rounds (trees added one after another).
n = 100

# Train the booster on the training DMatrix with the settings in `params`.
model_xgb = xgb.train(params, dtrain_reg, num_boost_round=n)

In [None]:
# Booster predictions for the test DMatrix.
y_preds_xgb = model_xgb.predict(dtest_reg)

In [None]:
# RMSE between the true labels and the booster's predictions.
rmse(y_test, y_preds_xgb)

#### Model4: ANN

In [None]:
# Small fully connected network: 4 -> 8 -> 2 -> 1 units.
ann = Sequential()

# Two ReLU hidden layers.
ann.add(layers.Dense(4, activation='relu'))
ann.add(layers.Dense(8, activation='relu'))

# Narrow sigmoid hidden layer, then a single sigmoid output unit.
ann.add(layers.Dense(2, activation='sigmoid'))
ann.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Configure training: Adagrad optimizer (selected by name, so with its default
# settings) minimizing binary cross-entropy; no additional metrics are tracked.
ann.compile(optimizer='Adagrad', loss='binary_crossentropy')

In [None]:
# Train the network with fit()'s default arguments (no epochs or batch size
# are passed).
# NOTE(review): Keras defaults to a single epoch, which is likely too short
# for this model to learn much — confirm this is intended.
ann.fit(X_train, y_train)

# Raw outputs of the final sigmoid unit for the test rows; they are not
# thresholded into class labels.
y_preds_ann = ann.predict(X_test)

In [None]:
# The network ends in a single unit, so its predictions form one column.
# Turn the true labels into a NumPy column vector of the same layout so the
# subtraction inside rmse() is element-wise rather than broadcast.
# .to_numpy() is used instead of np.reshape(y_test, ...) because reshaping a
# pandas Series through NumPy relies on an array-wrapping fallback whose
# behaviour depends on the installed NumPy/pandas versions.
rmse(y_test.to_numpy().reshape(-1, 1), y_preds_ann)