# 0. Libraries

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# 1. Loading the Dataset

In [None]:
# Location of the CSV file to analyse.
# TODO: replace this placeholder with the real dataset path before running;
# the original cell left the assignment empty, which is a syntax error.
file_path = "dataset.csv"
_data = pd.read_csv(file_path)

# 2. Quick dataset summary

## 2.0. Checking for missing data

In [None]:
# Count the missing (null) entries in each column of the dataset.
_data.isnull().sum()

## 2.1. Checking for the dimension of the dataset

In [None]:
# Show the dataset dimensions as a (rows, columns) tuple.
_data.shape

## 2.2. Statistical summary using .describe()

In [None]:
# Descriptive statistics of the dataset's columns, as produced by describe().
_data.describe()

## 2.3. Checking for the distribution of each class that we are trying to predict

In [None]:
# Number of rows for each value of the target column 'class'.
_data.groupby('class').size()
# Alternative that yields the same per-class counts:
#     class_counts = _data['class'].value_counts()
#     print(class_counts)

# 3. Exploring data with visualization

## 3.0. Visualizing the distribution

In [None]:
# Bar chart of the number of rows per 'class' value (the title is left empty),
# with enlarged axis labels.
nameplot = _data['class'].value_counts().plot.bar(title='')
nameplot.set_xlabel('class',size=20)
nameplot.set_ylabel('count',size=20)

## 3.1. Box and Whisker plot

In [None]:
# Horizontal box-and-whisker plot of the dataset's features in a single wide,
# short figure.
_data.plot(kind='box', vert=False, figsize=(10, 2), title = 'Boxplot of numerical features')
plt.show()

## 3.2. Histogram

In [2]:
# Histograms of the dataset's columns; these show which attributes look
# approximately normally distributed.
_data.hist()
plt.show()

## 3.3. Multivariate scatter plot

In [None]:
# Pair plot of the features, coloured by 'class', to visualize the pair-wise
# relationships in the dataset.
sns.set(style="ticks")
sns.pairplot(_data, hue="class")

# 4. Data Modeling

## 4.0 Train-Test Split

In [None]:
# Separate the target column 'class' from the predictor columns.
Y = _data['class']
X = _data.drop(columns=['class'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=7,
)

## 4.1 Models Building

In [None]:
# Candidate classifiers, each paired with a short label used in the report
# and in the comparison plot.
models = [
    # linear models
    # multi_class is not passed: "auto" is already the library default and
    # the keyword is deprecated in recent scikit-learn releases.
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    # nonlinear models
    ('CART', DecisionTreeClassifier()),
    ('KNN', KNeighborsClassifier()),
    ('GNB', GaussianNB()),
    ('SVC', SVC(gamma="auto")),
]

# 10-fold cross validation, shared by every model so all of them are scored
# on the same folds. shuffle=True is required for random_state to have any
# effect; scikit-learn raises a ValueError when a seed is given without
# shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

# Evaluate each model in turn, keeping the per-fold scores for later plotting.
print("Model Accuracy:")
names = []
accuracy = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')

    # Record and display the cross-validation results of the current model.
    names.append(name)
    accuracy.append(cv_results)
    msg = "%s: accuracy=%f std=(%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the cross-validation accuracies, one box per model. Models whose
# accuracy shows only a small deviation are the best candidates so far.

# Pair each model name with its collection of cross-validation scores.
list_of_tuples = list(zip(names, accuracy))

# Expand to one row per (model, score) and make the score column numeric so
# that it can be plotted.
df = (
    pd.DataFrame(list_of_tuples, columns=['names', 'accuracy'])
    .explode('accuracy')
    .astype({'accuracy': 'float'})
)

ax = sns.boxplot(x='names', y='accuracy', data=df)
ax.set_title('Model Accuracy Comparison')

In [None]:
# Shortlist of the good models, to be run against the test data so their
# accuracy and confusion matrix can be compared for the final model selection.
models = [
    ('model1', KNeighborsClassifier()),
    ('model2', SVC(gamma="auto")),
]

In [None]:
# Reusable helper for the testing step: scores a model with scikit-learn's
# accuracy score, confusion matrix and classification report.
def test_model(model):
    """Fit *model* on the training split and print its test-set results.

    The model is fitted in place on the notebook-level ``X_train`` /
    ``Y_train`` and then used to predict ``X_test``. The accuracy, the
    confusion matrix and the classification report, all computed against
    ``Y_test``, are printed. Nothing is returned.
    """
    # Train on the whole training set, then predict the held-out test set.
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    # Output the model testing results.
    test_accuracy = accuracy_score(Y_test, y_pred)
    print("Accuracy:", test_accuracy)

    print("Confusion Matrix:")
    matrix = confusion_matrix(Y_test, y_pred)
    print(matrix)

    print("Classification Report:")
    report = classification_report(Y_test, y_pred)
    print(report)