In [None]:
# basic imports
import pandas as pd
import numpy as np
import lib.mySqlManager as mySqlManager

# from sklearn.datasets import load_breast_cancer

# import scikit-learn modules / ML algorithms
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# visualization
from tabulate import tabulate
from IPython.display import display, HTML
from matplotlib import pyplot as plt
import seaborn as sns

# test imports
import pymysql
from sqlalchemy import create_engine


In [None]:
# read dataframe from datasets directory
# cancer_df = pd.read_csv("/src/datasets/breast_cancer_data.csv")
# print dataset shape, head and tail in order to understand the data structure
# print("Shape of breast_cancer data:", cancer_df.shape)
# display(cancer_df.head())
# display(cancer_df.tail())

In [None]:
# set up mySQL database helper.
# myhost = 'mysql'

# def setUpMySql(engine):
#     myDB = "breastcancer"
#     engine.execute("CREATE DATABASE IF NOT EXISTS {};".format(myDB)) #create db
#     # read dataframe from datasets directory
#     cancer_df = pd.read_csv("/src/datasets/breast_cancer_data.csv")
#     updloadTableFromDF(cancer_df,"breastdataset",False)
#     print("Done")

# def updloadTableFromDF(df,table_name, index=True):
#     con_str = 'mysql+pymysql://root:root@{0}:3306/{1}'.format(myhost, "breastcancer")
#     engine = create_engine(con_str)
#     df.to_sql(name = table_name, con = engine, if_exists = 'replace',index=index)
#     print("Done")

# def fetchBreastData():
#     con_str = 'mysql+pymysql://root:root@{0}:3306/{1}'.format(myhost, "breastcancer")
#     con = create_engine(con_str)
#     df = pd.read_sql("SELECT * FROM breastcancer.breastdataset", con=con)
#     return df

# print("Im mySQL manager")
# con_str = 'mysql+pymysql://root:root@{0}:3306'.format(myhost)
# engine = create_engine(con_str)
# # set up mySQL
# mySqlManager.setUpMySql(engine)  

In [None]:
# Load the breast-cancer table from the MySQL database via the project helper.
cancer_df = mySqlManager.fetchBreastData()

# Report the shape, then preview the first and last rows to understand
# the structure of the data.
print("Shape of breast_cancer data:", cancer_df.shape)
display(cancer_df.head(), cancer_df.tail())

In [None]:
# From the preview above, the dataset consists of 569 samples, each described
# by 33 columns.
# Column overview:
# 1) `id` holds the identifier of the sample
# 2) `diagnosis` holds the label of the sample, i.e. the information we want to
#    predict by combining what we extract from the remaining features
# 3) `Unnamed: 32` appears to contain many NaN values
# 4) the remaining 30 features help determine the label of each sample

#%% Pie chart
# value_counts() sorts by frequency, so the slice labels are derived from its
# index instead of being hard-coded in an assumed order.
diagnosis_share = cancer_df['diagnosis'].value_counts(normalize=True) * 100
diagnosis_names = {'B': 'Benign', 'M': 'Malignant'}
labels = [diagnosis_names.get(code, str(code)) for code in diagnosis_share.index]
sizes = diagnosis_share.to_numpy()
# explode: how far each slice is pulled away from the others (0 keeps it
# attached); only the first slice is detached.
explode = tuple(0.2 if position == 0 else 0 for position in range(len(sizes)))
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
plt.title('Percentage Of Diagnosis Variable Occurencies')
plt.show()

#  horizontal bar chart with the absolute number of samples per diagnosis
M = cancer_df[cancer_df['diagnosis'] != 'B']
B = cancer_df[cancer_df['diagnosis'] == 'B']

y_pos = np.arange(2)

fig, ax = plt.subplots()
ax.barh(y_pos, [len(M), len(B)], align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(['malignant', 'benign'])
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_title('Diagnosis Variable Occurencies')

plt.show()


In [None]:
"""
 Check the dataset for missing values:
 1) create a new df containing only boolean values representing whether each value is NaN or not (True or False)
 2) create the not-null df, which contains the number of non-null values per feature
 3) display this df as a bar graph.
"""
# Boolean mask of missing cells, then the per-feature count of present values
# (total rows minus the number of nulls in each column).
isNullDf = cancer_df.isnull()
total_rows = isNullDf.shape[0]
isNotNullDf = pd.DataFrame(total_rows - isNullDf.sum(), columns=['Count'])

# Bar plot of the non-null counts, one bar per feature.
feature_names = isNotNullDf.index
plt.figure(figsize=(15, 8))  # width: 15, height: 8
plt.bar(feature_names, isNotNullDf['Count'], color='green', align='edge', width=0.5)
# label the x axis with the feature names, rotated to vertical
plt.xticks(feature_names, feature_names, rotation='vertical')
plt.xlabel("Feature Name")
plt.ylabel("No. of not null values")
plt.title("Not Null Values Per Feature")


In [None]:
"""
# Notes:
 From the graph of step 3 we can see that no feature has missing values except Unnamed: 32,
 where every value in the column is missing.

 Besides that, the id column does not give us any information.

 So we drop the Unnamed: 32 and id columns from cancer_df and store the result in a new df named cancerData.

 We also replace the values of the diagnosis column with 0 and 1.
"""

# Drop the identifier column and the all-NaN 'Unnamed: 32' column.
cancerData = cancer_df.drop(columns=['id', 'Unnamed: 32'])

# Encode the label as integers: malignant -> 1, benign -> 0.
# The result is assigned back explicitly: an in-place replace on the column
# selection (`cancerData['diagnosis'].replace(..., inplace=True)`) is a chained
# mutation, which pandas warns about and which has no effect on the frame under
# Copy-on-Write. Any value other than "M"/"B" maps to NaN and makes the integer
# cast fail loudly instead of leaking a mixed-type label column downstream.
cancerData['diagnosis'] = cancerData['diagnosis'].map({"M": 1, "B": 0}).astype('int64')

display(cancerData.head())




In [None]:
# Feature analysis on the cleaned dataset:
# 1) display a dataframe describing cancerData (count, mean, std, ...)
# 2) display a clustered correlation map of the features
display(cancerData.describe())

# Cluster correlation diagram: shows how the features correlate with each
# other, together with the result of applying hierarchical clustering to
# those correlations.
corrMatrix = cancerData.corr()
display(corrMatrix.head())
# clustermap is a figure-level function that creates its own figure, so the
# size is passed through `figsize`; a preceding plt.figure(...) call would only
# render an additional empty figure and leave the cluster map at default size.
sns.clustermap(corrMatrix, annot=False, fmt=".2f", figsize=(30, 20))
plt.show()


In [None]:
"""
From the diagram above we observe that there is a very large amount of information, so in order to
understand the correlations better we apply the same procedure with a threshold of 0.7:
we keep the features whose correlation with the diagnosis feature is greater than 0.7
in absolute value.
"""
threshold = 0.7

# Rows: features whose absolute correlation with `diagnosis` exceeds the
# threshold; columns: restricted to that same set of features.
corr_diagnosis = corrMatrix[corrMatrix['diagnosis'].abs() > threshold]
corr_diagnosis = corr_diagnosis[corr_diagnosis.index]
display(corr_diagnosis)

# Pair plot: a matrix of scatter plots showing how the selected features relate
# to each other, coloured by diagnosis.
# pairplot is a figure-level function that builds its own figure, so no
# plt.figure(...) call precedes it (that would only emit an empty figure).
sns.pairplot(cancerData[corr_diagnosis.index], markers="x", hue="diagnosis")
plt.show()


In [None]:
"""
    Data preprocessing for the application of PCA.
    1) we remove the diagnosis column from cancerData and store it in a separate array (label)
    2) we create a new dataframe that contains our features normalized
    3) we rename the features, because the features produced by PCA (the components)
    contain information from all of the original features

"""

# step 1: keep the target column aside as a plain array
label = cancerData['diagnosis'].to_numpy()
print(label)

# step 2: standardise every remaining column (the features)
x = StandardScaler().fit_transform(cancerData.drop(columns=['diagnosis']))
print("\nShape of normalized data:", x.shape)

# check whether the normalized data set has mean 0 and standard deviation 1
print("\nPrints mean:", np.mean(x), " and Standard deviation of normalized dataset: ", np.std(x))

# step 3: generic feature names (feature0, feature1, ...) for the scaled frame
feature_columns = ['feature{}'.format(i) for i in range(x.shape[1])]
normalized_dataFrame = pd.DataFrame(x, columns=feature_columns)

display(normalized_dataFrame.head())

In [None]:
"""
    Apply PCA
    We use this function to apply the PCA algorithm to a normalized data set
    and return a DataFrame with the data produced by applying PCA.
    By default PCA reduces the dimensionality of the data to 2 and the DataFrame
    has the following column labels: 'principal component 1', 'principal component 2'
"""


def applyPCA(normalizedData, dimensions=2, dataframeLabels=None):
    """
    Project a normalized data set onto its principal components.

    Prints the explained variance ratio of each component and their sum.

    :param normalizedData: the (already normalized) feature matrix handed to
        PCA.fit_transform
    :param dimensions: value passed to PCA as n_components (default 2)
    :param dataframeLabels: column labels of the returned DataFrame. When
        omitted, labels 'principal component 1', 'principal component 2', ...
        are generated to match the number of components actually produced,
        so a call such as applyPCA(x, dimensions=3) works without passing
        labels explicitly.
    :return: DataFrame holding the transformed data, one column per component
    """
    pca = PCA(n_components=dimensions)
    pca_data = pca.fit_transform(normalizedData)

    print("Explained variation per principal component:", format(pca.explained_variance_ratio_))
    print("Total variation: ", sum(pca.explained_variance_ratio_))

    if dataframeLabels is None:
        # derive the labels from the transformed data rather than from a fixed
        # two-item default, which only fitted dimensions == 2
        dataframeLabels = ['principal component {}'.format(i + 1)
                           for i in range(pca_data.shape[1])]

    pca_data_dataFrame = pd.DataFrame(data=pca_data, columns=dataframeLabels)
    return pca_data_dataFrame



In [None]:
"""
    Using PCA to project the 30 dimensions of the dataset onto 2 and 3 dimensions.
    To do this we use the PCA class of the sklearn library
    and pass to PCA, as an argument, the number of dimensions we want to project our data onto,
    in our case 2 and 3. Finally we use fit_transform to apply PCA to the
    normalized data computed in the previous step (x, not the normalized_dataFrame we created for
    visualization purposes).
"""

# apply PCA for 2 dimensions (the function's defaults)
pca_data_dataFrame = applyPCA(x)

# show the head of the 2D PCA data set for testing purposes
print("-------------- PRINT PCA DATA SET HEAD -------------------")
display(pca_data_dataFrame.head())


# apply PCA for 3 dimensions, with one column label per component
print("\napply PCA to reduce 30 to 3 dimensions")
pca_data_3D_dataFrame = applyPCA(
    x,
    dimensions=3,
    dataframeLabels=['principal component {}'.format(k) for k in (1, 2, 3)],
)

# show the head of the 3D PCA data set for testing purposes
print("\n-------------- PRINT PCA DATA SET HEAD -------------------")
display(pca_data_3D_dataFrame.head())



In [None]:
"""
  We plot the representation of the data produced by applying PCA to the original data set
"""

# ---- 2D PCA scatter: one colour per diagnosis value ----
plt.figure(figsize=(10, 7))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.xlabel('Principal Component - 1', fontsize=20)
plt.ylabel('Principal Component - 2', fontsize=20)
plt.title("Principal Component Analysis of Breast Cancer Dataset", fontsize=20)

colors = ['r', 'g']
targets = cancerData['diagnosis'].unique()
for target, color in zip(targets, colors):
    # boolean mask of the samples carrying this diagnosis value
    indicesToKeep = cancerData['diagnosis'] == target
    class_points = pca_data_dataFrame.loc[indicesToKeep]
    plt.scatter(class_points['principal component 1'],
                class_points['principal component 2'], c=color, s=50)

# legend entries are the raw diagnosis values of cancer_df, in order of appearance
plt.legend(cancer_df.diagnosis.unique(), prop={'size': 15})
plt.show()

# ---- 3D PCA scatter ----
plt.figure(figsize=(10, 7))
ax = plt.axes(projection="3d")
ax.set_xlabel('principal component 1')
ax.set_ylabel('principal component 2')
ax.set_zlabel('principal component 3')

colors = ['r', 'g']
targets = cancerData['diagnosis'].unique()
for target, color in zip(targets, colors):
    indicesToKeep = cancerData['diagnosis'] == target
    class_points = pca_data_3D_dataFrame.loc[indicesToKeep]
    ax.scatter3D(class_points['principal component 1'],
                 class_points['principal component 2'],
                 class_points['principal component 3'], color=color)

plt.title("simple 3D PCA results")

# show plot
plt.show()

In [None]:
def kFoldCV(df, label, classifiers, n_splits=10):
    """
    Helper that runs plain k-fold cross validation for every classifier entry.

    :param df: DataFrame with the features (the x dataset of cross validation)
    :param label: array-like with the targets (the y dataset), positionally
        aligned with the rows of df
    :param classifiers: a list of dictionaries in the following format
     {
        "name": "KNN",
        "classifier": KNeighborsClassifier(),
        "train_scores": [],
        "acc_scores": []
    }
    :param n_splits: number of folds; defaults to 10
    :return: the same classifiers list; for every fold one value is appended
        to each entry's "train_scores" and "acc_scores"
    """
    kf = KFold(n_splits=n_splits, random_state=None)
    # plain array so that the labels are always indexed by position
    y = np.asarray(label)

    for train_index, test_index in kf.split(df):
        # split() yields integer positions, hence .iloc: label-based .loc only
        # works by coincidence when df carries a default 0..n-1 index
        X_train, X_test = df.iloc[train_index], df.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # fit every classifier on the train part and score it on both parts
        for c in classifiers:
            model = c["classifier"]
            model.fit(X_train, y_train)
            c["train_scores"].append(model.score(X_train, y_train))
            c["acc_scores"].append(model.score(X_test, y_test))

    return classifiers


def stratifiedKFoldCV(df, label, classifiers, n_splits=10):
    """
    Helper that runs stratified k-fold cross validation for every classifier entry.

    :param df: DataFrame with the features (the x dataset of cross validation)
    :param label: array-like with the targets (the y dataset), positionally
        aligned with the rows of df; also handed to the splitter
    :param classifiers: a list of dictionaries in the following format
     {
        "name": "KNN",
        "classifier": KNeighborsClassifier(),
        "train_scores": [],
        "acc_scores": []
    }
    :param n_splits: number of folds; defaults to 10
    :return: the same classifiers list; for every fold one value is appended
        to each entry's "train_scores" and "acc_scores"
    """
    skf = StratifiedKFold(n_splits=n_splits, random_state=None)
    # plain array so that the labels are always indexed by position
    y = np.asarray(label)

    # df is the feature set and y is the target used for stratification
    for train_index, test_index in skf.split(df, y):
        # split() yields integer positions, hence .iloc: label-based .loc only
        # works by coincidence when df carries a default 0..n-1 index
        X_train, X_test = df.iloc[train_index], df.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # fit every classifier on the train part and score it on both parts
        for c in classifiers:
            model = c["classifier"]
            model.fit(X_train, y_train)
            c["train_scores"].append(model.score(X_train, y_train))
            c["acc_scores"].append(model.score(X_test, y_test))
    return classifiers



In [None]:
"""
    We split our dataset into 2 parts, a train and a test sample.
    To make the outcome of the procedure more representative we use
    the k-fold cross-validation technique, so that our model is applied to enough possible
    train/test combinations for us to be confident that our results are representative.
"""

# Estimators to compare. Every entry keeps the per-fold scores that the
# cross-validation helpers append to.
# NOTE(review): LinearRegression is a regressor; its .score() is the R^2
# coefficient, not classification accuracy, so its numbers are not directly
# comparable with the other rows - confirm whether it should stay in this list.
estimators = [
    ("KNN", KNeighborsClassifier()),
    ("LogisticRegression", LogisticRegression()),
    ("LinearRegression", LinearRegression()),
    ("DecisionTreeClassifier", DecisionTreeClassifier()),
    ("GaussianNB", GaussianNB()),
]
classifiers = [
    {
        "name": estimator_name,
        "classifier": estimator,
        "train_scores": [],
        "acc_scores": []
    }
    for estimator_name, estimator in estimators
]

# this list will contain results in dict format in order to display them into dataframe.
results = []

# preview only; the full frame is not needed to follow the analysis
display(pca_data_dataFrame.head())

# bind the returned list to `classifiers` (it is the same list, updated in
# place) instead of a stray, never-read `classifier` variable
classifiers = kFoldCV(pca_data_dataFrame, label, classifiers)

# compute the mean of each classifier's scores over the 10-fold cross validation
for c in classifiers:
    mean_train_score = np.mean(c["train_scores"])
    mean_test_score = np.mean(c["acc_scores"])
    print("Results for classifier {}: training score: {:.4f}, Accuracy: {:.4f} using 2 Principal Components and simple kfold \n".format(c["name"], mean_train_score, mean_test_score))
    results.append({
        "algorithm": c["name"],
        "test": "2D-PCA",
        "folds": "K-Fold",
        "train_score": mean_train_score,
        "test_score": mean_test_score
    })



In [None]:
"""
We repeat the same procedure using stratified k-fold cross validation
for all the prediction models we implemented.
"""

# start from empty score lists so earlier runs do not leak into this one
for entry in classifiers:
    entry.update(acc_scores=[], train_scores=[])

classifiers = stratifiedKFoldCV(pca_data_dataFrame, label, classifiers)

# average each classifier's scores over the 10 folds and record them
for entry in classifiers:
    mean_train_score = np.mean(entry["train_scores"])
    mean_test_score = np.mean(entry["acc_scores"])
    print("Results for classifier {}: training score: {:.4f}, Accuracy: {:.4f} using 2 Principal Components and stratified K-fold \n".format(entry["name"], mean_train_score, mean_test_score))
    results.append(dict(
        algorithm=entry["name"],
        test="2D-PCA",
        folds="stratified-K-Fold",
        train_score=mean_train_score,
        test_score=mean_test_score,
    ))



In [None]:
"""
We repeat the above procedure for the 3 principal components of PCA.
"""

# reset classifiers: start from empty score lists for this run
for c in classifiers:
    c["acc_scores"] = []
    c["train_scores"] = []

# bind the returned list to `classifiers` (it is the same list, updated in
# place) instead of a stray, never-read `classifier` variable
classifiers = kFoldCV(pca_data_3D_dataFrame, label, classifiers)

# compute the mean of each classifier's scores over the 10-fold cross validation
for c in classifiers:
    mean_train_score = np.mean(c["train_scores"])
    mean_test_score = np.mean(c["acc_scores"])
    print("Results for classifier {}: training score: {:.4f}, Accuracy: {:.4f} using 3 Principal Components and simple kfold \n".format(c["name"], mean_train_score, mean_test_score))
    results.append({
        "algorithm": c["name"],
        "test": "3D-PCA",
        "folds": "K-Fold",
        "train_score": mean_train_score,
        "test_score": mean_test_score
    })


In [None]:
"""
We repeat the same procedure using stratified k-fold cross validation
for all the prediction models we implemented.
"""

# start from empty score lists so earlier runs do not leak into this one
for entry in classifiers:
    entry.update(acc_scores=[], train_scores=[])

classifiers = stratifiedKFoldCV(pca_data_3D_dataFrame, label, classifiers)

# average each classifier's scores over the 10 folds and record them
for entry in classifiers:
    mean_train_score = np.mean(entry["train_scores"])
    mean_test_score = np.mean(entry["acc_scores"])
    print("Results for classifier {}: training score: {:.4f}, Accuracy: {:.4f} using 3 Principal Components and stratified K-fold \n".format(entry["name"], mean_train_score, mean_test_score))
    results.append(dict(
        algorithm=entry["name"],
        test="3D-PCA",
        folds="stratified-K-Fold",
        train_score=mean_train_score,
        test_score=mean_test_score,
    ))

In [None]:
"""
We repeat the procedure, this time keeping the features whose correlation is greater than 0.7 in absolute value.
1) We kept the features that have a correlation greater than 0.7 with the target feature.
To achieve this we used the feature correlations computed when displaying the heatmap and isolated
from the main dataset the features that satisfy the condition.
2) We then normalized the data prepared in the previous step and gave it as input to the K-fold algorithm in order
to compute the prediction score.
"""

# keep only the features selected by the correlation threshold, without the target
selected_features = cancerData[corr_diagnosis.index].drop(columns=['diagnosis'])

# standardise them and wrap the result in a frame with generic column names
scaled_features = StandardScaler().fit_transform(selected_features)
feature_columns = ['feature{}'.format(i) for i in range(scaled_features.shape[1])]
data = pd.DataFrame(scaled_features, columns=feature_columns)

# start from empty score lists so earlier runs do not leak into this one
for entry in classifiers:
    entry.update(acc_scores=[], train_scores=[])

classifiers = kFoldCV(data, label, classifiers)

# average each classifier's scores over the 10 folds and record them
for entry in classifiers:
    mean_train_score = np.mean(entry["train_scores"])
    mean_test_score = np.mean(entry["acc_scores"])
    print("Results for classifier {}: training score: {:.4f}, Accuracy: {:.4f} using features with correlation greater than 0.7 with class feature and normal K-fold \n".format(entry["name"], mean_train_score, mean_test_score))
    results.append(dict(
        algorithm=entry["name"],
        test="Custom DR",
        folds="K-Fold",
        train_score=mean_train_score,
        test_score=mean_test_score,
    ))

In [None]:
# start from empty score lists so earlier runs do not leak into this one
for entry in classifiers:
    entry.update(acc_scores=[], train_scores=[])

classifiers = stratifiedKFoldCV(data, label, classifiers)

# average each classifier's scores over the 10 folds and record them
for entry in classifiers:
    mean_train_score = np.mean(entry["train_scores"])
    mean_test_score = np.mean(entry["acc_scores"])
    print("Results for classifier {}: training score: {:.4f}, Accuracy: {:.4f} using features with correlation greater than 0.7 with class feature and stratified K-fold \n".format(entry["name"], mean_train_score, mean_test_score))
    results.append(dict(
        algorithm=entry["name"],
        test="Custom DR",
        folds="stratified-K-Fold",
        train_score=mean_train_score,
        test_score=mean_test_score,
    ))

In [None]:
# NOTE(review): `folds` and `tests` are not read anywhere in this cell, and
# their values differ from the strings stored in `results` - confirm whether
# they are still needed.
folds = ["K-Fold", "Stratified K-Fold"]
tests = ["2D-PCA", "3D-PCA", "CUSTOM-DIM-REDUCTION"]

# One row per recorded result, indexed by (folds, test, algorithm).
classifiers_df = pd.DataFrame.from_dict(results).set_index(['folds', 'test', "algorithm"])

display(classifiers_df)

In [None]:
# Flatten the (folds, test, algorithm) MultiIndex back into ordinary columns.
# Guarded so the cell is safe to re-run: resetting an already-flat frame again
# would insert a spurious "index" column that then ends up in the uploaded table.
if isinstance(classifiers_df.index, pd.MultiIndex):
    classifiers_df = classifiers_df.reset_index()
display(classifiers_df)
# UPLOAD CLASSIFICATION RESULTS TO MYSQL
mySqlManager.updloadTableFromDF(classifiers_df, "classifiers_results")
