#TODO
1. sprawdzić zbalansowanie datasetów; poprawić w razie konieczności
2. klasyfikacja; wybrać klasyfikatory i puścić na datasetach
3. syntetyczny zbiór danych???

In [349]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from utilities import one_hot
from L1PCA import l1pca

from sklearn.decomposition import PCA, SparsePCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

from utilities import one_hot

In [350]:
def getLabels(path):
    return np.genfromtxt(path, delimiter=",")[1:, -1]
    
def standardize_dataset(path):
    data = pd.read_csv(path)
    features = list(data.columns[:-1])
    x = data.loc[:, features].values
    x = StandardScaler().fit_transform(x)
    return x


In [351]:
def l1pca_transform(path, n):
    data = pd.read_csv(path)
    features = list(data.columns[:-1])
    x = data.loc[:, features].values
    principal_components = l1pca(x, n)
    return principal_components

def pca_transform(path, n):
    n = int(n)
    standardized_data = standardize_dataset(path)
    pca = PCA(n_components=n)
    principal_components = pca.fit_transform(standardized_data)
    #print(f'EXPLAINED VARIANCE RATIO:\t{np.around(pca.explained_variance_ratio_, decimals=3)}')
    print(f'INFORMATION LOSS:\t\t{1-sum(pca.explained_variance_ratio_):.3}')
    return principal_components

def kernel_pca_transform(path, n):
    standardized_data = standardize_dataset(path)
    kernel_pca = KernelPCA(n_components=n)
    principal_components = kernel_pca.fit_transform(standardized_data)
    return principal_components

def sparse_pca_transform(path, n):
    standardized_data = standardize_dataset(path)
    sparse_pca = SparsePCA(n_components=n)
    principal_components = sparse_pca.fit_transform(standardized_data)
    return principal_components

def lda_transform(path, n):
    data = pd.read_csv(path)
    labels = data['label'].to_numpy()
    standardized_data = standardize_dataset(path)
    lda = LinearDiscriminantAnalysis(n_components=n)
    principal_components = lda.fit_transform(standardized_data, labels)
    return principal_components

In [352]:
SYNTHETIC_PATH1 = "../data/synthetic1.csv"
SYNTHETIC_PATH2 = "../data/synthetic2.csv"
SYNTHETIC_PATH3 = "../data/synthetic3.csv"

BREAST_PATH = '../data/breast.csv'
HR_PATH = '../data/human_resources.csv'
SONAR_PATH = '../data/sonar.csv'
SPAM_PATH = '../data/spam.csv'
SMOKING_PATH = '../data/smoking.csv'

dataset_paths = [SYNTHETIC_PATH1, SYNTHETIC_PATH2, SYNTHETIC_PATH3] #SMOKING_PATH, SONAR_PATH, HR_PATH]

methods = {
    'pca': pca_transform,
    #'kernel_pca': kernel_pca_transform,
    'sparse_pca': sparse_pca_transform,
    'lda': lda_transform
} # l1pca

clfs = {
    'GNB': GaussianNB(),
    'SVM': SVC(),
    'kNN': KNeighborsClassifier(),
    'CART': DecisionTreeClassifier(random_state=1410),
}

In [353]:
def genSyntheticDataset(n_features, n_informative, n_redundant, n_repeated, n_classes, dataset_id=None):
    X, y = datasets.make_classification(
        n_samples=100,
        n_features=n_features,
        n_redundant=n_redundant,
        n_informative=n_informative,
        n_repeated=n_repeated,
        random_state=1410,
        n_classes=n_classes,
        n_clusters_per_class=1
    )
    dfX = pd.DataFrame(X, columns=[f'f{i}' for i in range(1,X.shape[1] + 1)])
    dfX = np.round(dfX, decimals= 5)
    dfY = pd.DataFrame(y, columns=['label'])

    df = pd.concat([dfX, dfY], axis = 1)
    df.to_csv('../data/synthetic'+str(dataset_id)+".csv", index = False)

# X = pca_transform(SYNTHETIC_PATH, 15)
# y = getLabels(SYNTHETIC_PATH)

In [354]:
"""
Spam base dataset
"""
def init_spam():
    dataset_name = "spam"
    spam_features = ['word_freq_make','word_freq_address','word_freq_all','word_freq_3d','word_freq_our','word_freq_over','word_freq_remove','word_freq_internet','word_freq_order','word_freq_mail','word_freq_receive','word_freq_will','word_freq_people','word_freq_report','word_freq_addresses','word_freq_free','word_freq_business','word_freq_email','word_freq_you','word_freq_credit','word_freq_your','word_freq_font','word_freq_000','word_freq_money','word_freq_hp','word_freq_hpl','word_freq_george','word_freq_650','word_freq_lab','word_freq_labs','word_freq_telnet','word_freq_857','word_freq_data','word_freq_415','word_freq_85','word_freq_technology','word_freq_1999','word_freq_parts','word_freq_pm','word_freq_direct','word_freq_cs','word_freq_meeting','word_freq_original','word_freq_project','word_freq_re','word_freq_edu','word_freq_table','word_freq_conference','char_freq_;','char_freq_(','char_freq_[','char_freq_!','char_freq_$','char_freq_#','capital_run_length_average','capital_run_length_longest','capital_run_length_total','label']
    spam_dataset = pd.read_csv(f"../raw_data/{dataset_name}.csv", header=None)
    spam_dataset.columns=spam_features
    spam_dataset.to_csv(SPAM_PATH, index = False)

"""
Sonar dataset
"""
def init_sonar():
    dataset_name = "sonar"
    sonar_dataset = pd.read_csv(f"../raw_data/{dataset_name}.csv", names = range(0,61))
    sonar_dataset.rename(columns={60:'label'}, inplace=True)
    # sonar_dataset['label'].replace(0, 'R',inplace=True)
    # sonar_dataset['label'].replace(1, 'M',inplace=True)
    sonar_dataset.to_csv(SONAR_PATH, index = False)

"""
Smoking dataset
"""
def init_smoking():
    smoking_dataset = pd.read_csv(f"../raw_data/smoking.csv")
    smoking_dataset = smoking_dataset.drop(['ID'], axis=1)
    smoking_dataset_label = smoking_dataset.pop('smoking')
    smoking_dataset = one_hot(smoking_dataset, 'gender')
    smoking_dataset = one_hot(smoking_dataset, 'oral')
    smoking_dataset.rename(columns={'Y':'oral'}, inplace=True)
    smoking_dataset = one_hot(smoking_dataset, 'tartar')
    smoking_dataset.rename(columns={'Y':'tartar'}, inplace=True)
    smoking_dataset.rename(columns={'N':'no_tartar'}, inplace=True)
    smoking_dataset.insert(len(smoking_dataset.columns), 'label', smoking_dataset_label)
    smoking_dataset.to_csv(SMOKING_PATH, index = False)
    

"""
Human resources dataset
"""
def init_hr():
    dataset_name = "human_resources"
    hr_dataset = pd.read_csv(f"../raw_data/{dataset_name}.csv")
    hr_labels = hr_dataset.pop('left')
    hr_dataset.rename(columns={'sales':'department'}, inplace=True)
    hr_dataset = one_hot(hr_dataset, 'salary')
    hr_dataset = one_hot(hr_dataset, 'department')
    hr_dataset.insert(len(hr_dataset.columns), 'label', hr_labels)
    hr_dataset.to_csv(HR_PATH, index = False)
    
"""
Breast dataset
"""
def init_breast():
    breast_dataset = pd.DataFrame(pd.read_csv(f"../raw_data/breast.csv"))
    breast_labels = breast_dataset.pop('diagnosis')
    breast_dataset.drop(columns=(['id', 'Unnamed: 32']), inplace=True)
    breast_dataset.insert(len(breast_dataset.columns), 'label', breast_labels)
    breast_dataset['label'].replace('B', 0,inplace=True)
    breast_dataset['label'].replace('M', 1,inplace=True)
    breast_dataset.to_csv(BREAST_PATH, index = False)


In [355]:
# init_hr()
# init_smoking()
# init_breast()
# init_spam()
# init_sonar()
genSyntheticDataset(n_features=20, n_informative=2, n_redundant=2, n_repeated=2, n_classes=4, dataset_id=1)
genSyntheticDataset(n_features=40, n_informative=5, n_redundant=10, n_repeated=5, n_classes=10, dataset_id=2)
genSyntheticDataset(n_features=60, n_informative=10, n_redundant=20, n_repeated=10, n_classes=2, dataset_id=3)

In [356]:

def draw_plots(dimensions):

    if(dimensions==3):
        subplot_kw = dict(projection='3d')
    else:
        subplot_kw = None
    
    fig, axs = plt.subplots(len(dataset_paths), len(methods), subplot_kw=subplot_kw, figsize=(25,25))

    for row, path in enumerate(dataset_paths):
        dataset = pd.read_csv(path)
        targets = dataset[dataset.columns[-1]].unique()
        colors = ['r', 'b', 'g']

        lda_max_dims = len(targets) - 1

        for column, method in enumerate(methods):
            if row == 0:
                axs[row,column].set_title(method)

            print(f"{method} --- --- --- {path}")

            if method == 'lda':
                df_dims = lda_max_dims
            else:
                df_dims = dimensions

            components = ([f'principal component {i+1}' for i in range(df_dims)])
			
            dataframe = pd.DataFrame(data = methods[method](path, df_dims), columns=components)
            
            for (target, color) in zip(targets,colors):

                plt.text(0, 1, path, ha='left', va='top', transform=axs[row, column].transAxes)

                indicesToKeep = pd.read_csv(path)[dataset.columns[-1]] == target
                if df_dims == 1:
                    x = dataframe.loc[indicesToKeep, 'principal component 1']
                    axs[row, column].scatter(x = x, y = np.zeros_like(x), c = color, s = 40)
                elif df_dims == 2:
                    axs[row, column].scatter(x = dataframe.loc[indicesToKeep, 'principal component 1'], y = dataframe.loc[indicesToKeep, 'principal component 2'], c = color, s = 40)
                elif df_dims == 3:
                    axs[row, column].scatter(xs = dataframe.loc[indicesToKeep, 'principal component 1'], ys = dataframe.loc[indicesToKeep, 'principal component 2'], zs = dataframe.loc[indicesToKeep, 'principal component 3'], c = color, s = 40)
    
    ax_args = {chr(ord('x')+i) + 'label' : 'Principal Component ' + str(i+1) for i in range(0, dimensions)}
    for ax in axs.flat:
        ax.set(**ax_args)

In [357]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import accuracy_score


n_datasets = len(dataset_paths)
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits)

scores = np.zeros((len(clfs), n_datasets, n_splits))

In [369]:
def calculate_scores(dataset_paths, method=None, f_ratio=None):
	folder_location = '../scores/'
	filename = "raw" if method == None and f_ratio == None else method+str(int(f_ratio*100))
	save_path = folder_location+filename

	for data_id, dataset in enumerate(dataset_paths):
		if method != None and f_ratio != None:
			df = pd.read_csv(dataset).to_numpy()
			n_features = df.shape[1]
			print("PRINCIPAL COMPONENTS:\t\t", int(f_ratio*n_features))
			X=methods[method](dataset, int(f_ratio*n_features))
			y=getLabels(dataset)
		else:
			dataset = pd.read_csv(dataset).to_numpy()
			X = dataset[:, :-1]
			y = dataset[:, -1].astype(int)
		for fold_id, (train, test) in enumerate(skf.split(X, y)):
			for clf_id, clf_name in enumerate(clfs):
				clf = clone(clfs[clf_name])
				clf.fit(X[train], y[train])
				y_pred = clf.predict(X[test])
				scores[clf_id, data_id, fold_id] = accuracy_score(y[test], y_pred)
				
	np.save(save_path, scores)

In [370]:
# original scores; no dimensionality reduction
calculate_scores(dataset_paths=dataset_paths)
print("20 TOTAL FEATURES")
calculate_scores(dataset_paths=dataset_paths, method="pca", f_ratio=0.25)
print("40 TOTAL FEATURES")
calculate_scores(dataset_paths=dataset_paths, method="pca", f_ratio=0.50)
print("60 TOTAL FEATURES")
calculate_scores(dataset_paths=dataset_paths, method="pca", f_ratio=0.75)

20 TOTAL FEATURES
PRINCIPAL COMPONENTS:		 5
INFORMATION LOSS:		0.449
PRINCIPAL COMPONENTS:		 10
INFORMATION LOSS:		0.275
PRINCIPAL COMPONENTS:		 15
INFORMATION LOSS:		0.162
40 TOTAL FEATURES
PRINCIPAL COMPONENTS:		 10
INFORMATION LOSS:		0.184
PRINCIPAL COMPONENTS:		 20
INFORMATION LOSS:		0.0582
PRINCIPAL COMPONENTS:		 30
INFORMATION LOSS:		3.75e-13
60 TOTAL FEATURES
PRINCIPAL COMPONENTS:		 15
INFORMATION LOSS:		0.0199
PRINCIPAL COMPONENTS:		 30
INFORMATION LOSS:		1.69e-13
PRINCIPAL COMPONENTS:		 45
INFORMATION LOSS:		2.44e-14


In [360]:
scores = np.load('results.npy')
print("\nScores:\n", scores.shape)
mean_scores = np.mean(scores, axis=2).T
print("\nMean scores:\n", mean_scores)

FileNotFoundError: [Errno 2] No such file or directory: 'results.npy'

In [None]:
from scipy.stats import rankdata
ranks = []
for ms in mean_scores:
    ranks.append(rankdata(ms).tolist())
ranks = np.array(ranks)
print(f"Ranks:\n{ranks}")
mean_ranks = np.mean(ranks, axis=0)
print(f"\nModels:\t\t{[i for i in clfs]}")
print(f"Mean ranks:\t{mean_ranks}\n")

Ranks:
[[2. 1. 3. 4.]
 [4. 1. 2. 3.]]

Models:		['GNB', 'SVM', 'kNN', 'CART']
Mean ranks:	[3.  1.  2.5 3.5]



In [None]:
from scipy.stats import ttest_rel

alfa = .05
t_statistic = np.zeros((len(clfs), len(clfs)))
p_value = np.zeros((len(clfs), len(clfs)))

for i in range(len(clfs)):
    for j in range(len(clfs)):
        t_statistic[i, j], p_value[i, j] = ttest_rel(ranks.T[i], ranks.T[j])

In [None]:
from tabulate import tabulate

headers = list(clfs.keys())
names_column = np.expand_dims(np.array(list(clfs.keys())), axis=1)
t_statistic_table = np.concatenate((names_column, t_statistic), axis=1)
t_statistic_table = tabulate(t_statistic_table, headers, floatfmt=".2f")
p_value_table = np.concatenate((names_column, p_value), axis=1)
p_value_table = tabulate(p_value_table, headers, floatfmt=".2f")
print("\nt-statistic:\n", t_statistic_table, "\n\np-value:\n", p_value_table)


t-statistic:
          GNB     SVM     kNN     CART
----  ------  ------  ------  -------
GNB   nan       2.00    0.33    -0.33
SVM    -2.00  nan      -3.00    -5.00
kNN    -0.33    3.00  nan     -inf
CART    0.33    5.00  inf      nan 

p-value:
          GNB     SVM     kNN    CART
----  ------  ------  ------  ------
GNB   nan       0.30    0.80    0.80
SVM     0.30  nan       0.20    0.13
kNN     0.80    0.20  nan       0.00
CART    0.80    0.13    0.00  nan


In [None]:
advantage = np.zeros((len(clfs), len(clfs)))
advantage[w_statistic > 0] = 1
advantage_table = tabulate(np.concatenate(
    (names_column, advantage), axis=1), headers)
print("\nAdvantage:\n", advantage_table)

NameError: name 'w_statistic' is not defined

In [None]:
significance = np.zeros((len(clfs), len(clfs)))
significance[p_value <= alfa] = 1
significance_table = tabulate(np.concatenate(
    (names_column, significance), axis=1), headers)
print("\nStatistical significance (alpha = 0.05):\n", significance_table)


Statistical significance (alpha = 0.05):
         GNB    SVM    kNN    CART
----  -----  -----  -----  ------
GNB       0      0      0       0
SVM       0      0      0       0
kNN       0      0      0       1
CART      0      0      1       0
