In [None]:
# Import the libraries used in this notebook.
# The dataset is the breast cancer dataset that ships with scikit-learn.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the breast cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()

# Feature matrix: one row per sample, one column per feature.
features = cancer['data']
n_samples, n_features = features.shape

# Per-feature statistics are computed column-wise (axis=0) so that each number
# really counts *features*, matching the label it is printed next to.
n_features_missing = np.isnan(features).any(axis=0).sum()
# A column is constant when its max and min coincide (peak-to-peak range is 0).
n_features_constant = (np.ptp(features, axis=0) == 0).sum()
# Columns that are exact copies of another column.
n_features_duplicated = n_features - np.unique(features, axis=1).shape[1]
# Columns that contain at least one zero entry.
n_features_zero = (features == 0).any(axis=0).sum()

# Print a short summary of the dataset.
print(
    f"Databasen innehåller information om bröstcancer.\n\n"
        f"Database filename är {cancer['filename']}\n"
        f"Vi importerar databasen från den {cancer['data_module']} module\n\n"
    f"Database summary:\n"
        f"Number of samples: {n_samples}\n"
        f"Number of features: {n_features}\n"
        f"Number of classes: {cancer['target_names'].shape[0]}\n"
        f"Number of missing values: {np.isnan(features).sum()}\n"
        f"Number of unique values: {np.unique(features).shape[0]}\n"
        f"Number of features with missing values: {n_features_missing}\n"
        f"Number of features with constant values: {n_features_constant}\n"
        f"Number of features with duplicated values: {n_features_duplicated}\n"
        f"Number of features with zero values: {n_features_zero}\n"
)

# Print the full description text that ships with the dataset.
print()
print(cancer['DESCR'])

# Build the DataFrame: the feature columns plus a 'target' column holding the
# class name of each sample.
# The features and the labels are added separately on purpose: stacking them
# into a single array (e.g. with np.c_) would cast every number to a string,
# leaving all columns with object dtype.
df_cancer = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
df_cancer['target'] = cancer['target_names'][cancer['target']]

In [None]:
# Preview the first five rows of the DataFrame.
df_cancer.head(n=5)

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df_cancer.info()

In [None]:
# Show summary statistics for the columns of the DataFrame.
df_cancer.describe()

In [None]:
# Count the missing values in each column.
df_cancer.isna().sum()

In [None]:
# Standardize the feature columns (everything except 'target') with
# scikit-learn's StandardScaler.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

feature_frame = df_cancer.drop(columns=['target'])
scaled_values = scaler.fit_transform(feature_frame)
df_cancer_scaled = pd.DataFrame(scaled_values, columns=df_cancer.columns[:-1])

# Re-attach the untouched 'target' column to the right of the scaled features.
df_cancer = pd.concat([df_cancer_scaled, df_cancer['target']], axis=1)

In [None]:
# Preview the first rows of the DataFrame after scaling.
df_cancer.head()

In [None]:
# Collect the names of the columns to convert to float: every column except
# the last one ('target').
culumn = df_cancer.columns[:-1].tolist()
print(culumn)


In [None]:
# Convert the selected columns to numeric dtype, one column at a time.
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
df_cancer[culumn] = df_cancer[culumn].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

In [None]:
# Print the column dtypes again to check the result of the numeric conversion.
df_cancer.info()

In [None]:
# Pairs must correlate more strongly than this (in absolute value) to be kept.
MIN_ABS_CORR = 0.85

# Pearson correlation between the numeric columns only. The string 'target'
# column has to be left out explicitly: pandas >= 2.0 raises on non-numeric
# columns instead of silently dropping them.
corr = df_cancer.select_dtypes(include='number').corr()

# Walk the lower triangle (j < i) so every pair is visited exactly once and
# the diagonal is skipped. Pairs with |r| == 1 are excluded as well.
result = []
for i, name_i in enumerate(corr.columns):
    for j in range(i):
        value = corr.iloc[i, j]
        if MIN_ABS_CORR < abs(value) < 1:
            result.append([name_i, corr.columns[j], value])

# Rank by correlation strength. The filter above uses the absolute value, so
# the ranking must too; otherwise strong negative correlations would end up last.
sorted_corr = sorted(result, key=lambda pair: abs(pair[2]), reverse=True)
print(sorted_corr[:5])

In [None]:
from matplotlib import rcParams

# Plot styling for the figures below.
plt.style.use("ggplot")
rcParams['figure.figsize'] = (12,  6)

# Draw one scatter plot per pair among the five top-ranked correlated pairs,
# coloured by the 'target' class.
for first_name, second_name, _ in sorted_corr[:5]:
    sns.pairplot(
        df_cancer,
        x_vars=first_name,
        y_vars=second_name,
        hue="target",
        height=5,
    )
    plt.show()

# Build the list of columns to use for training the model: every column name
# that appears in one of the five top-ranked correlated pairs.
# The names are gathered in a plain list first. Growing a float-initialised
# array with np.append would copy the array on every step and force a
# fixed-width string dtype that can truncate long column names.
top_pair_columns = [name for pair in sorted_corr[:5] for name in pair[:2]]

# np.unique removes duplicates and returns the names sorted.
temp = np.unique(top_pair_columns)
print(temp)


# Optional overview: heatmap of the full Pearson correlation matrix.
# Disabled by default; set SHOW_HEATMAP = True to draw it.
# A flag is used instead of wrapping the code in a string literal, because a
# bare string as the last expression of a cell is echoed as the cell's output.
SHOW_HEATMAP = False

if SHOW_HEATMAP:
    # Restrict to numeric columns: the string 'target' column cannot be correlated.
    corrmat = df_cancer.select_dtypes(include='number').corr(method="pearson")
    # Tick labels come from the matrix itself so they always match its shape.
    hm = sns.heatmap(corrmat,
                     cbar=True,
                     annot=False,
                     square=True,
                     fmt='.2f',
                     annot_kws={'size': 10},
                     yticklabels=corrmat.columns,
                     xticklabels=corrmat.columns,
                     cmap="Spectral_r")
    plt.show()