# Indian Liver Patient Dataset
Data Spaces (01RLPOV), A.A. 2021/22  <br /> 
Politecnico di Torino - Corso di Laurea Magistrale in Ingegneria Informatica <br /> 
Elisa Cenedese - s255202 <br /> 
Link to data source: https://archive.ics.uci.edu/ml/datasets/ILPD+%28Indian+Liver+Patient+Dataset%29#

## Table of Contents

* [1. Introduction](#introduction)
    * [1.1 Attributes description](#attributes_descr)
    * [1.2 Basic dataset exploration](#basic_dataset_exploration)
    * [1.3 Check for missing values](#check_missing_values)
    * [1.4 Checking for outliers](#check_outliers)
    * [1.5 Split dataset](#split_dataset)
* [2. Exploratory data analysis](#exploratory_data_analysis)
    * [2.1 Statistical quantitative description of features](#stat_features_descr)
    * [2.2 Box plots](#box_plots)
    * [2.3 Correlation analysis](#corr_analysis)
      * [2.3.1 Heatmap](#heatmap)
      * [2.3.2 Dendrogram](#dendogram)

In [None]:
#Mount drive
# Colab-only step: mounts Google Drive under /content/drive, which is the root
# the dataset and model paths of this notebook are built from.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Install
# chart_studio is imported by this notebook for publishing figures online.
!pip install chart_studio
# NOTE(review): kaleido is presumably needed for static plotly image export
# (SHOW_METHOD == 2) — confirm before removing.
!pip install -U kaleido

In [None]:
%matplotlib inline

#Imports
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import itertools
import math
import pickle
from numpy import mean
from numpy import std
#plot libaries
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.figure_factory as ff
import yaml
import pprint

from matplotlib.colors import ListedColormap
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, f1_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn import tree, svm, linear_model, ensemble, neighbors 
from sklearn.svm import LinearSVC
from sklearn import preprocessing
#from plotly.offline import init_notebook_mode
#init_notebook_mode(connected=True)
# online plotly
import chart_studio
# SECURITY: the Chart Studio API key must never be stored in the notebook
# source. It is read from the CHART_STUDIO_API_KEY environment variable; the
# user name can be overridden with CHART_STUDIO_USERNAME. Any key that was ever
# committed together with this file should be regenerated.
# When no key is provided the credentials file is left untouched, so an
# already configured environment keeps working and offline rendering
# (SHOW_METHOD 0 or 2) is unaffected.
_chart_studio_api_key = os.environ.get('CHART_STUDIO_API_KEY')
if _chart_studio_api_key:
    chart_studio.tools.set_credentials_file(
        username=os.environ.get('CHART_STUDIO_USERNAME', 'elisa_c'),
        api_key=_chart_studio_api_key,
    )
#chart_studio.tools.set_config_file(world_readable=False, sharing='secret')
import chart_studio.plotly as py
from scipy.cluster import hierarchy as hc

In [None]:
#Paths
# All project locations are derived from the project folder on the mounted drive.
DATA_FILE_NAME = "indian_liver_patient_dataset.csv"

ROOT_DIRPATH = os.path.join(
    '/content',
    'drive',
    'MyDrive',
    'DataSpaces',
    'DataSpaces_project',
)

# Folder holding the input dataset.
DATA_DIRPATH = os.path.join(
    ROOT_DIRPATH,
    'data',
)

# Folder for models produced by the notebook.
MODELS_DIRPATH = os.path.join(
    ROOT_DIRPATH,
    'models',
)

data_file_path = os.path.join(DATA_DIRPATH, DATA_FILE_NAME)

# exist_ok=True replaces the separate existence check: one call, no window
# between "check" and "create", and re-running the cell is still harmless.
os.makedirs(MODELS_DIRPATH, exist_ok=True)

In [None]:
def read_dataset(dirpath: str, file_type: str, sheet_name_=None):
  """Load a tabular dataset from disk with pandas.

  Args:
    dirpath: Path of the file to read (a file path, despite the parameter name).
    file_type: Dataset format, either "xlsx" or "csv". Matching ignores case,
      surrounding whitespace and a leading dot, so ".CSV" is accepted too.
    sheet_name_: Forwarded to ``pd.read_excel`` as ``sheet_name``; ignored for
      csv files. NOTE(review): with the default ``None`` pandas is expected to
      return every sheet rather than a single DataFrame — confirm before
      relying on the default for xlsx input.

  Returns:
    The object produced by ``pd.read_excel`` or ``pd.read_csv``. CSV files are
    parsed as comma separated with the first row as header.

  Raises:
    ValueError: If ``file_type`` is neither "xlsx" nor "csv".
  """
  normalized_type = file_type.strip().lower().lstrip(".")

  if normalized_type == "xlsx":
    return pd.read_excel(dirpath, sheet_name=sheet_name_)

  if normalized_type == "csv":
    with open(dirpath) as in_fp:
      return pd.read_csv(in_fp, sep=',', header=0)

  # ValueError is still an Exception, so existing broad handlers keep working,
  # while the message now names the offending value.
  raise ValueError(
      f"Unrecognised dataset format: {file_type!r} (expected 'csv' or 'xlsx')"
  )

In [None]:
def show_dict(dict_: dict):
    """Pretty-print ``dict_`` to standard output.

    A width of 1 forces the narrowest layout ``pprint`` can produce.
    """
    rendered = pprint.pformat(dict_, width=1)
    print(rendered)

In [None]:
def print_formatted_list(str_, list):
  """Print ``str_`` as a heading followed by one bullet line per item.

  Each item of ``list`` is converted with ``str`` and prefixed by "  - ";
  a blank line closes the listing.
  """
  lines = [str_ + ":"]
  lines.extend("  - " + str(item) for item in list)
  lines.append("")  # trailing blank line after the bullets
  print("\n".join(lines))

In [None]:
def prepare_bar_plot(df, title, template_):
  """Build a bar chart of how often each distinct value occurs in ``df``.

  Args:
    df: Object exposing ``value_counts()`` (the notebook passes the class column).
    title: Figure title.
    template_: Plotly template name applied to the layout.

  Returns:
    Tuple ``(figure, counts)`` where ``counts`` is the ``value_counts()``
    result that was plotted.
  """
  counts = df.value_counts()
  palette = plotly.colors.DEFAULT_PLOTLY_COLORS

  # One bar per distinct value, coloured with the first palette entries.
  bar_trace = go.Bar(
      x=counts.index,
      y=counts.values,
      marker=dict(color=palette[:len(counts.index)]),
  )
  figure_layout = go.Layout(
      title=title,
      template=template_,
      autosize=False,
      width=400,
      height=400,
      yaxis=dict(title='#samples'),
  )
  return go.Figure(data=[bar_trace], layout=figure_layout), counts

In [None]:
def prepare_histogram_plot(X, features_names, class_names, config_dict, template_, per_feature=True):
  """Build a histogram figure with a slider that switches between features.

  Traces are laid out as one group of ``len(features_names)`` histograms per
  class (in ``class_names`` order) followed by one final group named 'all'
  built from every row. Each slider step shows the traces of one feature.

  Args:
    X: DataFrame holding the feature columns and the target column.
    features_names: Columns to plot; one slider step is created per entry.
    class_names: Target values; one histogram per (class, feature) pair.
    config_dict: Mapping providing 'TARGET_COLUMN_NAME'.
    template_: Plotly template name applied to the layout.
    per_feature: When True the first two classes are drawn and every other
      trace of the active feature (including 'all') is only listed in the
      legend. When False only the 'all' trace is drawn and all class traces
      are legend-only.

  Returns:
    The assembled ``go.Figure``.
  """
  start_pos = 0  # feature shown when the figure is first rendered
  target_column = config_dict['TARGET_COLUMN_NAME']
  num_features = len(features_names)
  num_classes = len(class_names)

  def _class_visibility(class_pos):
    # Only the first two classes are drawn by default in per-feature mode.
    if per_feature and class_pos in (0, 1):
      return True
    return 'legendonly'

  all_visibility = 'legendonly' if per_feature else True

  data = []
  for class_pos, target in enumerate(class_names):
    class_rows = X[target_column] == target
    for feature_pos, feature in enumerate(features_names):
      visible = _class_visibility(class_pos) if feature_pos == start_pos else False
      data.append(go.Histogram(x=X[feature].loc[class_rows], name=target, visible=visible))

  for feature_pos, feature in enumerate(features_names):
    visible = all_visibility if feature_pos == start_pos else False
    data.append(go.Histogram(x=X[feature], name='all', visible=visible))

  steps = []
  for i in range(num_features):
    # Start with everything hidden, then enable the traces of feature i.
    visibility = [False] * len(data)
    for class_pos in range(num_classes):
      visibility[i + class_pos * num_features] = _class_visibility(class_pos)
    # The 'all' group sits after the per-class groups. Its offset comes from
    # the number of classes, not from a variable left over by an earlier loop,
    # so it stays correct even when class_names is empty.
    visibility[i + num_classes * num_features] = all_visibility

    steps.append(dict(
        method='restyle',
        args=['visible', visibility],
        label=features_names[i],
    ))

  sliders = [dict(
      active=start_pos,  # from where to start the slider
      currentvalue=dict(
          prefix="Feature: ",
          xanchor='center',
      ),
      pad={"t": 50},
      steps=steps,
      len=1,
  )]

  layout = dict(
      sliders=sliders,
      template=template_,
      yaxis=dict(
          title='#Samples',
          automargin=True,
      ),
  )

  return go.Figure(data=data, layout=layout)

In [None]:
def prepare_box_plot(X, features_names, class_names, config_dict, template_):
  """Build a box-plot figure with a slider that switches between features.

  For every class in ``class_names`` one box trace per feature is created
  (class-major order); each slider step reveals the boxes of all classes
  for a single feature.

  Args:
    X: DataFrame holding the feature columns and the target column.
    features_names: Columns to plot; one slider step per entry.
    class_names: Target values; one box per (class, feature) pair.
    config_dict: Mapping providing 'TARGET_COLUMN_NAME'.
    template_: Plotly template name applied to the layout.

  Returns:
    The assembled ``go.Figure``.
  """
  active_pos = 0  # feature displayed initially
  target_column = config_dict['TARGET_COLUMN_NAME']
  feature_count = len(features_names)

  traces = []
  for target in class_names:
    for feature_idx, feature in enumerate(features_names):
      traces.append(go.Box(
          y=X[feature].loc[X[target_column] == target],
          name=target,
          visible=(feature_idx == active_pos),
      ))

  steps = []
  for feature_idx in range(feature_count):
    # Everything hidden except the boxes of this feature, one per class.
    shown = [False] * len(traces)
    for class_idx in range(len(class_names)):
      shown[feature_idx + class_idx * feature_count] = True
    steps.append(dict(
        method='restyle',
        args=['visible', shown],
        label=features_names[feature_idx],
    ))

  slider = dict(
      active=active_pos,
      currentvalue=dict(prefix="Feature: ", xanchor='center'),
      pad={"t": 50},
      steps=steps,
      len=1,
  )

  figure_layout = dict(
      sliders=[slider],
      template=template_,
      yaxis=dict(title='Feature value', automargin=True),
  )

  return go.Figure(data=traces, layout=figure_layout)

In [None]:
def prepare_dendogram(X, title_, template_):
  """Build a left-oriented dendrogram of the columns of ``X``.

  The matrix ``1 - |corr(X)|`` is handed to plotly's ``create_dendrogram``
  and the hierarchy is built with average linkage.

  Args:
    X: DataFrame whose columns are the features to cluster.
    title_: Figure title.
    template_: Plotly template name applied to the layout.

  Returns:
    The plotly figure produced by ``create_dendrogram`` with updated layout.
  """
  palette = plotly.colors.DEFAULT_PLOTLY_COLORS
  dissimilarity = 1 - abs(X.corr())

  def _average_linkage(distances):
    return hc.linkage(distances, 'average')

  fig = ff.create_dendrogram(
      dissimilarity.values,
      orientation='left',
      labels=X.columns,
      colorscale=palette,
      linkagefun=_average_linkage,
  )

  layout_overrides = dict(
      title=title_,
      template=template_,
      width=800,
      height=600,
      margin=go.layout.Margin(l=180, r=50),
      xaxis=dict(title='distance'),
      yaxis=dict(title='features', automargin=True),
  )
  fig['layout'].update(layout_overrides)
  return fig

In [None]:
def prepare_heatmap(X, template_):
  """Build a heatmap of the pairwise correlation between the columns of ``X``.

  Args:
    X: DataFrame whose ``corr()`` matrix is plotted.
    template_: Plotly template name applied to the layout.

  Returns:
    The assembled ``go.Figure``.
  """
  correlation = X.corr()
  axis_labels = correlation.columns

  heatmap_trace = go.Heatmap(
      z=correlation.values.tolist(),
      x=axis_labels,
      y=axis_labels,
  )
  figure_layout = go.Layout(
      title='Heatmap of pairwise correlation of the columns',
      autosize=False,
      template=template_,
      width=650,
      height=500,
      yaxis=go.layout.YAxis(automargin=True),
      xaxis=dict(tickangle=40),
      margin=go.layout.Margin(l=80, r=80, b=80, t=80),
  )

  return go.Figure(data=[heatmap_trace], layout=figure_layout)

In [None]:
def prepare_pairplot(X, config_dict, combinations, features_names, template_):
  """Build a scatter-matrix figure with a slider over feature pairs.

  One two-dimensional ``go.Splom`` trace is created per entry of
  ``combinations``; points are coloured by the category code of the target
  column and each slider step shows exactly one pair.

  Args:
    X: DataFrame holding the feature columns and the target column.
    config_dict: Mapping providing 'TARGET_COLUMN_NAME'.
    combinations: Sequence of index pairs into ``features_names``.
    features_names: Feature names addressed by ``combinations``.
    template_: Plotly template name applied to the layout.

  Returns:
    The assembled ``go.Figure``.
  """
  start_pos = 0  # pair displayed initially
  target_values = X[config_dict['TARGET_COLUMN_NAME']]
  index_vals = target_values.astype('category').cat.codes

  # Slider labels: the string form of each (first, second) name tuple.
  combinations_names = [
      str((features_names[couple[0]], features_names[couple[1]]))
      for couple in combinations
  ]

  trace_list = []
  for position, couple in enumerate(combinations):
    first_name = features_names[couple[0]]
    second_name = features_names[couple[1]]
    trace_list.append(go.Splom(
        dimensions=[
            dict(label=first_name, values=X[first_name]),
            dict(label=second_name, values=X[second_name]),
        ],
        diagonal_visible=False,  # remove plots on diagonal
        text=target_values,
        marker=dict(
            color=index_vals,
            showscale=False,  # colors encode categorical variables
            line_color='white',
            line_width=0.5,
        ),
        visible=(position == start_pos),
    ))

  steps = []
  for step_pos in range(len(combinations)):
    # One-hot visibility: only the trace of this pair is shown.
    shown = [False] * len(trace_list)
    shown[step_pos] = True
    steps.append(dict(
        method='restyle',
        args=['visible', shown],
        label=combinations_names[step_pos],
    ))

  slider = dict(
      active=start_pos,
      currentvalue=dict(prefix="Features couple: ", xanchor='center'),
      pad={"t": 50},
      steps=steps,
      len=1,
  )

  figure_layout = dict(
      sliders=[slider],
      template=template_,
      width=600,
      height=600,
  )

  return go.Figure(data=trace_list, layout=figure_layout)

In [None]:
def prepare_PCA_explained_variance_plot(pca, template_, variance_threshold=0.95):
  """Plot individual and cumulative explained variance of a fitted PCA.

  Dashed green guide lines mark the first principal component at which the
  cumulative explained variance ratio reaches ``variance_threshold``. When
  the threshold is never reached no guide lines are drawn.

  Args:
    pca: Fitted object exposing ``explained_variance_ratio_`` (the fraction
      of variance explained by each selected component).
    template_: Plotly template name applied to the layout.
    variance_threshold: Cumulative ratio to highlight; defaults to 0.95.

  Returns:
    The assembled ``go.Figure``.
  """
  explained_variance_ratio = pca.explained_variance_ratio_
  cum_explained_variance_ratio = np.cumsum(explained_variance_ratio)
  # Components are numbered from 1 on the x axis.
  component_ids = list(range(1, len(explained_variance_ratio) + 1))

  trace_ind_var_exp = go.Scatter(
      x=component_ids,
      y=explained_variance_ratio,
      name="Component explained variance",
  )
  trace_cum_var_exp = go.Scatter(
      x=component_ids,
      y=cum_explained_variance_ratio,
      name="Cumulative explained variance",
  )
  layout = go.Layout(
      template=template_,
      title='Individual and Cumulative Explained Variance',
      autosize=True,
      yaxis=dict(
          title='Explained variance ratio',
      ),
      xaxis=dict(
          title="Principal component",
          dtick=1,
      )
  )

  fig = go.Figure(data=[trace_ind_var_exp, trace_cum_var_exp], layout=layout)

  # Mark the first component reaching the threshold. Nothing is drawn when it
  # is never reached, instead of misleading lines at x=0 / y=0.
  for pc_id, value in enumerate(cum_explained_variance_ratio, start=1):
    if value >= variance_threshold:
      fig.add_vline(x=pc_id, line_width=2, line_dash="dash", line_color="green")
      fig.add_hline(y=value, line_width=2, line_dash="dash", line_color="green")
      break

  return fig

In [None]:
def prepare_biplot(pca, X_pca_trasformed, y, features_names, template_):
  """Build a PCA biplot: projected samples plus one loading line per feature.

  Samples are scattered on the first two columns of ``X_pca_trasformed`` and
  coloured by ``y``. Loadings are ``components_.T * sqrt(explained_variance_)``;
  for each feature a line is drawn from the origin to its loading on the
  first two components and labelled with the feature name.

  Args:
    pca: Fitted object exposing ``components_`` and ``explained_variance_``.
    X_pca_trasformed: Data projected on the principal components.
    y: Values used to colour the samples.
    features_names: Feature names, in the row order of the loadings matrix.
    template_: Plotly template name applied to the layout.

  Returns:
    The plotly figure.
  """
  component_scale = np.sqrt(pca.explained_variance_)
  loadings = pca.components_.T * component_scale

  fig = px.scatter(X_pca_trasformed, x=0, y=1, color=y)

  for row, feature in enumerate(features_names):
    tip_x = loadings[row, 0]
    tip_y = loadings[row, 1]
    fig.add_shape(type='line', x0=0, y0=0, x1=tip_x, y1=tip_y)
    fig.add_annotation(
        x=tip_x,
        y=tip_y,
        ax=0,
        ay=0,
        xanchor="center",
        yanchor="bottom",
        text=feature,
    )

  fig.update_layout(
      template=template_,
      width=700,
      height=700,
      legend_title_text='Class',
  )

  return fig

In [None]:
# Notebook-wide settings shared by every later cell.
config_dict = {
    'GENERAL': {
        'SEED': 42,
        'TARGET_COLUMN_NAME': "CLASS",
        'NUMERIC_FEATURES': ['Age','TB','DB','AAP','SGPT','SGOT','TP','ALBA','AGR'],
        'BOOLEAN_FEATURES': ['Gender'],
        'SHOW_METHOD': 0, # 0 for colab, 1 for plotly, 2 for static png figures
        'PERFORM_NCV': True
    },
}

# Dark theme for in-notebook rendering (SHOW_METHOD 0), white theme otherwise.
template_ = 'plotly_white' if config_dict['GENERAL']['SHOW_METHOD'] > 0 else 'plotly_dark'

show_dict(config_dict)

## 1. Introduction<a class="anchor" id="introduction"></a>

### 1.1 Attributes description<a class="anchor" id="attributes_descr"></a>

### 1.2 Basic dataset exploration<a class="anchor" id="basic_dataset_exploration"></a>

In [None]:
dataset_df = read_dataset(data_file_path, file_type= "csv")

# Relabel class 2 as 0 so the target is binary 0/1. The column name comes from
# the shared configuration instead of being repeated as a literal.
dataset_df.loc[dataset_df[config_dict['GENERAL']['TARGET_COLUMN_NAME']] == 2, config_dict['GENERAL']['TARGET_COLUMN_NAME']] = 0

# Encode the Gender strings as integers and report the mapping that was used.
le = preprocessing.LabelEncoder()
dataset_df['Gender'] = le.fit_transform(dataset_df['Gender'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label encoding for Gender feature:")
print(le_name_mapping)

# display() renders the frame itself and returns None, so it must not be
# wrapped in print() (that only adds a stray "None" to the output).
display(dataset_df.head())

X = dataset_df.drop(columns=[config_dict['GENERAL']['TARGET_COLUMN_NAME']])
y = dataset_df[config_dict['GENERAL']['TARGET_COLUMN_NAME']] # series containing class column

features_names = X.columns.values
print_formatted_list("Feature names", features_names)
# sorted() gives a stable class order; trace order and default visibility in
# the plotting helpers depend on it.
class_names = sorted(set(y))
print_formatted_list("Class names", class_names)

fig, data_series = prepare_bar_plot(y, title = "Distribution of samples per class", template_=template_)
print(f'\nNumber of samples = {dataset_df.shape[0]} | Number of features = {len(features_names)}\n')
for value in zip(data_series.values, data_series.index):
    print("%s: %d instances (%.2f%%)" % (value[1], value[0], (value[0]/y.shape[0])*100))
print()

if config_dict['GENERAL']['SHOW_METHOD'] == 0:
  fig.show()
elif config_dict['GENERAL']['SHOW_METHOD'] == 1:
  py.iplot(fig, filename = 'bar_plot')
elif config_dict['GENERAL']['SHOW_METHOD'] == 2:
  fig.show(renderer='svg')

### 1.3 Check for missing values<a class="anchor" id="check_missing_values"></a>
The cell below counts the missing values of each feature. Missing `AGR` values, if any, are then replaced with the mean `AGR` of the sample's class.

In [None]:
# Count the null values of every feature, then summarise AGR per class
# (minimum, maximum and mean).
print("Missing values per feature:")
print(X.isna().sum())

agr_by_class = dataset_df.groupby('CLASS', as_index=False)['AGR']
for stat_name in ('min', 'max', 'mean'):
  print()
  stat_table = getattr(agr_by_class, stat_name)()
  display(stat_table.rename(columns={'AGR': f'{stat_name}(AGR)'}))

In [None]:
# Replace missing AGR values with the mean AGR of the sample's class (rounded
# to two decimals). The result is assigned back to the column: an in-place
# fillna on a column selection is chained assignment, which recent pandas
# versions warn about and which does not modify the frame under Copy-on-Write.
# NOTE(review): the imputation uses the target label and runs before the
# train/test split, so label and test-set information reach the imputed values.
class_mean_agr = dataset_df.groupby('CLASS')['AGR'].transform('mean').round(2)
dataset_df['AGR'] = dataset_df['AGR'].fillna(class_mean_agr)
X = dataset_df.drop(columns=[config_dict['GENERAL']['TARGET_COLUMN_NAME']])
X.info()

### 1.4 Checking for outliers<a class="anchor" id="check_outliers"></a>

In [None]:
# Per-feature histograms of the whole dataset, split by class.
fig = prepare_histogram_plot(dataset_df, features_names, class_names, config_dict['GENERAL'], template_)

if config_dict['GENERAL']['SHOW_METHOD'] == 0:
  fig.show()
elif config_dict['GENERAL']['SHOW_METHOD'] == 1:
  # Own upload name: 'bar_plot' is already used for the class distribution
  # bar chart, and reusing it would overwrite that figure online.
  py.iplot(fig, filename = 'histogram_plot')
elif config_dict['GENERAL']['SHOW_METHOD'] == 2:
  fig.show(renderer='svg')

### 1.5 Split dataset<a class="anchor" id="split_dataset"></a>

In [None]:
# Stratified, shuffled 80/20 hold-out split seeded from the shared configuration.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=config_dict['GENERAL']['SEED'],
    shuffle=True,
    stratify=y,
)

## 2. Exploratory data analysis<a class="anchor" id="exploratory_data_analysis"></a>

In [None]:
# Class balance of the training split: printed summary plus bar chart.
fig, data_series = prepare_bar_plot(
    y_train,
    title="Training samples: distribution per class",
    template_=template_,
)
print(f'Training dataset length: {X_train.shape[0]}\n')
for label, count in data_series.items():
    print("%s: %d instances (%.2f%%)" % (label, count, (count/y_train.shape[0])*100))
print()

show_method = config_dict['GENERAL']['SHOW_METHOD']
if show_method == 1:
  py.iplot(fig, filename='bar_plot_training')
elif show_method in (0, 2):
  # renderer=None is plotly's default, i.e. a plain fig.show()
  fig.show(renderer='svg' if show_method == 2 else None)

In [None]:
# Class balance of the test split: printed summary plus bar chart.
# (Title typo "classs" corrected.)
fig, data_series = prepare_bar_plot(y_test, title = "Test samples: distribution per class", template_=template_)
print(f'Test dataset length: {X_test.shape[0]}\n')
for value in zip(data_series.values, data_series.index):
    print("%s: %d instances (%.2f%%)" % (value[1], value[0], (value[0]/y_test.shape[0])*100))
print()

if config_dict['GENERAL']['SHOW_METHOD'] == 0:
  fig.show()
elif config_dict['GENERAL']['SHOW_METHOD'] == 1:
  py.iplot(fig, filename = 'bar_plot_test')
elif config_dict['GENERAL']['SHOW_METHOD'] == 2:
  fig.show(renderer='svg')

### 2.1 Statistical quantitative description of features<a class="anchor" id="stat_features_descr"></a>

Generate descriptive statistics.
For numeric data, the result’s index will include count, mean, std, min, max as well as lower, 50 and upper percentiles. By default the lower percentile is 25 and the upper percentile is 75. The 50 percentile is the same as the median.

In [None]:
# Summary statistics of the numeric training features. display() returns None,
# so wrapping it in print() only added a stray "None" to the output.
display(X_train[config_dict['GENERAL']['NUMERIC_FEATURES']].describe())

In [None]:
# Training features with the target column re-attached; reused by later plots.
X_train_class = X_train.copy()
X_train_class[config_dict['GENERAL']['TARGET_COLUMN_NAME']] = y_train

# Feature-only views of each class (target column dropped again).
liver_patients = X_train_class.loc[X_train_class[config_dict['GENERAL']['TARGET_COLUMN_NAME']] == 1].drop(columns=[config_dict['GENERAL']['TARGET_COLUMN_NAME']])
no_liver_patients = X_train_class.loc[X_train_class[config_dict['GENERAL']['TARGET_COLUMN_NAME']] == 0].drop(columns=[config_dict['GENERAL']['TARGET_COLUMN_NAME']])

# display() returns None, so it is called directly instead of being wrapped
# in print(); the "Quantitative" typo in the headings is corrected as well.
print("Quantitative description (class 1)")
display(liver_patients.describe()) # per class
print("Quantitative description (class 0)")
display(no_liver_patients.describe()) # per class

### 2.2 Box plots<a class="anchor" id="box_plots"></a>

We plot one box plot per numeric feature to visualize whether each feature presents different characteristics depending on the target class.

In [None]:
# Box plots of the numeric training features, one box per class.
fig = prepare_box_plot(
    X_train_class,
    config_dict['GENERAL']['NUMERIC_FEATURES'],
    class_names,
    config_dict['GENERAL'],
    template_,
)

show_method = config_dict['GENERAL']['SHOW_METHOD']
if show_method == 1:
  py.iplot(fig, filename='box_plot')
elif show_method in (0, 2):
  # renderer=None is plotly's default, i.e. a plain fig.show()
  fig.show(renderer='svg' if show_method == 2 else None)

In [None]:
# Same box plots after min-max scaling the features, so that features with
# different ranges can be compared on one axis. The target column is left as is.
X_train_class_scaled = X_train_class.copy()
X_train_class_scaled[features_names] = MinMaxScaler().fit_transform(X_train_class_scaled[features_names])
fig = prepare_box_plot(X_train_class_scaled, config_dict['GENERAL']['NUMERIC_FEATURES'], class_names, config_dict['GENERAL'], template_)

if config_dict['GENERAL']['SHOW_METHOD'] == 0:
  fig.show()
elif config_dict['GENERAL']['SHOW_METHOD'] == 1:
  # Upload name typo ('bo_plot_scaled') corrected.
  py.iplot(fig, filename = 'box_plot_scaled')
elif config_dict['GENERAL']['SHOW_METHOD'] == 2:
  fig.show(renderer='svg')

### 2.3 Correlation analysis<a class="anchor" id="corr_analysis"></a>

In [None]:
# Pairwise correlation heatmap of the training features.
fig = prepare_heatmap(X_train, template_)

show_method = config_dict['GENERAL']['SHOW_METHOD']
if show_method == 1:
  py.iplot(fig, filename='heatmap')
elif show_method in (0, 2):
  # renderer=None is plotly's default, i.e. a plain fig.show()
  fig.show(renderer='svg' if show_method == 2 else None)

In [None]:
# Hierarchical clustering of the training features based on their correlation.
fig = prepare_dendogram(
    X_train,
    title_="Dendrogram of clustering the features according to correlation",
    template_=template_,
)

show_method = config_dict['GENERAL']['SHOW_METHOD']
if show_method == 1:
  py.iplot(fig, filename='dendogram')
elif show_method in (0, 2):
  # renderer=None is plotly's default, i.e. a plain fig.show()
  fig.show(renderer='svg' if show_method == 2 else None)

In [None]:
# Static seaborn scatter matrix of the training features; corner=True draws
# only the lower triangle.
sns.pairplot(data=X_train, corner=True)
plt.show()

In [None]:
# Every unordered pair of feature positions, browsed one pair at a time.
feature_positions = range(len(features_names))
combinations = list(itertools.combinations(feature_positions, 2))
fig = prepare_pairplot(
    X_train_class,
    config_dict['GENERAL'],
    combinations,
    features_names,
    template_,
)

show_method = config_dict['GENERAL']['SHOW_METHOD']
if show_method == 1:
  py.iplot(fig, filename='pair_plot1')
elif show_method in (0, 2):
  # renderer=None is plotly's default, i.e. a plain fig.show()
  fig.show(renderer='svg' if show_method == 2 else None)

## Preprocessing steps

#### Rebalancing

### Dimensionality reduction methods

#### Principal component analysis

In [None]:
# Standardise the training features, then fit a PCA that keeps as many
# components as there are features.
X_train_scaled = StandardScaler().fit_transform(X_train)
pca = PCA(
    n_components=len(features_names),
    random_state=config_dict['GENERAL']['SEED'],
)
X_train_pca_trasformed = pca.fit_transform(X_train_scaled)

In [None]:
# Individual and cumulative explained variance of the fitted PCA.
fig = prepare_PCA_explained_variance_plot(pca, template_)

if config_dict['GENERAL']['SHOW_METHOD'] == 0:
  fig.show()
elif config_dict['GENERAL']['SHOW_METHOD'] == 1:
  # Upload name typo ('explaied_variance') corrected.
  py.iplot(fig, filename = 'explained_variance')
elif config_dict['GENERAL']['SHOW_METHOD'] == 2:
  fig.show(renderer='svg')

Loadings are visualized by arrows that are under an angle and have a certain length. The angle represents the contribution of a particular feature in the direction of the PCs where it contributes. The length of the arrow depicts the strength of the contribution of the feature in that direction.

In [None]:
# Biplot: training samples on the first two components plus feature loadings.
fig = prepare_biplot(
    pca,
    X_train_pca_trasformed,
    y_train,
    features_names,
    template_,
)

show_method = config_dict['GENERAL']['SHOW_METHOD']
if show_method == 1:
  py.iplot(fig, filename='bi_plot')
elif show_method in (0, 2):
  # renderer=None is plotly's default, i.e. a plain fig.show()
  fig.show(renderer='svg' if show_method == 2 else None)

In [None]:
# Coefficients of each feature on the first two principal components
# (rows: features, columns: PC1 and PC2), not scaled by the explained variance.
loadings = pd.DataFrame(
    pca.components_[:2].T,
    index=features_names,
    columns=['PC1', 'PC2'],
)
loadings

In [None]:
# Loadings: component coefficients scaled by the square root of the variance
# explained by each component; only PC1 and PC2 are tabulated.
component_std = np.sqrt(pca.explained_variance_)
loadings = pca.components_.T * component_std
loading_matrix = pd.DataFrame(
    loadings[:, :2],
    index=features_names,
    columns=['PC1', 'PC2'],
)
loading_matrix

#### Empirical feature selection

## Classification

### Metrics

### Cross validation

In [None]:
config_dict['CLASSIFICATION'] = {}

# Candidate estimators, keyed by the name that also prefixes their
# hyper-parameters in PARAMS ('<name>__<parameter>').
# Estimators with a random component receive the configured seed so that
# repeated runs give the same models.
config_dict['CLASSIFICATION']['MODELS'] = {
        'LinearSVC': svm.SVC(kernel='linear'),
        'RbfSVC': svm.SVC(kernel='rbf'),
        'KNN': neighbors.KNeighborsClassifier(),
        'LogisticRegression': linear_model.LogisticRegression(random_state=config_dict['GENERAL']['SEED']),
        'DecisionTree': tree.DecisionTreeClassifier(random_state=config_dict['GENERAL']['SEED']),
        'RandomForest': ensemble.RandomForestClassifier(random_state=config_dict['GENERAL']['SEED']),
}

# Settings shared by every model search: scoring function, number of inner
# and outer cross-validation folds, and the seed.
config_dict['CLASSIFICATION']['GENERAL'] = {
    'score_metric': f1_score,
    'cv_inner': 5,
    'cv_outer': 10,
    'seed': config_dict['GENERAL']['SEED']
}

# Hyper-parameter grids, one per model.
config_dict['CLASSIFICATION']['PARAMS'] = {
    'KNN': {
            'KNN__n_neighbors' : list(range(1,35, 4)), 
            'KNN__weights': ['uniform', 'distance' ],
            #'KNN__n_jobs' : [-1],
            },
    'LinearSVC': {
            'LinearSVC__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
            #'LinearSVC__kernel': ['linear'],  
            },
    'RbfSVC': {
            'RbfSVC__C': [10, 100, 1000],  
            'RbfSVC__gamma': [0.001, 0.01, 0.1, 1],    
            #'RbfSVC__kernel': ['rbf'],
            },
    'LogisticRegression': {
            'LogisticRegression__penalty': ['l1', 'l2'],
            'LogisticRegression__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],   
            'LogisticRegression__max_iter': [1000],
            # Every solver in the grid must support every penalty in the grid.
            # 'lbfgs' does not support 'l1', so half of its candidates could
            # not be fitted; 'liblinear' and 'saga' both handle 'l1' and 'l2'.
            'LogisticRegression__solver': ['liblinear', 'saga'],
            },
    'DecisionTree': {
            'DecisionTree__max_depth': list(range(2, 10)),
            'DecisionTree__min_samples_split': list(range(2, 10)),
            'DecisionTree__criterion' : ['gini', 'entropy'],
            },
    'RandomForest': {
            'RandomForest__n_estimators': [10, 100],
            'RandomForest__criterion' : ['gini', 'entropy'],
            'RandomForest__max_depth': list(range(2, 10)),
            'RandomForest__min_samples_split': [2, 5, 10], 
            'RandomForest__max_features': ['sqrt'],
            #'RandomForest__n_jobs' : [-1],
            },
}

show_dict(config_dict)