In [198]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import sklearn.metrics as metrics



# Preprocessing and Feature Selection

First put all features in one pandas.DataFrame and add image ids.

In [341]:
#load all csv files needed
df = pd.read_csv('features_all_malte.csv')
df2 = pd.read_csv('features_all_fabian.csv')
ids = pd.read_csv('image_ids.csv')

In [342]:
ids = ids.drop(['lesion_id', 'dx', 'dx_type', 'age', 'sex', 'localization'], axis = 1)

In [343]:
#add image ids to df2 (features_all_fabian.csv)
merge_df = df2.join(ids, how='outer')

In [344]:
df = merge_df.set_index('image_id').join(df.set_index('image_id'))

In [346]:
#sort here already! Needed for export later
df = df.sort_values(by = 'image_id')

In [516]:
#df.to_csv('features_all_together.csv')

In [207]:
#clean up and drop all rows where NAs occur
df = df.dropna()

In [208]:
df_train = df[df['validation'] == 0]
df_test = df[df['validation'] == 1]

# create label DataFrames
y_train = pd.DataFrame(df_train['dx'])
y_test = pd.DataFrame(df_test['dx'])

In [248]:
# create feature DataFrames
X_train = df_train.drop(['dx', 'validation', 'malignant'], axis = 1)

X_test = df_test.drop(['dx', 'validation', 'malignant'], axis = 1)

In [250]:
#scaling each feature to [0,1]
min_max_scaler = preprocessing.MinMaxScaler()

X_train = pd.DataFrame(min_max_scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(min_max_scaler.fit_transform(X_test), columns=X_test.columns)

  return umr_sum(a, axis, dtype, out, keepdims, initial)
  return self.partial_fit(X, y)
  return umr_sum(a, axis, dtype, out, keepdims, initial)
  return umr_sum(a, axis, dtype, out, keepdims, initial)
  return self.partial_fit(X, y)
  return umr_sum(a, axis, dtype, out, keepdims, initial)


In [None]:
#feature selection #1
from sklearn.feature_selection import SelectKBest,f_classif, chi2,mutual_info_classif

select = SelectKBest(f_classif , k=26)
select.fit(X_train, np.ravel(y_train))
names = X_train.columns.values[select.get_support(indices=True)]
X_train = pd.DataFrame(select.transform(X_train))
X_test = pd.DataFrame(select.transform(X_test))

In [251]:
#feature selection #2
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=25)) #random_state to make reproducable
select.fit(X_train, np.ravel(y_train))
names = X_train.columns.values[select.get_support(indices=True)]
X_train = pd.DataFrame(select.transform(X_train))
X_test = pd.DataFrame(select.transform(X_test))

In [None]:
#feature selection #3
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

#the smaller C the fewer features selected.
lsvc = LinearSVC(C=0.1, penalty="l1", dual=False).fit(X_train, np.ravel(y_train))
select = SelectFromModel(lsvc)
select.fit(X_train, np.ravel(y_train))
names = X_train.columns.values[select.get_support(indices=True)]
X_train = pd.DataFrame(select.transform(X_train))
X_test = pd.DataFrame(select.transform(X_test))

In [247]:
names

array(['Moment_R_L01', 'Moment_R_L02', 'Moment_R_L20', 'Moment_G_L20',
       'Moment_B_L02', 'Moment_B_L20', 'average_red3', 'average_green3',
       'average_blue3', 'area_variance01', 'area_variance02',
       'area_variance03', 'area_variance1', 'area_variance2',
       'area_variance3', 'average_blue2', 'average_green2',
       'average_red2', 'contrast2', 'correlation2', 'dissimilarity2',
       'energy2', 'nu12', 'perimeter'], dtype=object)

In [253]:
X_train = preprocessing.scale(X_train)
X_test  = preprocessing.scale(X_test)

# Training

In [322]:
model = SVC(kernel='rbf', C=3, gamma = 'auto', probability = True)

In [323]:
model.fit(X_train,np.ravel(y_train))

SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [325]:
prediction_probabilities = model.predict_proba(X_test)

In [326]:
probabs = pd.DataFrame(prediction_probabilities)
probabs.columns = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']
predictions = probabs.idxmax(axis=1)

In [327]:
confusion = confusion_matrix(y_test,predictions)
print(confusion)


[[  36   15   11    0    4   18    0]
 [  14   46   13    0    5   51    0]
 [  12   11   97    0   40  114    1]
 [   5    6    6    3    1    9    0]
 [   9    1   17    0   92  160    1]
 [   6   11   54    2   86 1518    0]
 [   0    1    5    0    4   14   12]]


In [None]:
# check if overfitting the data by looking at training dataset

#prediction_probabilities = model.predict_proba(X_train)
#probabs = pd.DataFrame(prediction_probabilities)
#probabs.columns = [False, True]
#predictions = probabs.idxmax(axis=1)
#confusion = confusion_matrix(y_train,predictions)
#print(confusion)


In [328]:
# classification metrics (https://stackoverflow.com/a/43331484/8614827)
FP = confusion.sum(axis=0) - np.diag(confusion)  
FN = confusion.sum(axis=1) - np.diag(confusion)
TP = np.diag(confusion)
TN = confusion.sum() - (FP + FN + TP)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

In [329]:
# metrics used on MTEC website
print(ACC.mean()) #accuracy
print(TPR.mean()) #sensitivity
print(TNR.mean()) #specificity
print(metrics.f1_score(y_test, predictions, average=None).mean()) #F1-score

0.9195539625647152
0.4007114922729541
0.9159333617525623
0.4458782247005443


# Postprocessing and csv Creation

In [331]:
result = pd.DataFrame(prediction_probabilities)

In [332]:
result.columns = ['AKIEC','BCC','BKL','DF','MEL','NV','VASC']
result = result[['MEL','NV','BCC','AKIEC','BKL','DF','VASC']]

In [333]:
result.head()

Unnamed: 0,MEL,NV,BCC,AKIEC,BKL,DF,VASC
0,0.032456,0.142599,0.085633,0.071311,0.473621,0.190362,0.004017
1,0.15056,0.797608,0.004534,0.001266,0.042992,0.000951,0.002089
2,0.035165,0.31228,0.101272,0.02292,0.300031,0.223395,0.004936
3,0.140106,0.437406,0.070157,0.014706,0.312745,0.010002,0.014877
4,0.009591,0.968515,0.003097,0.002228,0.011212,0.004529,0.000827


In [347]:
image_label = pd.DataFrame(df_test.index)
image_label = image_label.sort_values(by = 'image_id')

In [335]:
image_label= image_label.reset_index(drop=True)

In [336]:
to_csv = image_label.join(result, how='outer')

In [337]:
to_csv.columns = ['image','MEL','NV','BCC','AKIEC','BKL','DF','VASC']

In [338]:
to_csv.to_csv('multiclass_classification', index = False)

In [339]:
to_csv.head()

Unnamed: 0,image,MEL,NV,BCC,AKIEC,BKL,DF,VASC
0,ISIC_0024312,0.032456,0.142599,0.085633,0.071311,0.473621,0.190362,0.004017
1,ISIC_0024317,0.15056,0.797608,0.004534,0.001266,0.042992,0.000951,0.002089
2,ISIC_0024318,0.035165,0.31228,0.101272,0.02292,0.300031,0.223395,0.004936
3,ISIC_0024324,0.140106,0.437406,0.070157,0.014706,0.312745,0.010002,0.014877
4,ISIC_0024328,0.009591,0.968515,0.003097,0.002228,0.011212,0.004529,0.000827


# Random Forrest

In [194]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, np.ravel(y_train))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [195]:
prediction_probabilities = rfc.predict_proba(X_test)