# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### From sklearn - Preprocesing 
from sklearn import preprocessing

# Dimension reduction 

from sklearn.decomposition import TruncatedSVD

# Clustering 
from sklearn.manifold import TSNE

# K-fold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

# From sklearn - Model creation

from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import multilabel_confusion_matrix

from sklearn import metrics 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

-----
# Reading files and merging features with labels

In [2]:
#dataset = pd.read_csv("Genes\data.csv")
dataset = pd.read_csv('/Users/pedrorodriguezdeledesmajimenez/1_Coding/Datasets/RUG_Pattern-recognition_Assignment-2/Task 1/Genres/data.csv')

#labels = pd.read_csv("Genes\labels.csv")
dataset_labels =pd.read_csv('/Users/pedrorodriguezdeledesmajimenez/1_Coding/Datasets/RUG_Pattern-recognition_Assignment-2/Task 1/Genres/labels.csv')

In [3]:
# Merging both datasets and removing first column
dataset = pd.merge(dataset, dataset_labels, on='Unnamed: 0').drop("Unnamed: 0", axis=1)

----
# Analizing data

###  Encode the labels of the dataset

In [5]:
# Encode the labels and updaate values
le = preprocessing.LabelEncoder()
dataset["Class"] = le.fit_transform(dataset_labels.drop("Unnamed: 0", axis=1).values.ravel())

In [6]:
# Let´s store all the labels
keys = le.classes_

# And now store labels with their encoded value 
values = le.transform(le.classes_)
dictionary = dict(zip(keys, values))

{'BRCA': 0, 'COAD': 1, 'KIRC': 2, 'LUAD': 3, 'PRAD': 4}

### Normalize the dataset



In [7]:
# Lets store the dataset values without the columns titles
x = dataset.iloc[:, :-1].values 

In [8]:
# Let´s create an instance with the normalice function
min_max_scaler = preprocessing.MinMaxScaler()

# Let´s fit the normilice function
x_scaled = min_max_scaler.fit_transform(x)

# Let´s put in the dataframe the values scaled
dataset.iloc[:, :-1] = pd.DataFrame(x_scaled)

-----------
## Dimensionality reduction:  1.Principal Component Analysis

### Output of preprocesing 

In [25]:
dataset_reduced_PCA.shape

(801, 103)

--------------
# Model Implementation:  Esemble methods 

###  80% Training data, 20% test data

#For ORIGINAL data as input

input_data = dataset.iloc[:,:-1].values
label_data = dataset.iloc[:,-1].values

In [26]:
#For REDUCED data as input

input_data = dataset_reduced_PCA.iloc[:,:-1].values
label_data = dataset_reduced_PCA.iloc[:,-1].values

In [27]:
# We split the data  
X_train, X_test, y_train, y_test = train_test_split(input_data,label_data, test_size=0.2)

In [28]:
from sklearn.pipeline import Pipeline

# Pipeline 1: inside we are giving a list of tuples 
pipeline_lr = Pipeline([('lr_classifier', LogisticRegression())])

# Pipeline 2
pipeline_dt = Pipeline([('dt_classifier', DecisionTreeClassifier())])

# Pipeline 3
pipeline_randomforest = Pipeline([('rf_classifier', RandomForestClassifier())])

In [29]:
## Lets make a list of pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

In [30]:
# Variable declaration

best_accuracy = 0.0
best_classifier = 0 
best_pipeline = ""

In [31]:
# Dictionary of pipelines and classifier types for ease of reference

pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

In [32]:
# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)
    

In [33]:
# Display

for i, model in enumerate(pipelines):
    print("{} Test accuracy: {}".format(pipe_dict[i], model.score(X_test,y_test)))

Logistic Regression Test accuracy: 1.0
Decision Tree Test accuracy: 0.937888198757764
RandomForest Test accuracy: 0.9813664596273292


In [34]:
for i, model in enumerate(pipelines):
    if model.score(X_test, y_test) > best_accuracy:
        best_accuracy = model.score(X_test, y_test)
        best_pipeline = model
        best_classifier = i
        
print('Classifier with the best accuracy: {}' .format(pipe_dict[best_classifier]))   

Classifier with the best accuracy: Logistic Regression


In [35]:
lr = LogisticRegression()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

labels = ['Logistic Regression', 'Decision Tree', 'Random Forest']

In [36]:
from sklearn.ensemble import VotingClassifier

evc = VotingClassifier(estimators = [(labels[0],lr),(labels[1], dt), (labels[2], rf)], voting = 'hard')

evc.fit(X_train, y_train)

VotingClassifier(estimators=[('Logistic Regression', LogisticRegression()),
                             ('Decision Tree', DecisionTreeClassifier()),
                             ('Random Forest', RandomForestClassifier())])

In [37]:
evc.score(X_test, y_test)

0.9875776397515528

In [38]:
cross_val_score(evc,input_data,label_data, cv=5).mean()

0.9887732919254658