# Import Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### From sklearn - Preprocesing 
from sklearn import preprocessing

# Dimension reduction 
from sklearn.decomposition import TruncatedSVD

# K-fold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

# From sklearn - Model creation

from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import multilabel_confusion_matrix

from sklearn import metrics 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

-----
# Reading files and merging features with labels

In [3]:
dataset = pd.read_csv("data.csv")

dataset_labels = pd.read_csv("labels.csv")

In [4]:
# Merging both datasets and removing first column
dataset = pd.merge(dataset, dataset_labels, on='Unnamed: 0').drop("Unnamed: 0", axis=1)

----
# Analizing data

###  Encode the labels of the dataset

In [5]:
# Encode the labels and updaate values
le = preprocessing.LabelEncoder()
dataset["Class"] = le.fit_transform(dataset_labels.drop("Unnamed: 0", axis=1).values.ravel())

In [6]:
# Let´s store all the labels
keys = le.classes_

# And now store labels with their encoded value 
values = le.transform(le.classes_)
dictionary = dict(zip(keys, values))

### Normalize the dataset



In [7]:
# Lets store the dataset values without the columns titles
x = dataset.iloc[:, :-1].values 

In [8]:
# Let´s create an instance with the normalice function
min_max_scaler = preprocessing.MinMaxScaler()

# Let´s fit the normilice function
x_scaled = min_max_scaler.fit_transform(x)

# Let´s put in the dataframe the values scaled
dataset.iloc[:, :-1] = pd.DataFrame(x_scaled)

### Output of preprocesing 

In [9]:
dataset.shape

(801, 20532)

--------------
# Model Implementation:  Esemble methods 

###  80% Training data, 20% test data

In [15]:
# Selected value dimension reduction 
lda = LinearDiscriminantAnalysis(n_components=4)

# Apply PCA in input vector
lda.fit(dataset.iloc[:,:-1],dataset.iloc[:,-1]) 

# Save results in an np.array
reduced = lda.transform(dataset.iloc[:,:-1])

# Save labels in an np.array
x = dataset['Class'].to_numpy()

# Create final dataframe with reduced dimensions
dataset_reduced_LDA = pd.DataFrame(np.column_stack((reduced, x)))

#For origianl data as input
input_data = dataset.iloc[:,:-1].values
label_data = dataset.iloc[:,-1].values

#For PCA REDUCED data as input
#input_data = dataset_reduced_LDA.iloc[:,:-1].values
#label_data = dataset_reduced_LDA.iloc[:,-1].values

# We split the data  
X_train, X_test, y_train, y_test = train_test_split(input_data,label_data, test_size=0.2)

In [16]:
lr = LogisticRegression(max_iter=500)
dt = DecisionTreeClassifier(criterion='entropy', random_state=np.random.RandomState(5))
rf = RandomForestClassifier(n_estimators=100, random_state=np.random.RandomState(5))

labels = ['Logistic Regression', 'Decision Tree', 'Random Forest']

In [17]:
evc = VotingClassifier(estimators = [(labels[0],lr),(labels[1], dt), (labels[2], rf)], voting = 'hard')

evc.fit(X_train, y_train)

VotingClassifier(estimators=[('Logistic Regression',
                              LogisticRegression(max_iter=500)),
                             ('Decision Tree',
                              DecisionTreeClassifier(criterion='entropy',
                                                     random_state=RandomState(MT19937) at 0x7FADA9F00D40)),
                             ('Random Forest',
                              RandomForestClassifier(random_state=RandomState(MT19937) at 0x7FADA9F00E40))])

In [18]:
evc.score(X_test, y_test)

0.9937888198757764

In [None]:
cross_val_score(evc,input_data,label_data, cv=5).mean()