In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, precision_score, classification_report, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,normalize
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go

In [None]:
# Load the Digit Recognizer training set from the CSV file next to the notebook.
mnist_data = pd.read_csv(filepath_or_buffer="train_mnist.csv")

In [None]:
# Preview the first five rows of the loaded frame.
mnist_data.head(n=5)

In [None]:
# Check for missing values: total count of NaN cells across every column.
mnist_data.isna().sum().sum()

In [None]:
# Number of samples per value of the 'label' column.
mnist_data.loc[:, 'label'].value_counts()

In [None]:
# First column is kept as the target (as an (n, 1) array, hence the
# .ravel() calls in later cells); every remaining column is a feature.
y = mnist_data.iloc[:, :1].values
X = mnist_data.iloc[:, 1:]

# Normalisation of X: scale each row (sample) to unit L2 norm.
X_norm = normalize(X, norm="l2", axis=1)

# 70 / 30 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_norm,
    y,
    test_size=0.3,
    random_state=13,
)

### LDA como classificador

In [None]:
# LDA used directly as a classifier. n_components is left at its default
# (relevant only for dimensionality reduction: at most n_classes - 1).
modelLDA = LinearDiscriminantAnalysis()
y_train_flat = y_train.ravel()  # flatten the (n, 1) label column to 1-D
modelLDA.fit(X_train, y_train_flat)

In [None]:
# Evaluate the fitted LDA classifier.
# `score` is computed on the TRAINING split; the accuracy and the confusion
# matrix below are computed on the held-out test split.
score = modelLDA.score(X_train, y_train)
ypred = modelLDA.predict(X_test)
cm = confusion_matrix(y_test, ypred)

print("Score: ", score)
# The label carries a ': ' separator so the number does not run into the
# word, matching the random-forest report further down.
print('Accuracy: ' + str(accuracy_score(y_test, ypred)))
print(cm)
# Uncomment for per-class precision / recall / F1:
#print(classification_report(y_test, ypred))

In [None]:
# Sum of absolute LDA coefficients per class, shown as a sorted bar chart.
# The author reads a larger value as a more separable class.
coef_abs_sum = pd.Series(
    np.abs(modelLDA.coef_).sum(axis=1),
    index=modelLDA.classes_,
)
coef_abs_sum.sort_values().plot.bar(
    figsize=(12, 6),
    title="LDA Somatoria dos coeficientes para cada classe",
)

### LDA redução de dimensionalidade

### Random forest + LDA

In [None]:
# Learn the per-feature scaling statistics from the training split only,
# then apply that same scaling to both splits.
# NOTE(review): X_train2 / X_test2 are reassigned by the LDA cell that follows.
sc = StandardScaler().fit(X_train)
X_train2 = sc.transform(X_train)
X_test2 = sc.transform(X_test)

In [None]:
# Project both splits onto the linear discriminants. n_components is left at
# its default (at most n_classes - 1 components).
#
# The projection is fitted on the standardized features produced by `sc`
# (fitted on the training split in the previous cell). Scaling is applied
# here from X_train / X_test rather than from X_train2 / X_test2, so that
# re-running this cell never feeds already-projected data back into LDA.
lda2 = LinearDiscriminantAnalysis()
X_train2 = lda2.fit_transform(sc.transform(X_train), y_train.ravel())
X_test2 = lda2.transform(sc.transform(X_test))

In [None]:
# Random forest trained on the LDA-projected features.
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so the reported scores do not change between runs; 13 is the
# same seed used for the train/test split.
clfR2 = RandomForestClassifier(random_state=13)
clfR2.fit(X_train2, y_train.ravel())

In [None]:
# Evaluate the random forest: `score` is measured on the training split,
# while the accuracy and confusion matrix use the held-out test split.
ypred = clfR2.predict(X_test2)
score = clfR2.score(X_train2, y_train)
cm = confusion_matrix(y_test, ypred)
test_accuracy = accuracy_score(y_test, ypred)

print("Score: ", score)
print('Accuracy: ' + str(test_accuracy))
print(cm)

O princípio do LDA é encontrar projeções que maximizem a variância entre as classes e minimizem a variância dentro de cada classe.

In [None]:
# Restrict the plot to the first rows of the data set. The labels are cut to
# the same length because they drive marker colour and hover text in Plotly.
n_plot_rows = 6000
Target = mnist_data['label'].iloc[:n_plot_rows]
X2 = X.iloc[:n_plot_rows]

In [None]:
# Standardize the plotting subset with a scaler fitted on that subset alone.
subset_scaler = StandardScaler()
X_std = subset_scaler.fit(X2).transform(X2)

In [None]:
# Fit LDA with five linear discriminants on the standardized subset.
# NOTE(review): despite its name, X_LDA_2D holds all five components; only
# the first two columns are drawn in the scatter plot.
lda = LinearDiscriminantAnalysis(n_components=5)
subset_labels = Target.to_numpy()
lda.fit(X_std, subset_labels)
X_LDA_2D = lda.transform(X_std)

In [None]:
# Scatter plot of only the two leading linear discriminants, one marker per
# sample, coloured and hover-labelled by its digit label.
marker_style = {
    'size': 8,
    'color': Target,
    'colorscale': 'Jet',
    'line': {
        'width': 2,
        'color': 'rgb(255, 255, 255)',
    },
    'opacity': 0.8,
}

traceLDA = go.Scatter(
    x=X_LDA_2D[:, 0],
    y=X_LDA_2D[:, 1],
    mode='markers',
    text=Target,
    showlegend=True,
    marker=marker_style,
)
data = [traceLDA]

# Axis styling; only the x axis suppresses its zero line.
x_axis = {
    'title': 'First Linear Discriminant',
    'ticklen': 5,
    'zeroline': False,
    'gridwidth': 2,
}
y_axis = {
    'title': 'Second Linear Discriminant',
    'ticklen': 5,
    'gridwidth': 2,
}

layout = go.Layout(
    title='Linear Discriminant Analysis (LDA)',
    hovermode='closest',
    xaxis=x_axis,
    yaxis=y_axis,
    showlegend=False,
)

fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='styled-scatter')