Ref.: https://towardsdatascience.com/lda-linear-discriminant-analysis-how-to-improve-your-models-with-supervised-dimensionality-52464e73930f

In [13]:
# Data manipulation
import pandas as pd # for data manipulation

# Visualization
import plotly.express as px # for data visualization
import seaborn as sb

# Sklearn
from sklearn.preprocessing import StandardScaler # for data standardization
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn.preprocessing import OrdinalEncoder # to encode categorical variables
from sklearn.tree import DecisionTreeClassifier # for decision tree models

# Sklearn dimensionality reduction
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA # for LDA analysis
from sklearn.decomposition import PCA # for PCA analysis
sb.set_style('darkgrid')


In [2]:
# Load the CSV file into a dataframe
csv_path = 'RealState.csv'
df = pd.read_csv(csv_path, encoding='utf-8')
# Leave the dataframe as the last expression so the notebook renders it
df

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.59470,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.98450,5,24.98746,121.54391,47.3
3,4,2013.500,13.3,561.98450,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...,...
409,410,2013.000,13.7,4082.01500,0,24.94155,121.50381,15.4
410,411,2012.667,5.6,90.45606,9,24.97433,121.54310,50.0
411,412,2013.250,18.8,390.96960,7,24.97923,121.53986,40.6
412,413,2013.000,8.1,104.81010,5,24.96674,121.54067,52.5


In [3]:
# Split the unit price into three equal-frequency bands (terciles)
price_band_labels = [
    '1.Affordable (bottom 33%)',
    '2.Mid-range (middle 33%)',
    '3.Expensive (top 33%)',
]
df['Price Band'] = pd.qcut(
    df['Y house price of unit area'],
    q=3,
    labels=price_band_labels,
)
# Count the rows in each band, ordered by band label
df['Price Band'].value_counts().sort_index()

1.Affordable (bottom 33%)    141
2.Mid-range (middle 33%)     136
3.Expensive (top 33%)        137
Name: Price Band, dtype: int64

In [4]:
# Encoder that turns each price-band category into a numeric code
enc = OrdinalEncoder()

# The encoder is given a single-column dataframe (double brackets), not a Series
price_band_frame = df[['Price Band']]
df['Price Band enc'] = enc.fit_transform(price_band_frame)

# Cross-tabulate labels against codes to confirm the mapping is one-to-one
pd.crosstab(index=df['Price Band'], columns=df['Price Band enc'], margins=False)

Price Band enc,0.0,1.0,2.0
Price Band,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.Affordable (bottom 33%),141,0,0
2.Mid-range (middle 33%),0,136,0
3.Expensive (top 33%),0,0,137


In [12]:
# Build a 3D scatter plot of the three model features, coloured by price band
fig = px.scatter_3d(
    df,
    x=df['X1 transaction date'],
    y=df['X2 house age'],
    z=df['X3 distance to the nearest MRT station'],
    color=df['Price Band'],
    color_discrete_sequence=['#636EFA', '#EF553B', '#00CC96'],
    hover_data=['X3 distance to the nearest MRT station', 'Y house price of unit area', 'Price Band enc'],
    height=900,
    width=900,
)

# Settings common to all three scene axes; only the background colour differs
scene_axis_style = dict(
    color='black',
    gridcolor='#f0f0f0',
    title_font=dict(size=10),
    tickfont=dict(size=10),
)

# Camera placement for the 3D scene
scene_camera = dict(
    up=dict(x=0, y=0, z=1),
    center=dict(x=0, y=0, z=-0.2),
    eye=dict(x=-1.5, y=1.5, z=0.5),
)

# Adjust the chart's look (no title is set on this figure)
fig.update_layout(
    showlegend=True,
    legend=dict(orientation="h", yanchor="top", y=0, xanchor="center", x=0.5),
    scene_camera=scene_camera,
    margin=dict(l=0, r=0, b=0, t=0),
    scene=dict(
        xaxis=dict(backgroundcolor='white', **scene_axis_style),
        yaxis=dict(backgroundcolor='white', **scene_axis_style),
        zaxis=dict(backgroundcolor='lightgrey', **scene_axis_style),
    ),
)

# Shrink the markers so individual points stay distinguishable
fig.update_traces(marker=dict(size=2))

fig.show()

In [6]:
# Columns used as model features
feature_cols = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
]
X = df[feature_cols]
# Target: the encoded price band as a NumPy array
y = df['Price Band enc'].values

# Fit the standard scaler on the features, then apply it to the same data
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)

In [7]:
# Configure LDA with the 'eigen' solver (alternatives: 'svd' (default), 'lsqr')
# and keep two discriminant components.
# shrinkage, priors, store_covariance and tol are left at their defaults.
lda = LDA(solver='eigen', n_components=2)

# Fit on the standardised features and project them onto the discriminant axes
X_trans_lda = lda.fit_transform(X_std, y)

# Report the fitted model's key attributes
print('*************** LDA Summary ***************')
lda_summary = [
    ('Classes: ', lda.classes_),
    ('Priors: ', lda.priors_),
    ('Explained variance ratio: ', lda.explained_variance_ratio_),
]
for summary_label, summary_value in lda_summary:
    print(summary_label, summary_value)

*************** LDA Summary ***************
Classes:  [0. 1. 2.]
Priors:  [0.34057971 0.32850242 0.33091787]
Explained variance ratio:  [0.85239893 0.14760107]


In [8]:
# Plot the two discriminant components against each other, coloured by price band
ld1, ld2 = X_trans_lda[:, 0], X_trans_lda[:, 1]
fig = px.scatter(
    df,
    x=ld1,
    y=ld2,
    labels={"x": "LD1", "y": "LD2"},
    opacity=1,
    color=df['Price Band'],
)

# White plot background and the figure title
fig.update_layout(plot_bgcolor='white', title_text="LDA scatter plot")

# Both axes share the same grid, zero-line and border settings
axis_line_style = dict(
    showgrid=True, gridwidth=1, gridcolor='lightgrey',
    zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey',
    showline=True, linewidth=1, linecolor='black',
)
fig.update_xaxes(**axis_line_style)
fig.update_yaxes(**axis_line_style)

# Set the marker size
fig.update_traces(marker=dict(size=3))

fig.show()

In [9]:
def fitting(X_in, y, criterion, splitter, mdepth, clweight, minleaf):
    """Fit a decision tree classifier and print a summary of the fitted tree.

    The model is evaluated on the same data it was trained on, so the printed
    accuracy and classification report describe training performance only.

    Parameters
    ----------
    X_in : array-like
        Feature matrix used for both fitting and evaluation.
    y : array-like
        Target class labels aligned with the rows of ``X_in``.
    criterion
        Forwarded to ``DecisionTreeClassifier(criterion=...)``.
    splitter
        Forwarded to ``DecisionTreeClassifier(splitter=...)``.
    mdepth
        Forwarded to ``DecisionTreeClassifier(max_depth=...)``.
    clweight
        Forwarded to ``DecisionTreeClassifier(class_weight=...)``.
    minleaf
        Forwarded to ``DecisionTreeClassifier(min_samples_leaf=...)``.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.
    """
    # Fit the model (random_state is fixed so repeated runs give the same tree)
    model = DecisionTreeClassifier(criterion=criterion,
                                   splitter=splitter,
                                   max_depth=mdepth,
                                   class_weight=clweight,
                                   min_samples_leaf=minleaf,
                                   random_state=0,
                                  )
    clf = model.fit(X_in, y)

    # Predict class labels on training data
    pred_labels_tr = model.predict(X_in)

    # `n_features_` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer `n_features_in_` and fall back to the old name only when the
    # installed version does not provide the new one.
    n_features = getattr(clf, 'n_features_in_', None)
    if n_features is None:
        n_features = clf.n_features_

    # Describe the structure of the fitted tree
    print('*************** Tree Summary ***************')
    print('Classes: ', clf.classes_)
    print('Tree Depth: ', clf.tree_.max_depth)
    print('No. of leaves: ', clf.tree_.n_leaves)
    print('No. of features: ', n_features)
    print('--------------------------------------------------------')
    print("")

    print('*************** Evaluation on Training Data ***************')
    # Use score method to get accuracy of the model
    score_tr = model.score(X_in, y)
    print('Accuracy Score: ', score_tr)
    # Look at classification report to evaluate the model
    print(classification_report(y, pred_labels_tr))
    print('--------------------------------------------------------')

    # Return the fitted classifier for further use by the caller
    return clf

In [10]:
# Train a shallow decision tree on the two LDA components and print its summary
clf_lda = fitting(
    X_trans_lda,
    y,
    criterion='gini',
    splitter='best',
    mdepth=3,
    clweight=None,
    minleaf=10,
)

*************** Tree Summary ***************
Classes:  [0. 1. 2.]
Tree Depth:  3
No. of leaves:  7
No. of features:  2
--------------------------------------------------------

*************** Evaluation on Training Data ***************
Accuracy Score:  0.7874396135265701
              precision    recall  f1-score   support

         0.0       0.88      0.87      0.88       141
         1.0       0.70      0.72      0.71       136
         2.0       0.78      0.77      0.77       137

    accuracy                           0.79       414
   macro avg       0.79      0.79      0.79       414
weighted avg       0.79      0.79      0.79       414

--------------------------------------------------------



The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.

