<a href="https://colab.research.google.com/github/VasquezSRE/machinelearning-gtc/blob/main/1_Fisher_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import display_html

In [2]:
# Read the CSV file from the local ./data directory.
df = pd.read_csv('./data/CTGsmt with titles.csv')
# Work on a copy so `df` keeps the frame exactly as it was read.
dataset = df.copy()
# Preview the first two rows.
dataset.head(2)

Unnamed: 0,LBE,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,...,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP
0,0.259259,0.259259,0.0,0.0,0.0,0.813333,0.044118,0.472527,0.047337,0.0,...,0.110092,0.034483,0.111111,0.0,0.472441,0.587156,0.40367,0.271375,1.0,2
1,0.481481,0.481481,0.153846,0.0,0.173913,0.066667,0.279412,0.0,0.205128,0.125,...,0.165138,0.655172,0.333333,0.1,0.637795,0.577982,0.577982,0.04461,0.5,1


In [3]:
def fisherDiscriminantCoeficient(feature, y):
    """Return the summed pairwise Fisher discriminant ratio of one feature.

    For every ordered pair of distinct classes ``(i, j)`` found in ``y`` the
    ratio ``(mean_i - mean_j)**2 / (var_i + var_j)`` is added to the score,
    where the mean and the population variance are taken over the values of
    ``feature`` that belong to each class. Both ``(i, j)`` and ``(j, i)`` are
    visited, so every unordered pair of classes contributes twice. Pairs
    whose two variances are both zero are skipped to avoid dividing by zero.

    Parameters
    ----------
    feature : array-like
        One-dimensional numeric values, one per sample (list, ndarray or
        pandas Series).
    y : array-like
        Class label of each sample, same length as ``feature``.

    Returns
    -------
    float or int
        The accumulated score; ``0`` when no class pair contributed.
    """
    # Coerce up front so lists and pandas Series index the same way as arrays.
    feature = np.asarray(feature)
    y = np.asarray(y)

    # Rescale by the maximum (skipped when it is zero to avoid dividing by 0).
    peak = np.max(feature)
    if peak != 0:
        feature = feature / peak

    classes = np.unique(y)

    # Per-class statistics are computed once instead of once per class pair.
    classMeans = []
    classVariances = []
    for label in classes:
        members = feature[y == label]
        classMeans.append(np.mean(members))
        classVariances.append(np.std(members) ** 2)

    fisherCoefficient = 0
    for i in range(len(classes)):
        for j in range(len(classes)):
            if j == i:
                continue
            meanDist = (classMeans[i] - classMeans[j]) ** 2
            varianceSum = classVariances[i] + classVariances[j]
            if varianceSum != 0:
                fisherCoefficient += meanDist / varianceSum
    return fisherCoefficient

In [4]:
# Names of every column in `dataset`; each one is scored as a candidate feature.
# NOTE(review): this list also contains the "NSP" column that is used as the
# label vector below — confirm whether the target should be scored as a feature.
features = dataset.columns.values
# Accumulator for the per-column Fisher scores (filled by a later cell).
fisher = []
# Class labels used to group the samples when scoring each column.
out = dataset["NSP"].to_numpy()
out.shape

(4965,)

In [5]:
# Score every column against the class labels in `out`.
# The list is rebuilt from scratch here so the cell is idempotent: re-running
# it neither duplicates scores nor fails if `fisher` has meanwhile been
# turned into a NumPy array (which has no `append`).
fisher = []
for k in features:
    feat = dataset[k].to_numpy()
    fisher.append(fisherDiscriminantCoeficient(feat, out))

In [6]:
def splitDataFrameIntoSmaller(df, chunkSize = 10000):
    """Split ``df`` into consecutive row slices of at most ``chunkSize`` rows.

    Parameters
    ----------
    df : pandas.DataFrame (or any sliceable sequence supporting ``len``)
        The object to split.
    chunkSize : int, default 10000
        Maximum number of rows per chunk; expected to be a positive integer.

    Returns
    -------
    list
        The slices in order. The last one may be shorter than ``chunkSize``.
        An empty input yields a single empty chunk.
    """
    # Ceiling division: a length that is an exact multiple of chunkSize must
    # not produce a trailing empty chunk. At least one chunk is always
    # returned so callers still get one (empty) slice for an empty input.
    numberChunks = max(1, -(-len(df) // chunkSize))
    return [df[i*chunkSize:(i+1)*chunkSize] for i in range(numberChunks)]

In [7]:
# Render the bare list of feature names as inline HTML tables, one table per
# chunk of at most 46 rows.
df_types = pd.DataFrame({'Caracteristicas': pd.Series(features)})
dataframes = splitDataFrameIntoSmaller(df_types, chunkSize=46)
nDataframes = range(len(dataframes))
# Concatenate the HTML of every styled chunk into a single string.
df_styler = "".join(
    chunk.style
    .set_table_attributes("style='display:inline'")
    .set_caption('Tabla caracteristicas')
    ._repr_html_()
    for chunk, _ in zip(dataframes, nDataframes)
)
display_html(df_styler, raw=True)

Unnamed: 0,Caracteristicas
0,LBE
1,LB
2,AC
3,FM
4,UC
5,ASTV
6,MSTV
7,ALTV
8,MLTV
9,DL


In [8]:
# Pair every feature name with its Fisher score, one row per feature.
df_types = pd.DataFrame({
    'Caracteristicas' : pd.Series(features),
    'Indice de Fisher' : pd.Series(fisher),
   })

In [9]:
# Write the feature/score table to ./data as numbered CSV files (one per
# chunk of at most 50 rows) and show the same chunks as inline HTML tables.
dataframes = splitDataFrameIntoSmaller(df_types, chunkSize=50)
nDataframes = range(len(dataframes))
htmlFragments = []
for tableNumber, chunk in enumerate(dataframes):
    chunk.to_csv(f"./data/tabla{tableNumber}.csv")
    styledChunk = chunk.style.set_table_attributes("style='display:inline'")
    styledChunk = styledChunk.set_caption(f'Tabla caracteristicas con fisher #{tableNumber}')
    htmlFragments.append(styledChunk._repr_html_())
df_styler = "".join(htmlFragments)
display_html(df_styler, raw=True)

Unnamed: 0,Caracteristicas,Indice de Fisher
0,LBE,2.654882
1,LB,2.654882
2,AC,2.744241
3,FM,0.068982
4,UC,0.875662
5,ASTV,4.662637
6,MSTV,2.430003
7,ALTV,2.829432
8,MLTV,2.688951
9,DL,1.835397


In [10]:
# Pick out the features whose Fisher score is at most 0.5.
fisher = np.array(fisher)
badFisher = np.nonzero(fisher <= 0.5)
badFeatures, badFeaturesFisher = features[badFisher], fisher[badFisher]

In [11]:
# Table of the low-scoring features next to their Fisher scores.
df_types = pd.DataFrame({
    'Caracteristicas' : pd.Series(badFeatures),
    'Indice de Fisher' : pd.Series(badFeaturesFisher),
   })

In [12]:
# Export the low-scoring features as numbered CSV files (one per chunk of at
# most 50 rows) and show the same chunks as inline HTML tables.
# `to_csv` does not create missing parent folders, so make sure the output
# directory exists before writing into it.
Path("./csv_analisis_ingenuo/badFeatures").mkdir(parents=True, exist_ok=True)
dataframes = splitDataFrameIntoSmaller(df_types, chunkSize=50)
df_styler = ""
nDataframes = range(len(dataframes))
for i, e in zip(dataframes, nDataframes):
    i.to_csv(f"./csv_analisis_ingenuo/badFeatures/tabla{e}.csv")
    df_styler += i.style.set_table_attributes("style='display:inline'").set_caption(f'Tabla caracteristicas con fisher #{e}')._repr_html_()
display_html(df_styler, raw=True)

Unnamed: 0,Caracteristicas,Indice de Fisher
0,FM,0.068982
1,DS,0.143263
2,DR,0.0
3,Max,0.03125
4,Nmax,0.187918
5,Nzeros,0.061807
6,NSP,0.0


In [13]:
# Keep the features whose Fisher score is above 0.5 and tabulate them next to
# their scores.
fisher = np.array(fisher)
goodFisher = np.where(fisher > 0.5)
goodFeatures = features[goodFisher]
goodFeaturesFisher = fisher[goodFisher]
df_types = pd.DataFrame({
    'Caracteristicas' : pd.Series(goodFeatures),
    'Indice de Fisher' : pd.Series(goodFeaturesFisher),
   })
# `to_csv` does not create missing parent folders, so make sure the output
# directory exists before writing into it.
Path("./csv_analisis_ingenuo/goodFeatures").mkdir(parents=True, exist_ok=True)
# Export as numbered CSV files (one per chunk of at most 42 rows) and show
# the same chunks as inline HTML tables.
dataframes = splitDataFrameIntoSmaller(df_types, chunkSize=42)
df_styler = ""
nDataframes = range(len(dataframes))
for i, e in zip(dataframes, nDataframes):
    i.to_csv(f"./csv_analisis_ingenuo/goodFeatures/tabla{e}.csv")
    df_styler += i.style.set_table_attributes("style='display:inline'").set_caption(f'Tabla caracteristicas con fisher #{e}')._repr_html_()
display_html(df_styler, raw=True)

Unnamed: 0,Caracteristicas,Indice de Fisher
0,LBE,2.654882
1,LB,2.654882
2,AC,2.744241
3,UC,0.875662
4,ASTV,4.662637
5,MSTV,2.430003
6,ALTV,2.829432
7,MLTV,2.688951
8,DL,1.835397
9,DP,3.288284
