In [26]:
import pandas as pd
import numpy as np

RAMDOM_STATE = 11

In [27]:
df = pd.read_csv('https://raw.githubusercontent.com/VitorBonella/PL-Dataset/main/dataset.csv',sep=";") #Leitura dos dados para o pandas

In [28]:
df.set_index('id',inplace=True) #Transformando a coluna id no indice da tabela

#Observem que a classe esta separada em duas colunas então devemos concatenadas para formar uma coluna só chamada classe
df['classe'] = df['tipo_lampada'].str.replace(" ", "") + df['potencia'].astype(str) 

In [29]:
# Lista de Indices
HU = ['i1', 'i2', 'i3', 'i4','i5', 'i6', 'i7']
MY = HU

df[MY] = df[MY].apply(lambda x: x.str.replace(',', '.').astype(float), axis=1)

X = df[MY]
Y = df['classe']

In [30]:
results_df = pd.DataFrame(columns=['Método', 'Média', 'Desvio padrão', 'Limite inf.', 'Limite sup.'])

### zeroR

In [31]:
from sklearn.model_selection import cross_val_predict 
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from scipy import stats

zR = DummyClassifier(strategy='most_frequent')

cv_zR = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=RAMDOM_STATE)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=RAMDOM_STATE)

scores_ZR = cross_val_score(zR, X_train, y_train, scoring='accuracy', cv=cv_zR)

inf, sup = stats.norm.interval(0.95, loc=scores_ZR.mean(), scale=scores_ZR.std()/np.sqrt(len(scores_ZR)))

results_df = pd.concat([results_df, pd.DataFrame({'Método': ['ZR'], 'Média': [scores_ZR.mean()], 'Desvio padrão': scores_ZR.std(), 'Limite inf.': inf, 'Limite sup.': sup})], ignore_index=True)

print(scores_ZR.mean())
print(type(scores_ZR))

0.1563405797101449
<class 'numpy.ndarray'>


In [32]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy import stats

def train_models(model, params_grid, name):
    scalar = StandardScaler()
    pipe = Pipeline(steps=[('s',scalar), ('m', model)])

    inner_cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=RAMDOM_STATE)

    outer_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=RAMDOM_STATE)

    grid_search = GridSearchCV(pipe, param_grid=params_grid,  scoring='accuracy', cv=inner_cv, n_jobs=-1)

    scores = cross_val_score(grid_search, X, Y.values.ravel(), scoring='accuracy', cv=outer_cv, n_jobs=-1)

    inf, sup = stats.norm.interval(0.95, loc=scores.mean(), scale=scores.std()/np.sqrt(len(scores)))

    local_df = pd.concat([results_df, pd.DataFrame({'Método': [name], 'Média': [scores.mean()], 'Desvio padrão': scores.std(), 'Limite inf.': inf, 'Limite sup.': sup})], ignore_index=True)

    return local_df, scores

In [33]:
from sklearn.ensemble import BaggingClassifier


bg = BaggingClassifier(n_estimators=3, random_state=RAMDOM_STATE)

params_grid = {
    'm__n_estimators': [3,9,15,21]
    } 

results_df, scores_BA = train_models(bg, params_grid, 'BA')

In [34]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=3, random_state=RAMDOM_STATE)

params_grid = {
    'm__n_estimators': [3,9,15,21]
    }

results_df, scores_AD = train_models(ada, params_grid, 'AD')

In [35]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(random_state=RAMDOM_STATE)

params_grid = {
    'm__n_estimators': [3,9,15,21]
}

results_df, scores_RF = train_models(rf, params_grid, 'RF')

In [36]:
results_df

Unnamed: 0,Método,Média,Desvio padrão,Limite inf.,Limite sup.
0,ZR,0.156341,0.020735,0.148921,0.163761
1,BA,0.461571,0.096206,0.427145,0.495997
2,AD,0.257165,0.02772,0.247246,0.267084
3,RF,0.472797,0.074776,0.446039,0.499555


In [37]:
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.utils.validation import check_X_y
from sklearn.utils import resample
from collections import Counter


class HeterogeneousPooling(BaseEstimator):
    # define o construtor para o classificador
    def __init__(self, n_samples=3):
        self.classifiers = [DecisionTreeClassifier(random_state=RAMDOM_STATE), KNeighborsClassifier(), GaussianNB()]
        self.n_samples = n_samples
        self.trained_classifiers = []

    def fit(self, X, y):

         # se os dados forem uma matriz numpy, converta para dataframe pandas
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        if isinstance(y, np.ndarray):
            y = pd.Series(y)

        # reset index
        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)
        
        self.counter = Counter(y)

        for i in range(self.n_samples):
            if i == 0:
                X_train = X
                y_train = y
            else:
                X_train, y_train = resample(X, y, replace=True, random_state=RAMDOM_STATE)

            self.train_classifiers(X_train, y_train)
        return self
    
    def train_classifiers(self, X, y):
        
        for classifier in self.classifiers:
            classifier.fit(X, y.ravel())
            self.trained_classifiers.append(classifier)

    def predict(self, X):
        predictions = []

        for _, row in X.iterrows():
            # print(row)
            predictions.append(self.unique_predict(row.values.reshape(1, -1)))

        print("predicts",predictions)

        return np.mdArray(predictions)

    def unique_predict(self, X):
        predictions = {}

        for classifier in self.trained_classifiers:
            pred = classifier.predict(X)
            print(pred)

            # try:
            predictions[pred[0]] += 1
            # except:
                # predictions[pred[0]] = 1
        
        # verificar se existe mais de uma classe com a mesma quantidade de votos
        max_votes = max(predictions.values())
        max_classes = [k for k, v in predictions.items() if v == max_votes]

        if len(max_classes) > 1:
            # se sim, retornar a classe mais frequente na base de treino
            return self.counter.most_common(1)[0][0]
        else:
            # se não, retornar a classe mais votada
            return max_classes[0]

        

In [38]:
hp = HeterogeneousPooling()
# hp.fit(X_train, y_train)

# print(hp.predict(X_test))


params_grid = {
    'm__n_samples': [1,3,5,7]
}

def train_models1(model, params_grid, name):
    scalar = StandardScaler()
    pipe = Pipeline(steps=[('s',scalar), ('m', model)])

    inner_cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=RAMDOM_STATE)

    outer_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=RAMDOM_STATE)

    grid_search = GridSearchCV(pipe, param_grid=params_grid,  scoring='accuracy', cv=inner_cv, n_jobs=-1)

    scores = cross_val_score(grid_search, X, Y.values.ravel(), scoring='accuracy', cv=outer_cv, n_jobs=-1)
    print(scores)

    inf, sup = stats.norm.interval(0.95, loc=scores.mean(), scale=scores.std()/np.sqrt(len(scores)))

    local_df = pd.concat([results_df, pd.DataFrame({'Método': [name], 'Média': [scores.mean()], 'Desvio padrão': scores.std(), 'Limite inf.': inf, 'Limite sup.': sup})], ignore_index=True)

    return local_df, scores

results_df, scores_HP = train_models1(hp, params_grid, 'HP')

[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan]


In [39]:
results_df

Unnamed: 0,Método,Média,Desvio padrão,Limite inf.,Limite sup.
0,ZR,0.156341,0.020735,0.148921,0.163761
1,BA,0.461571,0.096206,0.427145,0.495997
2,AD,0.257165,0.02772,0.247246,0.267084
3,RF,0.472797,0.074776,0.446039,0.499555
4,HP,,,,


In [77]:
from scipy import stats 
# ttest and wilcoxon test

scores = [scores_ZR, scores_BA, scores_AD, scores_RF, scores_HP]
models_names = ['ZeroR', 'BG', 'AB', 'RF', 'HP']

qtd_models = 4

comparative_results = {}

for i in range(qtd_models):
    for j in range(i+1, qtd_models):
        comparative_results[i] = {}
        # ttest
        ttest = stats.ttest_rel(scores[i], scores[j])
        # wilcoxon
        wilcoxon = stats.wilcoxon(scores[i], scores[j], method='approx')
        # save results
        comparative_results[str(i)][str(j)] = {'ttest' = ttest[1], 'wilcoxon' = wilcoxon[1]}
        print(models_names[i] + ' x ' + models_names[j])
        print('ttest: ', comparative_results[i][j][0])
        print('wilcoxon: ', comparative_results[i][j][1])

for i in range(qtd_models):
    for j in range(i+1, qtd_models):
        print(models_names[i] + ' x ' + models_names[j])
        print('ttest: ', comparative_results[i][j][0])
        


# tabela com os resultados dos testes, t student no superior e wilcoxon no inferior
# table = pd.DataFrame(columns=models_names, index=models_names)

# for i in range(len(models_names)):
#     for j in range(i, len(models_names)):
#         # se o index for igual a coluna 
#         if i == j:
#             table.loc[models_names[i], models_names[j]] = models_names[i]

#         else:
#             table.loc[models_names[i], models_names[j]] = comparative_results[models_names[i] + ' x ' + models_names[j]][0][1]
#             table.loc[models_names[j], models_names[i]] = comparative_results[models_names[i] + ' x ' + models_names[j]][1][1]

# print(table)


# table.to_csv('tstudent_wilcoxon.csv', index=False)

KeyError: '0'