In [None]:
!pip install efficient-apriori

import pandas as pd, graphviz, numpy as np
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.neighbors import KNeighborsClassifier
from efficient_apriori import apriori
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cut_tree
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
from google.colab import files

# K-NN

## Aula 16 - Medidas de Distância
*   Exercício 2 do slide 34



In [None]:
data = [
    ["Estado", "Escolaridade", "Altura", "Salário", "Classe"],
    ["SP", "Médio", "180", "3000", "A"],
    ["RJ", "Médio", "180", "3000", "B"],
    ["RS", "Superior", "174", "7000", "B"],
    ["RJ", "Médio", "180", "600", "A"],
    ["SP", "Superior", "100", "5000", "A"],
    ["RJ", "Fundam.", "178", "1800", "A"],
    ["SP", "Fundam.", "188", "1800", "A"]
]

# Turn the list of rows into a DataFrame (the first row is the header)
columns = data[0]
df = pd.DataFrame(data[1:], columns=columns)

# Encode the ordinal education level and the class label as integers.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series, which
# pandas flags as chained assignment and which leaves `df` unchanged under Copy-on-Write.
df['Escolaridade'] = df['Escolaridade'].map({'Fundam.': 1, 'Médio': 2, 'Superior': 3})
df['Classe'] = df['Classe'].map({'A': 1, 'B': 2})

# One-hot encode the state and place the indicator columns first
onehot = pd.get_dummies(df['Estado'])
df = pd.concat([onehot.astype(int), df.drop('Estado', axis=1)], axis=1)
df

In [None]:
# Build the feature matrix and the target vector used for training
feature_columns = ['RJ', 'RS', 'SP', 'Escolaridade', 'Altura', 'Salário']
X = df[feature_columns].astype(int)
y = df['Classe']

# Hold out part of the rows for testing (the fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=87
)

# Create the k-NN classifier and fit it on the training split
# Available distance metrics:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html#sklearn.metrics.pairwise.distance_metrics
knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean').fit(X_train, y_train)

# Score the predictions made for the held-out rows
predictions = knn.predict(X_test)
test_f1 = f1_score(y_test, predictions)
print(test_f1)

In [None]:
# Show each held-out feature row next to its predicted class.
# The loop variables are deliberately not named `x`/`y`, so the target vector `y`
# created in the training cell is not overwritten by running this cell.
for features, predicted in zip(X_test.values, predictions):
  print(features, predicted)

# Apriori


In [None]:
#@title dataset para o apriori

#baixar dataset (http://archive.ics.uci.edu/ml/datasets/Online+Retail/) (http://archive.ics.uci.edu/ml/machine-learning-databases/00352/)
!wget http://archive.ics.uci.edu/static/public/352/online+retail.zip
!unzip online+retail.zip

In [None]:
# Load the dataset from the extracted spreadsheet (this takes a while)
df = pd.read_excel('Online Retail.xlsx')
df

In [None]:
# To speed things up, discard every transaction from the United Kingdom and
# drop the columns that are not of interest for the analysis.
# A new frame is built with a boolean mask and a non-mutating drop (no `inplace=True`),
# so the filtering reads as one expression and no intermediate row index is materialised.
df = (
    df[df['Country'] != 'United Kingdom']
    .drop(columns=['StockCode', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country'])
)

df

In [None]:
# apriori expects a collection of transactions shaped like the example below
# ("toy" example: one inner list of items per transaction)
transactions = [
    ['eggs', 'bacon', 'soup'],
    ['eggs', 'bacon', 'apple'],
    ['soup', 'bacon', 'banana'],
    ['eggs', 'bacon'],
]

In [None]:
# Convert the dataset into the format apriori expects: one list of product
# descriptions per invoice.
# Rows without a description carry no item and are skipped (calling .strip() on a
# missing value would raise AttributeError).
described = df.dropna(subset=['Description'])

# sort=False keeps the invoices in the order they first appear in the frame.
transactions = [
    [str(product).strip() for product in products]
    for _, products in described.groupby('InvoiceNo', sort=False)['Description']
]

# Preview only the first few transactions instead of dumping the whole list
transactions[:5]

In [None]:
# Put the items of every transaction in alphabetical order (each list is sorted in place)
for basket in transactions:
  basket.sort()

In [None]:
# Run apriori (this takes a while; it may be worth using the toy example instead)
itemsets, rules = apriori(transactions, min_support=0.01, min_confidence=0.5)

rules

In [None]:
regras = []

# Print the first rule followed by its individual attributes; nothing is
# printed when no rule was produced.
first_rule = next(iter(rules), None)
if first_rule is not None:
  print(first_rule)
  for attribute in ('lhs', 'rhs', 'lift', 'support', 'conviction', 'confidence'):
    print(getattr(first_rule, attribute))

# Agrupamento

## K-means

In [None]:
# Load the iris dataset bundle
data = load_iris()

# List the fields available in the bundle
print(data.keys())

# Keep only the entry stored under 'data' for the clustering cells below
dataset = data['data']

# Preview the first 10 rows
dataset[:10]

In [None]:
# Cluster the dataset into 5 groups.
# random_state fixes the centroid initialisation so the labels are the same on every
# run (87 is the seed this notebook already uses for the train/test split).
kmeans = KMeans(n_clusters=5, n_init=10, random_state=87)
kmeans.fit(dataset)

# Cluster label assigned to each row
kmeans.labels_

## Hierárquico

In [None]:
# Pairwise Euclidean distances between all rows of the dataset (square matrix)
distance = euclidean_distances(dataset)

distance

In [None]:
# Render the distance matrix as a DataFrame for a more readable display
pd.DataFrame(distance)

In [None]:
# Flatten the strict upper triangle of the square distance matrix, row by row,
# into a single vector (entries (0,1), (0,2), ..., (1,2), ...).
# np.triu_indices_from yields the indices in exactly that order, replacing the
# element-by-element Python loop.
con_distance = distance[np.triu_indices_from(distance, k=1)]

In [None]:
# Hierarchical clustering with Ward linkage over the flattened upper-triangle distances
z = linkage(con_distance, 'ward')

In [None]:
# Draw the dendrogram on a 20x10 figure and export it as a PDF
fig, ax = plt.subplots(figsize=(20, 10))

dn = dendrogram(z, color_threshold=4, ax=ax)
fig.savefig('plt.pdf', format='pdf', bbox_inches='tight')

# files.download('plt.pdf')

## Avaliação de agrupamento

In [None]:
def set_group(matrix, ids):
  """Record that all members of one group were clustered together.

  For every ordered pair of distinct ids taken from `ids`, the cell
  matrix[a][b] is incremented by one. `matrix` is modified in place and
  nothing is returned.
  """
  for row in ids:
    for col in ids:
      if row == col:
        # an element is never counted as co-occurring with itself
        continue
      matrix[row][col] += 1

In [None]:
# 8x8 matrix of zeros used to count how often two elements share a group
m = np.zeros((8, 8))
m

In [None]:
# Groups produced by algorithm 1
for cluster in ([1, 5, 6], [3, 2, 7, 0, 4]):
  set_group(m, cluster)

# Groups produced by algorithm 2
for cluster in ([1, 5], [3, 4, 6, 0], [2, 7]):
  set_group(m, cluster)
m

In [None]:
# Partition the dataset into 10 clusters.
# n_init is passed explicitly (as in the 5-cluster run) because its default differs
# between scikit-learn releases; random_state makes the labels repeatable.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=87)
kmeans.fit(dataset)

# Map each cluster label to the list of row positions assigned to it.
# enumerate replaces the manual counter that was stored in a variable named `id`,
# which shadowed the builtin of the same name.
kmeans_t1 = {}
for row_idx, group in enumerate(kmeans.labels_):
  kmeans_t1.setdefault(group, []).append(row_idx)

In [None]:
# Partition the dataset into 3 clusters.
# n_init is passed explicitly (as in the 5-cluster run) because its default differs
# between scikit-learn releases; random_state makes the labels repeatable.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=87)
kmeans.fit(dataset)

# Map each cluster label to the list of row positions assigned to it.
# enumerate replaces the manual counter that was stored in a variable named `id`,
# which shadowed the builtin of the same name.
kmeans_t2 = {}
for row_idx, group in enumerate(kmeans.labels_):
  kmeans_t2.setdefault(group, []).append(row_idx)

In [None]:
# Show the linkage result as a table with one labelled column per field of each merge step
pd.DataFrame(z, columns=['Cluster1', 'Cluster2', 'Distance', 'cluster size'], dtype='object')

In [None]:
# Cut the hierarchy at 3 and at 6 clusters; each row of `tree` holds the two
# labels (3-cluster cut, 6-cluster cut) of one observation.
tree = cut_tree(z, n_clusters=[3, 6])

# h_tree[0] and h_tree[1] map cluster label -> row positions for the
# 3-cluster and the 6-cluster cut respectively.
# enumerate replaces the manual counter that was stored in a variable named `id`,
# which shadowed the builtin of the same name.
h_tree = [{}, {}]
for row_idx, (g3, g6) in enumerate(tree):
  h_tree[0].setdefault(g3, []).append(row_idx)
  h_tree[1].setdefault(g6, []).append(row_idx)

In [None]:
# Square matrix of zeros with one row and one column per observation in the dataset
n_samples = len(dataset)
m = np.zeros((n_samples, n_samples))
m

In [None]:
# Add the groups of the first k-means partition to the co-occurrence matrix
for grupo in kmeans_t1.values():
  set_group(m, grupo)

In [None]:
# Add the groups of the second k-means partition to the co-occurrence matrix
for grupo in kmeans_t2.values():
  set_group(m, grupo)

In [None]:
# Add the groups of both hierarchical cuts to the co-occurrence matrix
for agrp in h_tree:
  for grupo in agrp.values():
    set_group(m, grupo)

In [None]:
# Turn the co-occurrence counts into a dissimilarity: the more often two
# observations were grouped together, the smaller the resulting value
m_dist = 1 / (m + 1)

m_dist

In [None]:
# Flatten the strict upper triangle of m_dist, row by row, into a single vector
# (entries (0,1), (0,2), ..., (1,2), ...).
# np.triu_indices_from yields the indices in exactly that order, replacing the
# manual counter loop.
con_distance = m_dist[np.triu_indices_from(m_dist, k=1)]

In [None]:
# Average-linkage hierarchical clustering over the flattened dissimilarities
z = linkage(con_distance, method='average')

# Draw the dendrogram on a 20x10 figure and export it as a PDF
fig, ax = plt.subplots(figsize=(20, 10))

dn = dendrogram(z, ax=ax)
fig.savefig('plt.pdf', format='pdf', bbox_inches='tight')

# files.download('plt.pdf')