# 0. Preparando Ambiente

## Instalando Pacotes

In [None]:
!pip install ltn
!pip install pandas-ods-reader
!git clone https://github.com/VinnyHardth/trabalho-final-ia.git

Collecting ltn
  Downloading ltn-2.1-py3-none-any.whl.metadata (8.1 kB)
Downloading ltn-2.1-py3-none-any.whl (13 kB)
Installing collected packages: ltn
Successfully installed ltn-2.1
Collecting pandas-ods-reader
  Downloading pandas_ods_reader-1.0.1-py3-none-any.whl.metadata (2.4 kB)
Collecting ezodf>=0.3.2 (from pandas-ods-reader)
  Downloading ezodf-0.3.2.tar.gz (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.9/125.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pandas_ods_reader-1.0.1-py3-none-any.whl (7.3 kB)
Building wheels for collected packages: ezodf
  Building wheel for ezodf (setup.py) ... [?25l[?25hdone
  Created wheel for ezodf: filename=ezodf-0.3.2-py2.py3-none-any.whl size=49002 sha256=7a43371db69654bfb70196f965a1fb8ff61810fd85832e025ecc3a155d7f1857
  Stored in directory: /root/.cache/pip/wheels/50/09/62/ea2c44e6ffdd067d6c8d3f557af0734a195bd252d6901f23ca
Successfully buil

## Importando Pacotes

In [None]:
#Basic imports
from pandas_ods_reader import read_ods
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE     #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans    #K-Means Clustering
from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'

#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

## Carregando Dataset e configurando Seabornd e Pandas

In [None]:
sns.set()

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
data = pd.read_csv('/content/trabalho-final-ia/datasets/trains-updated.csv', sep=',')

In [None]:
data

Unnamed: 0,Number_of_cars,Number_of_different_loads,num_wheels1,length1,shape1,num_loads1,load_shape1,num_wheels2,length2,shape2,num_loads2,load_shape2,num_wheels3,length3,shape3,num_loads3,load_shape3,num_wheels4,length4,shape4,num_loads4,load_shape4,Rectangle_next_to_rectangle,Rectangle_next_to_triangle,Rectangle_next_to_hexagon,Rectangle_next_to_circle,Triangle_next_to_triangle,Triangle_next_to_hexagon,Triangle_next_to_circle,Hexagon_next_to_hexagon,Hexagon_next_to_circle,Circle_next_to_circle,Class_attribute
0,4,2.0,2,long,openrect,1.0,rectanglod,2,short,openrect,1.0,trianglod,2.0,long,closedrect,3.0,rectanglod,,,,,,0,1,0,0,0,0,0,0,0,0,east
1,4,2.0,2,short,openrect,2.0,circlelod,2,short,closedtrap,1.0,rectanglod,2.0,short,openrect,1.0,circlelod,,,,,,0,0,0,1,0,0,0,0,0,0,east
2,4,3.0,2,short,ushaped,1.0,trianglod,2,short,openrect,1.0,rectanglod,2.0,short,closedrect,1.0,circlelod,,,,,,0,1,0,1,0,0,0,0,0,0,east
3,5,3.0,2,short,opentrap,1.0,circlelod,2,short,ushaped,1.0,trianglod,2.0,long,openrect,3.0,rectanglod,2,short,openrect,1.0,rectanglod,1,1,0,0,0,0,1,0,0,0,east
4,4,3.0,2,short,closedrect,1.0,trianglod,2,short,closedtrap,1.0,circlelod,2.0,short,ushaped,1.0,rectanglod,,,,,,0,0,0,1,0,0,1,0,0,0,east
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5,3.0,2,short,openrect,3.0,rectanglod,2,short,hexagon,1.0,circlelod,2.0,short,opentrap,1.0,trianglod,2,short,dblopnrect,1.0,trianglod,0,0,0,1,1,0,1,0,0,0,west
96,3,2.0,2,short,hexagon,2.0,circlelod,2,long,closedrect,3.0,rectanglod,,,,,,,,,,,0,0,0,1,0,0,0,0,0,0,west
97,3,2.0,2,short,slopetoprect,1.0,circlelod,3,long,closedrect,1.0,rectanglod,,,,,,,,,,,0,0,0,0,1,0,0,0,0,0,west
98,3,1.0,2,short,slopetoprect,1.0,trianglod,2,short,openrect,1.0,trianglod,,,,,,,,,,,0,0,0,0,1,0,0,0,0,0,west


# 1. Codificação e Padronização dos dados


In [None]:
# Dicionários de Mapeamento
length_mapping = {'short': 1, 'long': 2}

shape_mapping = {
    'closedblopnrect': 1,
    'closedrect': 2,
    'closedtrap': 3,
    'closedushaped': 4,
    'dblopnrect': 5,
    'ellipse': 6,
    'hexagon': 7,
    'jaggedrect': 8,
    'openrect': 9,
    'opentrap': 10,
    'slopetopdblopnrect': 11,
    'slopetoprect': 12,
    'slopetoptrap': 13,
    'slopetopushaped': 14,
    'ushaped': 15
}

load_shapes_mapping = {
    'circlelod': 1,
    'Hexagonlod': 2,
    'rectanglod': 3,
    'trianglod': 4
}

class_atribute = {
    'east': 0,
    'west': 1
}

In [None]:
# Transformando Strings "None" e "none" em NA
data.replace({"None": pd.NA, "none": pd.NA}, inplace=True)

In [None]:
# Transformando dados Categórios em Valores Numéricos
for col in ['length1', 'length2', 'length3', 'length4']:
    data[col] = data[col].map(length_mapping)

# Aplicar os mapeamentos nas colunas de forma
for col in ['shape1', 'shape2', 'shape3', 'shape4']:
    data[col] = data[col].map(shape_mapping)

# Aplicar os mapeamentos nas colunas de formato de carga
for col in ['load_shape1', 'load_shape2', 'load_shape3', 'load_shape4']:
    data[col] = data[col].map(load_shapes_mapping)

# Aplicar os mapeamentos na coluna da classe
data['Class_attribute'] = data['Class_attribute'].map(class_atribute)

# criando coluna para enumeração dos trens
data["Train"] = range(1, len(data) + 1)

In [None]:
# Transformando NA em -1
data.fillna(-1, inplace=True)

In [None]:
# Salvar o CSV transformado
data.to_csv('/content/trabalho-final-ia/datasets/trains-updated-code.csv', sep=',')

# 2. Clusterização

In [None]:
X = pd.read_csv("/content/trabalho-final-ia/datasets/trains-updated-code.csv", index_col=0, sep=',')

kmeans = KMeans(n_clusters=2)

kmeans.fit(X)

In [None]:
clusters = kmeans.predict(X)
#Add the cluster vector to our DataFrame, X
X["Cluster"] = clusters
X

Unnamed: 0,Number_of_cars,Number_of_different_loads,num_wheels1,length1,shape1,num_loads1,load_shape1,num_wheels2,length2,shape2,num_loads2,load_shape2,num_wheels3,length3,shape3,num_loads3,load_shape3,num_wheels4,length4,shape4,num_loads4,load_shape4,Rectangle_next_to_rectangle,Rectangle_next_to_triangle,Rectangle_next_to_hexagon,Rectangle_next_to_circle,Triangle_next_to_triangle,Triangle_next_to_hexagon,Triangle_next_to_circle,Hexagon_next_to_hexagon,Hexagon_next_to_circle,Circle_next_to_circle,Class_attribute,Train,Cluster
0,4,2.0,2,2,9,1.0,3.0,2,1,9,1.0,4.0,2.0,2.0,2.0,3.0,3.0,-1,-1.0,-1.0,-1.0,-1.0,0,1,0,0,0,0,0,0,0,0,0,1,0
1,4,2.0,2,1,9,2.0,1.0,2,1,3,1.0,3.0,2.0,1.0,9.0,1.0,1.0,-1,-1.0,-1.0,-1.0,-1.0,0,0,0,1,0,0,0,0,0,0,0,2,0
2,4,3.0,2,1,15,1.0,4.0,2,1,9,1.0,3.0,2.0,1.0,2.0,1.0,1.0,-1,-1.0,-1.0,-1.0,-1.0,0,1,0,1,0,0,0,0,0,0,0,3,0
3,5,3.0,2,1,10,1.0,1.0,2,1,15,1.0,4.0,2.0,2.0,9.0,3.0,3.0,2,1.0,9.0,1.0,3.0,1,1,0,0,0,0,1,0,0,0,0,4,0
4,4,3.0,2,1,2,1.0,4.0,2,1,3,1.0,1.0,2.0,1.0,15.0,1.0,3.0,-1,-1.0,-1.0,-1.0,-1.0,0,0,0,1,0,0,1,0,0,0,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5,3.0,2,1,9,3.0,3.0,2,1,7,1.0,1.0,2.0,1.0,10.0,1.0,4.0,2,1.0,5.0,1.0,4.0,0,0,0,1,1,0,1,0,0,0,1,96,1
96,3,2.0,2,1,7,2.0,1.0,2,2,2,3.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,0,0,0,1,0,0,0,0,0,0,1,97,1
97,3,2.0,2,1,12,1.0,1.0,3,2,2,1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,0,0,0,0,1,0,0,0,0,0,1,98,1
98,3,1.0,2,1,12,1.0,4.0,2,1,9,1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,0,0,0,0,1,0,0,0,0,0,1,99,1


In [None]:
plotX = pd.DataFrame(np.array(X.sample(100)))

#Rename plotX's columns since it was briefly converted to an np.array above
plotX.columns = X.columns # X.loc[0 ,:]    # X.columns
plotX

Unnamed: 0,Number_of_cars,Number_of_different_loads,num_wheels1,length1,shape1,num_loads1,load_shape1,num_wheels2,length2,shape2,num_loads2,load_shape2,num_wheels3,length3,shape3,num_loads3,load_shape3,num_wheels4,length4,shape4,num_loads4,load_shape4,Rectangle_next_to_rectangle,Rectangle_next_to_triangle,Rectangle_next_to_hexagon,Rectangle_next_to_circle,Triangle_next_to_triangle,Triangle_next_to_hexagon,Triangle_next_to_circle,Hexagon_next_to_hexagon,Hexagon_next_to_circle,Circle_next_to_circle,Class_attribute,Train,Cluster
0,4.0,2.0,2.0,1.0,9.0,1.0,3.0,2.0,1.0,7.0,1.0,1.0,2.0,1.0,13.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,58.0,1.0
1,4.0,1.0,2.0,1.0,2.0,1.0,1.0,3.0,2.0,2.0,-1.0,-1.0,2.0,1.0,12.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,1.0
2,4.0,2.0,2.0,1.0,15.0,1.0,1.0,3.0,2.0,8.0,3.0,3.0,2.0,1.0,2.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,1.0
3,3.0,2.0,2.0,1.0,1.0,1.0,3.0,2.0,1.0,15.0,1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0
4,3.0,2.0,2.0,1.0,7.0,2.0,1.0,2.0,2.0,2.0,3.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,97.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3.0,2.0,2.0,1.0,9.0,1.0,1.0,2.0,1.0,9.0,1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,92.0,1.0
96,5.0,3.0,2.0,2.0,2.0,1.0,3.0,2.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,15.0,1.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,17.0,0.0
97,4.0,2.0,2.0,2.0,2.0,1.0,3.0,2.0,1.0,9.0,1.0,3.0,3.0,2.0,8.0,2.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,88.0,1.0
98,5.0,2.0,2.0,1.0,13.0,1.0,3.0,2.0,1.0,5.0,1.0,1.0,2.0,2.0,2.0,1.0,3.0,3.0,2.0,9.0,1.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.0,1.0


In [None]:
#PCA with one principal component
pca_1d = PCA(n_components=1)

#PCA with two principal components
pca_2d = PCA(n_components=2)

#PCA with three principal components
#pca_3d = PCA(n_components=3)

#PCA with three principal components
#pca_4d = PCA(n_components=4)


In [None]:
#This DataFrame holds that single principal component mentioned above
# PCs_1d = pd.DataFrame(pca_1d.fit_transform(plotX.drop(["Cluster"], axis=1)))
PCs_1d = pd.DataFrame(pca_1d.fit_transform(plotX))

#This DataFrame contains the two principal components that will be used
#for the 2-D visualization mentioned above
# PCs_2d = pd.DataFrame(pca_2d.fit_transform(plotX.drop(["Cluster"], axis=1)))
PCs_2d = pd.DataFrame(pca_2d.fit_transform(plotX))

#And this DataFrame contains three principal components that will aid us
#in visualizing our clusters in 3-D
# PCs_3d = pd.DataFrame(pca_3d.fit_transform(plotX.drop(["Cluster"], axis=1)))
#PCs_3d = pd.DataFrame(pca_3d.fit_transform(plotX))

In [None]:
PCs_1d.columns = ["PC1_1d"]

#"PC1_2d" means: 'The first principal component of the components created for 2-D visualization, by PCA.'
#And "PC2_2d" means: 'The second principal component of the components created for 2-D visualization, by PCA.'
PCs_2d.columns = ["PC1_2d", "PC2_2d"]

#PCs_3d.columns = ["PC1_3d", "PC2_3d", "PC3_3d"]

In [None]:
plotClusters = []
plotClusters = [plotX["Cluster"], PCs_1d,PCs_2d] #,PCs_3d]

#plotX = pd.concat([plotX,PCs_1d,PCs_2d,PCs_3d], axis=1, join='inner')
plotX = pd.concat([plotX,PCs_1d,PCs_2d], axis=1, join='inner')
plotX["dummy"] = 0
cluster0 = plotX[plotX["Cluster"] == 0]
cluster1 = plotX[plotX["Cluster"] == 1]
#cluster2 = plotX[plotX["Cluster"] == 2]
#cluster3 = plotX[plotX["Cluster"] == 3]

cluster0
# plotClusters[0]
#cluster3 = plotX[plotX["Cluster"] == 3]

Unnamed: 0,Number_of_cars,Number_of_different_loads,num_wheels1,length1,shape1,num_loads1,load_shape1,num_wheels2,length2,shape2,num_loads2,load_shape2,num_wheels3,length3,shape3,num_loads3,load_shape3,num_wheels4,length4,shape4,num_loads4,load_shape4,Rectangle_next_to_rectangle,Rectangle_next_to_triangle,Rectangle_next_to_hexagon,Rectangle_next_to_circle,Triangle_next_to_triangle,Triangle_next_to_hexagon,Triangle_next_to_circle,Hexagon_next_to_hexagon,Hexagon_next_to_circle,Circle_next_to_circle,Class_attribute,Train,Cluster,PC1_1d,PC1_2d,PC2_2d,dummy
3,3.0,2.0,2.0,1.0,1.0,1.0,3.0,2.0,1.0,15.0,1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,-42.240352,-42.240352,-8.691524,0
5,3.0,2.0,2.0,1.0,9.0,1.0,3.0,2.0,1.0,5.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,43.0,0.0,-7.088657,-7.088657,-8.100041,0
7,5.0,2.0,2.0,1.0,9.0,2.0,4.0,2.0,1.0,15.0,1.0,3.0,2.0,1.0,6.0,1.0,3.0,3.0,2.0,8.0,1.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,46.0,0.0,-4.849975,-4.849975,7.104669,0
10,5.0,3.0,2.0,1.0,15.0,1.0,3.0,2.0,2.0,9.0,3.0,3.0,2.0,1.0,1.0,1.0,3.0,2.0,1.0,10.0,1.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,-32.51643,-32.51643,4.731672,0
11,4.0,2.0,2.0,1.0,5.0,1.0,1.0,3.0,2.0,2.0,1.0,-1.0,2.0,1.0,9.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,28.0,0.0,-22.332678,-22.332678,-2.542968,0
13,4.0,2.0,2.0,1.0,8.0,1.0,3.0,2.0,1.0,9.0,1.0,4.0,2.0,2.0,2.0,3.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,38.0,0.0,-12.390671,-12.390671,-4.225701,0
15,5.0,3.0,2.0,1.0,11.0,1.0,1.0,2.0,1.0,9.0,1.0,4.0,2.0,1.0,9.0,1.0,4.0,2.0,2.0,8.0,1.0,3.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,14.0,0.0,-36.74664,-36.74664,7.032175,0
18,4.0,3.0,2.0,1.0,10.0,2.0,1.0,2.0,1.0,15.0,1.0,4.0,2.0,1.0,6.0,1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,19.0,0.0,-31.727374,-31.727374,-2.736268,0
19,3.0,2.0,2.0,1.0,9.0,1.0,3.0,2.0,1.0,5.0,1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,42.0,0.0,-8.087015,-8.087015,-8.202765,0
20,3.0,2.0,2.0,1.0,7.0,1.0,1.0,2.0,1.0,9.0,1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,49.0,0.0,-1.180848,-1.180848,-7.772793,0


In [None]:
cluster0['Train'] = "Train"+cluster0['Train'].astype('str')
cluster1['Train'] = "Train"+cluster1['Train'].astype('str')
#cluster2['id'] = cluster2['id'].astype('int')
#cluster3['id'] = cluster3['id'].astype('int')


In [None]:
cluster0



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Number_of_cars,Number_of_different_loads,num_wheels1,length1,shape1,num_loads1,load_shape1,num_wheels2,length2,shape2,num_loads2,load_shape2,num_wheels3,length3,shape3,num_loads3,load_shape3,num_wheels4,length4,shape4,num_loads4,load_shape4,Rectangle_next_to_rectangle,Rectangle_next_to_triangle,Rectangle_next_to_hexagon,Rectangle_next_to_circle,Triangle_next_to_triangle,Triangle_next_to_hexagon,Triangle_next_to_circle,Hexagon_next_to_hexagon,Hexagon_next_to_circle,Circle_next_to_circle,Class_attribute,Train,Cluster,PC1_1d,PC1_2d,PC2_2d,dummy
3,3.0,2.0,2.0,1.0,1.0,1.0,3.0,2.0,1.0,15.0,1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Train8.0,0.0,-42.240352,-42.240352,-8.691524,0
5,3.0,2.0,2.0,1.0,9.0,1.0,3.0,2.0,1.0,5.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Train43.0,0.0,-7.088657,-7.088657,-8.100041,0
7,5.0,2.0,2.0,1.0,9.0,2.0,4.0,2.0,1.0,15.0,1.0,3.0,2.0,1.0,6.0,1.0,3.0,3.0,2.0,8.0,1.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Train46.0,0.0,-4.849975,-4.849975,7.104669,0
10,5.0,3.0,2.0,1.0,15.0,1.0,3.0,2.0,2.0,9.0,3.0,3.0,2.0,1.0,1.0,1.0,3.0,2.0,1.0,10.0,1.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Train18.0,0.0,-32.51643,-32.51643,4.731672,0
11,4.0,2.0,2.0,1.0,5.0,1.0,1.0,3.0,2.0,2.0,1.0,-1.0,2.0,1.0,9.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Train28.0,0.0,-22.332678,-22.332678,-2.542968,0
13,4.0,2.0,2.0,1.0,8.0,1.0,3.0,2.0,1.0,9.0,1.0,4.0,2.0,2.0,2.0,3.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Train38.0,0.0,-12.390671,-12.390671,-4.225701,0
15,5.0,3.0,2.0,1.0,11.0,1.0,1.0,2.0,1.0,9.0,1.0,4.0,2.0,1.0,9.0,1.0,4.0,2.0,2.0,8.0,1.0,3.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,Train14.0,0.0,-36.74664,-36.74664,7.032175,0
18,4.0,3.0,2.0,1.0,10.0,2.0,1.0,2.0,1.0,15.0,1.0,4.0,2.0,1.0,6.0,1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Train19.0,0.0,-31.727374,-31.727374,-2.736268,0
19,3.0,2.0,2.0,1.0,9.0,1.0,3.0,2.0,1.0,5.0,1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Train42.0,0.0,-8.087015,-8.087015,-8.202765,0
20,3.0,2.0,2.0,1.0,7.0,1.0,1.0,2.0,1.0,9.0,1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Train49.0,0.0,-1.180848,-1.180848,-7.772793,0


In [None]:
#cluster3['id'].values[0]
#country_cluster0 = {}
#for i in range(len(cluster0)):
#    country_cluster0[i] = cat_country[cluster0['id'].values[i]]

train_cluster0 = {}
for i in range(len(cluster0)):
    train_cluster0[i] = cluster0['Train'].values[i]

train_cluster1 = {}
for i in range(len(cluster1)):
    train_cluster1[i] = cluster1['Train'].values[i]

#country_cluster3 = {}
#for i in range(len(cluster3)):
#    country_cluster3[i] = cat_country[cluster3['id'].values[i]]



In [None]:
cluster1

Unnamed: 0,Number_of_cars,Number_of_different_loads,num_wheels1,length1,shape1,num_loads1,load_shape1,num_wheels2,length2,shape2,num_loads2,load_shape2,num_wheels3,length3,shape3,num_loads3,load_shape3,num_wheels4,length4,shape4,num_loads4,load_shape4,Rectangle_next_to_rectangle,Rectangle_next_to_triangle,Rectangle_next_to_hexagon,Rectangle_next_to_circle,Triangle_next_to_triangle,Triangle_next_to_hexagon,Triangle_next_to_circle,Hexagon_next_to_hexagon,Hexagon_next_to_circle,Circle_next_to_circle,Class_attribute,Train,Cluster,PC1_1d,PC1_2d,PC2_2d,dummy
0,4.0,2.0,2.0,1.0,9.0,1.0,3.0,2.0,1.0,7.0,1.0,1.0,2.0,1.0,13.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Train58.0,1.0,7.230456,7.230456,0.965351,0
1,4.0,1.0,2.0,1.0,2.0,1.0,1.0,3.0,2.0,2.0,-1.0,-1.0,2.0,1.0,12.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Train54.0,1.0,3.56118,3.56118,-0.363908,0
2,4.0,2.0,2.0,1.0,15.0,1.0,1.0,3.0,2.0,8.0,3.0,3.0,2.0,1.0,2.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Train65.0,1.0,14.549994,14.549994,-4.121291,0
4,3.0,2.0,2.0,1.0,7.0,2.0,1.0,2.0,2.0,2.0,3.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Train97.0,1.0,46.984306,46.984306,-6.773957,0
6,5.0,2.0,2.0,1.0,10.0,1.0,4.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,2.0,2.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Train82.0,1.0,31.692084,31.692084,1.749498,0
8,3.0,1.0,2.0,1.0,12.0,1.0,4.0,2.0,1.0,9.0,1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,Train99.0,1.0,48.65823,48.65823,-5.954957,0
9,5.0,1.0,2.0,1.0,5.0,1.0,1.0,3.0,2.0,2.0,1.0,1.0,3.0,2.0,2.0,-1.0,-1.0,2.0,1.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,Train93.0,1.0,42.829704,42.829704,0.91108,0
12,4.0,1.0,2.0,1.0,10.0,1.0,4.0,2.0,1.0,9.0,1.0,4.0,2.0,2.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,Train77.0,1.0,26.590182,26.590182,-4.276657,0
14,3.0,-1.0,3.0,2.0,9.0,0.0,-1.0,3.0,2.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Train69.0,1.0,19.009116,19.009116,-7.77385,0
16,4.0,3.0,2.0,1.0,10.0,1.0,3.0,2.0,1.0,10.0,1.0,1.0,2.0,1.0,15.0,1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,Train100.0,1.0,48.939161,48.939161,4.26357,0


In [None]:
cluster_0 = cluster0.drop(columns=['Cluster', 'PC1_1d','PC1_2d','PC2_2d','dummy'])
cluster_1 = cluster1.drop(columns=['Cluster', 'PC1_1d','PC1_2d','PC2_2d','dummy'])
#cluster_2 = cluster2.drop(columns=['Cluster', 'PC1_1d','PC1_2d','PC2_2d','PC1_3d','PC2_3d','PC3_3d','dummy'])
#cluster_3 = cluster3.drop(columns=['Cluster', 'PC1_1d','PC1_2d','PC2_2d','PC1_3d','PC2_3d','PC3_3d','dummy'])

In [None]:
# Obtém os índices do DataFrame
indices = data.index

# Cria a lista de IDs utilizando os índices
train_ids = [f"Train{index}" for index in indices]

train_df = pd.DataFrame({'Train': train_ids})
#train_ids
train_df

Unnamed: 0,Train
0,Train0
1,Train1
2,Train2
3,Train3
4,Train4
...,...
95,Train95
96,Train96
97,Train97
98,Train98


In [None]:
A = np.array(plotX["Cluster"])
fA= pd.DataFrame(A.astype(int))
A


array([1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1.,
       0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1.,
       1., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0.])

In [None]:
A = np.array(plotX["Cluster"])
fA= pd.DataFrame(A.astype(int))
fA = pd.concat([fA, train_df,PCs_2d], axis=1, join='inner')
fA
#fA = pd.concat([fA, pais,PCs_2d], axis=1, join='inner')

#cluster_1 = fA[fA[0] == 0]
#cluster_2 = fA[fA[0] == 1]
#cluster_3 = fA[fA[0] == 2]
#cluster_4 = fA[fA[0] == 3]

#fA
#cluster_4
#fA["PC1_2d"]
#fA



Unnamed: 0,0,Train,PC1_2d,PC2_2d
0,1,Train0,7.230456,0.965351
1,1,Train1,3.561180,-0.363908
2,1,Train2,14.549994,-4.121291
3,0,Train3,-42.240352,-8.691524
4,1,Train4,46.984306,-6.773957
...,...,...,...,...
95,1,Train95,41.728446,-6.338305
96,0,Train96,-33.139594,7.524723
97,1,Train97,37.401809,0.228640
98,1,Train98,10.554536,5.750482


In [None]:
#for i in range(1, len(df.columns)):
#    col_name = 'S'+ str(i)
#    fig.add_trace(go.Scatter(x=df['DateTime'], y=df[col_name],
#                        mode='lines', # 'lines' or 'markers'
#                        name=col_name))
trains_cluster = []
colors = ['rgba(255, 128, 255, 0.8)', 'rgba(255, 128, 2, 0.8)', 'rgba(0, 255, 200, 0.8)','rgba(255, 55, 0, 0.8)']
#colors = ['rgba(255, 128, 255, 0.8)', 'rgba(255, 128, 2, 0.8)', 'rgba(0, 255, 200, 0.8)']

for i in range(100):
    trains_cluster.append(go.Scatter(
        x = [fA["PC1_2d"][i]],
        y = [fA["PC2_2d"][i]],
        mode = "markers",
        name = fA['Train'][i],
        marker = dict(color = colors[int(fA[0][i])]),
        #text = cat_country[i+1]
    ))

title = "Visualizing Clusters in One Dimension Using PCA"

data = trains_cluster #, trace4]

title = "Visualizing Clusters in Two Dimensions Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [30]:
data = []
colors = ['rgba(255, 128, 255, 0.8)', 'rgba(255, 128, 2, 0.8)', 'rgba(0, 255, 200, 0.8)']

# Certifique-se de trabalhar apenas com índices disponíveis
for i in range(len(plotX)):
    data.append(go.Scatter(
        x=[plotX["PC1_2d"].iloc[i]],  # Use .iloc para acessar por posição
        y=[0],  # Mantém todos os pontos no eixo y = 0
        mode="markers",
        name='Cluster ' + str(int(plotX['Cluster'].iloc[i])),  # Use .iloc aqui também
        # marker=dict(color=colors[int(plotX['Cluster'].iloc[i])])  # Descomente se quiser aplicar cores
    ))

title = "Visualizing Clusters in One Dimension Using PCA"

layout = dict(
    title=title,
    xaxis=dict(title='PC1', ticklen=5, zeroline=False),
    yaxis=dict(title='', ticklen=5, zeroline=False)
)

fig = dict(data=data, layout=layout)
fig['data'][0]['showlegend'] = False

iplot(fig)


In [32]:
#Instructions for building the 1-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter(
                    x = cluster0["PC1_1d"], #x=df_X_reduced[0],
                    y = cluster0["dummy"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["PC1_1d"],
                    y = cluster1["dummy"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)



data = [trace1, trace2]#, trace3, trace4] #, trace4]

title = "Visualizing Clusters in One Dimension Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= '',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [33]:
#Instructions for building the 2-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter(
                    x = cluster0["PC1_2d"],
                    y = cluster0["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["PC1_2d"],
                    y = cluster1["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

data = [trace1, trace2] #, trace3, trace4] #, trace4]

title = "Visualizing Clusters in Two Dimensions Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [35]:
data = []
colors = ['rgba(255, 128, 255, 0.8)', 'rgba(255, 128, 2, 0.8)', 'rgba(0, 255, 200, 0.8)']

# Certifique-se de que os índices são sequenciais
plotX = plotX.reset_index(drop=True)

# Ajustar o loop para o tamanho real do DataFrame
for i in range(len(plotX)):
    data.append(go.Scatter(
        x=[plotX["PC1_1d"].iloc[i]],  # Use iloc para acessar pela posição
        y=[plotX["PC2_2d"].iloc[i]],
        mode="markers",
        # name=cat_country[i+1],  # Descomente se 'cat_country' estiver definido
        # marker=dict(color=colors[int(plotX['Cluster'][i])]),
        # text=cat_country[i+1]
    ))

# Configuração do layout
title = "Visualizing Clusters in One Dimension Using PCA"
layout = dict(
    title=title,
    xaxis=dict(title='PC1', ticklen=5, zeroline=False),
    yaxis=dict(title='PC2', ticklen=5, zeroline=False)
)

# Criação da figura e exibição
fig = dict(data=data, layout=layout)
iplot(fig)

In [None]:
# Analisar os clusters
clusters = X['Cluster'].unique()
axioms = {}

for cluster in clusters:
    cluster_data = X[X['Cluster'] == cluster]
    mean_values = cluster_data.mean(numeric_only=True)
    mode_values = cluster_data.mode().iloc[0]
    
    axioms[cluster] = {
        'num_cars': mode_values['Number_of_cars'],
        'num_loads': mean_values['Number_of_different_loads'],
        'length': mode_values[['length1', 'length2', 'length3', 'length4']].to_dict(),
        'shape': mode_values[['shape1', 'shape2', 'shape3', 'shape4']].to_dict(),
        'relations': mode_values[['Rectangle_next_to_rectangle', 'Rectangle_next_to_triangle']].to_dict()
    }

# Propor axiomas com base nos padrões
for cluster, stats in axioms.items():
    print(f"Cluster {cluster}:")
    print(f"- Número predominante de vagões: {stats['num_cars']}")
    print(f"- Cargas médias diferentes: {stats['num_loads']}")
    print(f"- Comprimentos predominantes: {stats['length']}")
    print(f"- Formas predominantes: {stats['shape']}")
    print(f"- Relações predominantes: {stats['relations']}")
    print()

# Axiomas propostos para o Cluster 0

O **Cluster 0** agrupa trens com as seguintes características predominantes:
- Número moderado de vagões (geralmente 4).
- Menor diversidade de tipos de carga (em torno de 2).
- Formatos predominantes retangulares e comprimentos variados.
- Relações entre formas adjacentes, como "Retângulo ao lado de Triângulo".

### Axiomas:

1. **Se um trem tem 4 vagões e pelo menos 2 tipos de cargas diferentes, ele pertence ao Cluster 0.**
   - Lógica formal:  
     `num_cars(t, 4) ∧ num_loads(t, nl) ≥ 2 → Cluster0(t)`

2. **Se o primeiro vagão for longo (length1 = 2) e retangular (shape1 = 9), o trem pertence ao Cluster 0.**
   - Lógica formal:  
     `length(t, 1, 2) ∧ shape(t, 1, 9) → Cluster0(t)`

3. **Se há um vagão retangular adjacente a um triângulo, o trem pertence ao Cluster 0.**
   - Lógica formal:  
     `Rectangle_next_to_triangle(t, 1) → Cluster0(t)`


# Axiomas propostos para o Cluster 1

O **Cluster 1** agrupa trens com as seguintes características predominantes:
- Maior número de vagões (geralmente 5).
- Maior diversidade de tipos de carga (mais de 3).
- Predominância de formas triangulares e hexagonais.
- Tendência de vagões consecutivos curtos.

### Axiomas:

1. **Se um trem tem 5 vagões e mais de 3 tipos de cargas diferentes, ele pertence ao Cluster 1.**
   - Lógica formal:  
     `num_cars(t, 5) ∧ num_loads(t, nl) > 3 → Cluster1(t)`

2. **Se um trem possui pelo menos dois vagões consecutivos curtos, ele pertence ao Cluster 1.**
   - Lógica formal:  
     `length(t, c, 1) ∧ length(t, c+1, 1) → Cluster1(t)`

3. **Se a forma predominante dos vagões é triangular, o trem pertence ao Cluster 1.**
   - Lógica formal:  
     `shape(t, c, 4) ∧ num_wheels(t, c, 2) → Cluster1(t)`
