## Bibliotecas Necessárias

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
from sklearn.preprocessing import OrdinalEncoder

## Leitura, Visualização e Tratamento dos Dados

### Carregar os Dados em um DataFrame do Pandas

In [None]:
# Read the scraped dataset from 'data_scraping.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data_scraping.csv')

# Bare expression: lets the notebook render the loaded frame.
df

### Selecionar as Colunas Desejadas para Análise e Modelagem

In [None]:
# Keep every row, but only the contiguous run of columns from 'word_count'
# through 'syntax' (label-based slices include both end points).
df_select = df.loc[:, slice('word_count', 'syntax')]

# Bare expression: lets the notebook render the selection.
df_select

### Renomear as Colunas para Melhorar a Compreensão

In [None]:
# Give the two count columns human-readable names; `rename` returns a new
# frame, which replaces the previous `df_select`.
df_select = df_select.rename(
    {
        'word_count': 'word count',
        'word_diff_count': 'different words',
    },
    axis='columns',
)

# Bare expression: lets the notebook render the renamed frame.
df_select

### Identificar Outliers - Abordagem Exploratória

In [None]:
# Box plot of the word counts, used to spot outliers visually.
# The series is passed explicitly as `x`: a bare positional argument is
# interpreted differently across seaborn releases (as `x` with a
# FutureWarning in 0.11, as `data` from 0.12 on), which changes the plot
# orientation depending on the installed version.
sns.boxplot(x=df_select['word count'])

### Capturar a Posição e Visualizar os Outliers

In [None]:
# Positions (0-based row offsets, not index labels) of the rows whose word
# count exceeds 6000. `np.where` with a single argument returns a tuple
# holding one array of positions.
ol_word_count = np.where(df_select['word count'].gt(6000))

# Show the outlier positions.
print(ol_word_count[0])

### Tratar Outliers (Remoção)

In [None]:
# `ol_word_count[0]` holds row *positions* (from `np.where`), whereas
# `DataFrame.drop` expects index *labels*. Translate positions to labels
# through the index so the right rows are removed even when the index is not
# the default 0..n-1 range.
# NOTE: the positions refer to the frame as it was when they were computed;
# recompute `ol_word_count` before running this cell a second time.
df_select.drop(df_select.index[ol_word_count[0]], inplace=True)

# Re-draw the box plot to confirm the outliers are gone. The series is passed
# as `x` explicitly because a positional argument is interpreted differently
# across seaborn releases.
sns.boxplot(x=df_select['word count'])

### Aplicar Transformação Categórica na Coluna 'syntax'

In [None]:
# Store 'syntax' as a pandas categorical; the categories are inferred from
# the values present in the column.
df_select['syntax'] = pd.Categorical(df_select['syntax'])

# Bare expression: shows the dtype of every column so the change is visible.
df_select.dtypes

### Dicionário para Armazenar as Categorias da Coluna 'syntax'

In [None]:
# Map each integer code (the category's position in `.cat.categories`) to its
# label, so the numeric encoding can be reversed later.
dict_syntax = {
    code: label
    for code, label in enumerate(df_select['syntax'].cat.categories)
}

# Bare expression: lets the notebook render the mapping.
dict_syntax

### Codificação da Coluna 'syntax' para Abordagem de Aprendizado Supervisionado

In [None]:
# Replace each 'syntax' label by its integer category code, read directly
# from the underlying Categorical array (default categorical -> numeric
# encoding).
df_select['syntax'] = df_select['syntax'].array.codes

# Bare expression: lets the notebook render the encoded frame.
df_select

### Revertendo a Codificação - *Obs: Coluna Volta a Ser do Tipo Object*

In [None]:
# Undo the numeric encoding: look every value of 'syntax' up in `dict_syntax`
# (code -> label) and store the labels back in the same column.
df_select['syntax'] = df_select['syntax'].map(dict_syntax) 
# Bare expression: lets the notebook render the decoded frame.
df_select

### Outra Forma de Aplicar a Codificação Categórica aos Atributos 

In [None]:
# Alternative encoding with scikit-learn: learn the categories of 'syntax',
# then write the resulting codes to a separate 'syntax_code' column so the
# original labels are kept alongside them.
oe = OrdinalEncoder()
oe.fit(df_select[['syntax']])
df_select['syntax_code'] = oe.transform(df_select[['syntax']])

# Show the first ten label/code pairs side by side.
df_select[['syntax', 'syntax_code']].iloc[:10]

## Modelos de ML e Resultados

### Aprendizado Não Supervisionado - Cluster K-Means

In [None]:
# Two-cluster K-Means with random initial centroids; `random_state=0` fixes
# the seed, and the best of `n_init=10` runs is kept.
km = KMeans(
    n_clusters=2,
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-4,
    random_state=0,
)

# Fit on the two count columns and read back the label assigned to each row.
km.fit(df_select[['word count', 'different words']])
y_predicted = km.labels_

# Bare expression: lets the notebook render the label array.
y_predicted

In [None]:
# Attach the K-Means label of each row as a new 'cluster' column.
df_select['cluster'] = y_predicted

# Show the first ten rows, now including the cluster label.
df_select.iloc[:10]

In [None]:
# Split the rows by the label stored in the 'cluster' column.
df_c1 = df_select[df_select['cluster'] == 0]
df_c2 = df_select[df_select['cluster'] == 1]

# One scatter series per cluster: (rows, colour, marker, legend label).
for members, colour, symbol, legend_name in (
    (df_c1, 'blue', 's', 'cluster-1'),
    (df_c2, 'red', 'o', 'cluster-2'),
):
    plt.scatter(
        members['word count'],
        members['different words'],
        s=50,
        c=colour,
        marker=symbol,
        edgecolor='black',
        label=legend_name,
    )

# Overlay the centroids of the fitted model `km` as large stars; column 0 of
# `cluster_centers_` is plotted on x and column 1 on y.
plt.scatter(
    km.cluster_centers_.T[0],
    km.cluster_centers_.T[1],
    s=250,
    marker='*',
    c='black',
    edgecolor='white',
    label='centroids',
)

# Axis labels, grid and legend, then render the figure.
plt.xlabel('Word Count')
plt.ylabel('Different Words')
plt.grid()
plt.legend(scatterpoints=1)
plt.show()

In [None]:
# Elbow method: fit K-Means for k = 1..9 and record each model's inertia
# (`inertia_`), then plot inertia against k.
#
# The models are fitted as throw-away objects instead of being bound to `km`,
# so the 2-cluster model fitted earlier is not overwritten; re-running the
# cluster scatter plot after this cell still draws the matching centroids.
cluster_range = range(1, 10)
features = df_select[['word count', 'different words']]

distortions = [
    KMeans(
        n_clusters=k,
        init='random',
        n_init=10,
        max_iter=300,
        tol=1e-4,
        random_state=0,
    ).fit(features).inertia_
    for k in cluster_range
]

plt.plot(cluster_range, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

In [None]:
# Scatter of word count vs. different words: colour encodes the 'syntax'
# value and marker style encodes the K-Means cluster, so the two groupings
# can be compared on one figure.
sns.relplot(
    data=df_select,
    x='word count',
    y='different words',
    hue='syntax',
    style='cluster',
)