# Unsupervised classification with Word2Vec

We use the document vectors derived from Word2Vec to classify newspaper articles according to the type of crime they report. In this notebook, we use unsupervised algorithms: we train each clustering algorithm and check whether the resulting clusters match the annotations provided by the newspaper.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering, MiniBatchKMeans
from matplotlib.patches import Patch
from gensim.models import KeyedVectors
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from mpl_toolkits import mplot3d
import operator

In [None]:
configuration = 3    # possible values are: 1, 2, 3
mean = 'simple'

###
# configuration=1 --> P1: tokenization, stop word removal
# configuration=2 --> P2: tokenization, stop word removal, lemmatization
# configuration=3 --> P3: tokenization, stop word removal, lemmatization, keyphrase extraction
###

# Preprocessing switches implied by each configuration: (lemmatization, bigram).
_PREPROCESSING_FLAGS = {
    1: (False, False),
    2: (True, False),
    3: (True, True),
}

# Fail immediately on an unsupported value instead of leaving `lemmatization` and
# `bigram` undefined and only discovering the problem when the CSV path is missing.
if configuration not in _PREPROCESSING_FLAGS:
    raise ValueError(
        "configuration must be one of {}, got {!r}".format(
            sorted(_PREPROCESSING_FLAGS), configuration))

lemmatization, bigram = _PREPROCESSING_FLAGS[configuration]

# Every configuration keeps its artefacts in its own folder.
folder = "configuration_" + str(configuration)

### Modify this line to use the type of embeddings you want

In [None]:
dataset = pd.read_csv(folder+"/"+mean+"/csv/retrained_vectors.csv")

In [None]:
dataset

In [None]:
colors = ["red", "gold", "green", "mediumblue", "orange", "darkturquoise", "lightgreen",
          "deeppink", "lightpink", "blueviolet", "purple", "gray", "darkgoldenrod"]

In [None]:
target_to_color = {'furto': 'red', 'truffa': 'gold', 'spaccio': 'green', 'maltrattamento': 'mediumblue', 'riciclaggio': 'orange', 'aggressione': 'darkturquoise', 'violenza sessuale': 'lightgreen', 'evasione': 'deeppink', 'droga': 'lightpink', 'frode': 'blueviolet', 'rapina': 'purple', 'sequestro': 'gray', 'omicidio': 'darkgoldenrod'}

In [None]:
# Attach to every article the plotting colour of its crime category
# (an unknown category still raises KeyError).
dataset['color'] = dataset['target'].apply(target_to_color.__getitem__)

In [None]:
label = dataset['target'].drop_duplicates().to_list()

#zip_iterator = zip(label, colors)
#label_to_color = dict(zip_iterator)

In [None]:
# Keep only the embedding dimensions by discarding the metadata columns.
metadata_columns = ['url', 'title', 'newspaper', 'text', 'date', 'time',
                    'preprocessed', 'target', 'color']
embeddings = dataset.drop(columns=metadata_columns)

In [None]:
embeddings

We want to use only the news articles from the "Modena Today" newspaper.

In [None]:
train = dataset[dataset['newspaper'] == 'ModenaToday']

In [None]:
train.shape

In [None]:
train = train.drop(columns=['newspaper'])

In [None]:
train = train.reset_index(drop=True)

In [None]:
Y_train = train['target']

In [None]:
counter = Counter(Y_train)
counter

In [None]:
# Number the crime categories in the order in which they first appear in Y_train.
label_to_index = {name: position for position, name in enumerate(Counter(Y_train))}
label_to_index

In [None]:
# Inverse lookup (position -> crime category), numbered by first appearance in Y_train.
index_to_label = dict(enumerate(Counter(Y_train)))
index_to_label

In [None]:
Y_train = Y_train.map(label_to_index)
Y_train = Y_train.values

In [None]:
Y_train

In [None]:
info_train = train[['url', 'title', 'text', 'target']].copy()

In [None]:
X_train = train.drop(columns=['url', 'title', 'text', 'date', 'time', 'preprocessed', 'target', 'color']).values

In [None]:
X_train.shape

In [None]:
Counter(Y_train)

In [None]:
Y_train.shape

# Oversampling

To overcome the problem of the unbalanced dataset, we use SMOTE to create new synthetic elements so that every category ends up with the same number of elements.

In [None]:
smt = SMOTE(random_state=0, k_neighbors=2)

In [None]:
X_train_SMOTE, Y_train_SMOTE = smt.fit_resample(X_train, Y_train)

In [None]:
X_trainb = pd.DataFrame(columns=range(300))
Y_trainb = pd.DataFrame()

In [None]:
Counter(Y_train_SMOTE)

In [None]:
# Regroup the oversampled data so that the samples are ordered by class index.
# A single pd.concat is used because DataFrame.append was removed in pandas 2.0
# (and appending inside a loop re-copies the whole frame at every iteration).
# The class ids and the number of feature columns are taken from the data itself.
class_ids = np.unique(Y_train_SMOTE)
X_trainb = pd.concat(
    [pd.DataFrame(X_train_SMOTE[Y_train_SMOTE == i]) for i in class_ids]
)
Y_trainb = pd.concat(
    [pd.DataFrame(Y_train_SMOTE[Y_train_SMOTE == i]) for i in class_ids]
)

In [None]:
X_trainb = X_trainb.reset_index(drop=True)
Y_trainb = Y_trainb.reset_index(drop=True)

In [None]:
X_trainb = X_trainb.values
Y_trainb = Y_trainb.values

In [None]:
Y_trainb = Y_trainb[:, 0]
Y_trainb

In [None]:
Y_trainb = np.vectorize(index_to_label.get)(Y_trainb)

In [None]:
Y_trainb

In [None]:
len(Y_trainb)

# Clustering

In [None]:
def generate_clusters(n_clusters, Y_train, prediction):
    """Group the true labels by the cluster each sample was assigned to.

    Parameters
    ----------
    n_clusters : int
        Cluster ids 0 .. n_clusters-1 are looked up in ``prediction``.
    Y_train : numpy array
        True label of every sample, aligned with ``prediction``.
    prediction : numpy array
        Cluster id predicted for every sample.

    Returns
    -------
    list of numpy arrays
        One array of true labels per non-empty cluster, in increasing cluster-id
        order. Empty clusters are skipped, so the position in the list matches
        the cluster id only when no cluster is empty.
    """
    grouped = (Y_train[np.where(prediction == cluster_id)]
               for cluster_id in range(n_clusters))
    return [members for members in grouped if len(members) != 0]

In [None]:
def matrix_clusters(clusters, n_clusters, labels):
    """Build the label-by-cluster contingency table.

    Parameters
    ----------
    clusters : list of array-likes
        True labels of the samples in each cluster (as returned by
        ``generate_clusters``).
    n_clusters : int
        Number of cluster columns in the table; columns are numbered 1..n_clusters.
    labels : list
        Row index of the table (one row per true label).

    Returns
    -------
    pandas.DataFrame
        Integer counts; cell (label, j) is the number of samples with that true
        label in the j-th entry of ``clusters``. Missing combinations are 0.
    """
    # One column per requested cluster (instead of a fixed 1..13), pre-filled with
    # zeros so the table never goes through an object-dtype fillna.
    matrix = pd.DataFrame(0, index=labels, columns=list(range(1, n_clusters + 1)))
    # `clusters` can hold fewer than n_clusters entries when some clusters were
    # empty; slicing avoids the IndexError and leaves the extra columns at zero.
    for column, members in enumerate(clusters[:n_clusters], start=1):
        for label, count in Counter(members).items():
            matrix.loc[label, column] = count
    # A label missing from `labels` is added as a new row by .loc, which introduces
    # NaN in the other columns: fill them and restore the integer dtype.
    return matrix.fillna(0).astype(int)

In [None]:
def assign_labels(M_copy):
    """Return the (label, cluster) pair holding the largest count in the table.

    Parameters
    ----------
    M_copy : pandas.DataFrame
        Contingency table with true labels as rows and cluster ids as columns.

    Returns
    -------
    tuple (str, int)
        Row label (converted to ``str``) and column id (converted to ``int``) of
        the maximum cell. On ties the first column, and within it the first row,
        wins.

    Raises
    ------
    ValueError
        If the table has no columns.
    """
    best_value = None
    best_row = None
    best_column = None

    for c in M_copy.columns:
        row_index = M_copy[c].idxmax()
        value = M_copy.loc[row_index, c]
        # Strict comparison keeps the earliest column on ties.
        if best_value is None or value > best_value:
            best_value, best_row, best_column = value, row_index, c

    if best_value is None:
        raise ValueError("assign_labels() needs a table with at least one column")

    # The pair is returned directly rather than being joined with '_' and split
    # again, which returned a truncated label for any label containing '_'.
    return str(best_row), int(best_column)

In [None]:
def plot_clusters(clusters, n_clusters, filename):
    """Draw one bar chart per cluster with the number of samples of each true label.

    Parameters
    ----------
    clusters : list of array-likes
        True labels of the samples in each cluster.
    n_clusters : int
        Number of clusters to draw. The grid is fixed at 7x2, so at most 14
        clusters fit.
    filename : str
        Output path without extension; the figure is saved as ``filename + '.png'``.

    Notes
    -----
    Bar colours are read from the ``color`` column of the global ``dataset`` frame.
    """
    fig, axs = plt.subplots(7, 2)
    fig.set_figwidth(20)
    fig.set_figheight(20)

    # Slicing tolerates a `clusters` list shorter than n_clusters (empty clusters
    # dropped upstream) instead of raising IndexError half-way through the figure.
    for cl, members in enumerate(clusters[:n_clusters]):
        ax = axs[cl // 2, cl % 2]
        ax.set_title('Cluster {}'.format(cl+1), fontsize=20)

        for k, v in Counter(members).items():
            ax.bar(k, v, width=0.6, color=dataset[dataset['target']==k].color, align='center')

        # The categorical axis already labels each bar with its category; only the
        # tick styling is changed. set_xticklabels is avoided because it is meant
        # to be used only after the tick positions have been fixed.
        ax.tick_params(axis='x', labelsize=15, labelrotation=30)

    # The layout only needs to be computed once, after all axes are populated.
    fig.tight_layout(pad=1.0)

    fig.savefig(filename+'.png')

In [None]:
def plot_multiclusters(clusters, n_clusters, micro_to_macro, colors, c, filename):
    """Draw, for every cluster, one bar per macrocategory split by category colour.

    Parameters
    ----------
    clusters : list of array-likes
        True (micro) labels of the samples in each cluster.
    n_clusters : int
        Number of clusters to draw. The grid is fixed at 4x2, so at most 8
        clusters fit.
    micro_to_macro : dict
        Maps each category to its macrocategory.
    colors : list
        One colour per category, matched by iteration order of ``micro_to_macro``.
    c : int
        Stored in ``info_categories`` but never read inside this function.
    filename : str
        Output path without extension; the figure is saved as ``filename + '.png'``.
    """
    
    info_categories = {}
    macro_to_micro = {}
    i = 0
    
    fig, axs = plt.subplots(4, 2)
    fig.set_figwidth(20)
    fig.set_figheight(20)
    
    # category -> [colour, c, column index], in iteration order of micro_to_macro
    for k, v in micro_to_macro.items():
        info_categories[k] = [colors[i], c, i]
        i = i + 1

    # Invert the mapping: macrocategory -> list of its categories
    for k, v in micro_to_macro.items():
        if v not in macro_to_micro:
            macro_to_micro[v] = [k]
        else:
            macro_to_micro[v].append(k)
        
    for cl in range(n_clusters):
        micro_counts = Counter(clusters[cl])
        x = np.arange(len(micro_counts))  # not used below
        # Distinct macrocategories present in this cluster, in order of first appearance
        macro_keys = np.vectorize(micro_to_macro.get)(clusters[cl])
        macro_keys = list(dict(Counter(macro_keys)).keys())
        # One row per macrocategory, one column per category (13 is hard-coded)
        hist_elements = np.zeros((len(macro_keys), 13), dtype=np.int32)

        color_patches = []  # collected for a legend, but never attached to the axes
        # NOTE: `i` is reused here as the row index; the counter above is no longer needed
        for i, el in enumerate(macro_keys):
            for m in macro_to_micro[el]:
                hist_elements[i][info_categories[m][2]] = micro_counts[m]
                if micro_counts[m] > 0:
                    color_patches.append(Patch(color=info_categories[m][0], label=m))

            # Right-to-left running total: column j ends up holding the sum of the
            # counts in columns j..12, so heights never increase with the column index
            count = 0
            for j in reversed(range(13)):
                count += hist_elements[i][j]
                hist_elements[i][j] = count

        axs[int(cl/2), int(cl%2)].set_title('Cluster {}'.format(cl+1), fontsize=20)
        ind = np.arange(len(macro_keys))

        b_width = 0.8

        
        # Every category is drawn at the same x positions, tallest running total
        # first; presumably the later, shorter bars are painted on top, which
        # gives the stacked appearance -- TODO confirm on the rendered figure
        for k, v in info_categories.items():
            axs[int(cl/2), int(cl%2)].bar(ind, hist_elements[:, v[2]], b_width, color=v[0])

        # axs[int(cl/2), int(cl%2)].set_xticklabels(labels, fontsize=15, rotation=30)
        
        fig.tight_layout(pad=1.0)
        
    fig.savefig(filename+'.png')

# Kmeans

In [None]:
%%time

# Seed the centroid initialisation so that the clusters, and every table and
# plot derived from them, are reproducible across runs; the MiniBatchKMeans and
# SpectralClustering models in this notebook are already seeded with 0.
kmeans_13 = KMeans(n_clusters=13, random_state=0)
kmeans_13.fit(X_trainb)
pred_classes_kmeans_13 = kmeans_13.predict(X_trainb)

In [None]:
clusters_kmeans_13 = generate_clusters(13, Y_trainb, pred_classes_kmeans_13)

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

visualizer = SilhouetteVisualizer(kmeans_13, colors='yellowbrick')
visualizer.fit(X_trainb)
visualizer.show()

In [None]:
# my_metrics=['euclidean', 'manhattan', 'cityblock', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', 'wminkowski', 'nan_euclidean', 'haversine']
# for m in my_metrics:
#     print(m)
#     print(metrics.silhouette_score(X_trainb, kmeans_13.labels_, metric=m))

metrics.silhouette_score(X_trainb, kmeans_13.labels_, metric='euclidean')

In [None]:
M = matrix_clusters(clusters_kmeans_13, 13, list(index_to_label.values()))

In [None]:
M

In [None]:
# from sklearn.metrics.classification import accuracy_score
# accuracy_score(Y_trainb, np.vectorize(index_to_label.get)(kmeans_13.labels_))

In [None]:
# Greedy one-to-one matching between labels and clusters: repeatedly take the
# largest remaining cell of the table, record the pair, then remove its row and
# column so neither can be matched again.
M_copy = M.copy()
assigned = {}
clusters = range(1,14)

for i in clusters:
    row, column = assign_labels(M_copy)
    M_copy = M_copy.drop(columns=column).drop(index=row)
    assigned[row] = column
print(assigned)

In [None]:
# Per-label precision and recall against the cluster matched to that label,
# then support-weighted averages over all labels.
overall_precision = 0
overall_recall = 0
total_sum = 0
total_TP = 0
for label, cluster in assigned.items():
    print(label)
    TP = M.loc[label, cluster]
    FP = M[cluster].sum() - TP
    FN = M.loc[label].sum() - TP
    # An empty cluster column or label row would otherwise give a 0/0 ratio.
    precision = TP/(TP+FP) if (TP+FP) > 0 else 0.0
    recall = TP/(TP+FN) if (TP+FN) > 0 else 0.0
    print(precision)
    print(recall)
    # Number of samples carrying this label; named `support` so that the
    # builtin `sum` is not shadowed for the rest of the notebook.
    support = TP+FN
    total_sum += support
    total_TP += TP
    overall_precision += (precision*support)
    overall_recall += (recall*support)

overall_precision = overall_precision/total_sum
overall_recall = overall_recall/total_sum
f1 = ((2*overall_precision*overall_recall)/(overall_precision+overall_recall)
      if (overall_precision+overall_recall) > 0 else 0.0)
# Overall accuracy = samples that landed in the cluster matched to their label
# (sum of the per-label TP) over all samples. It used to be computed from the
# TP/FP/FN of the last label of the loop only.
n_samples = M.sum().sum()
accuracy = total_TP/n_samples
print(overall_precision)
print(overall_recall)
print(f1)
print(accuracy)

In [None]:
plot_clusters(clusters_kmeans_13, 13, 'kmeans_13clusters')

# AgglomerativeClustering

In [None]:
%%time
model = AgglomerativeClustering(n_clusters=13)
# model_fit = model.fit(X_trainb)
pred_classes_agglo_13 = model.fit_predict(X_trainb)

# pred_classes_agglo_13 = AgglomerativeClustering(n_clusters=13).fit_predict(X_trainb)

In [None]:
clusters_agglo_13 = generate_clusters(13, Y_trainb, pred_classes_agglo_13)

In [None]:
# visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
# visualizer.fit_predict(X_trainb)
# visualizer.show()

In [None]:
metrics.silhouette_score(X_trainb, model.labels_, metric='euclidean')

In [None]:
M = matrix_clusters(clusters_agglo_13, 13, list(index_to_label.values()))

In [None]:
M

In [None]:
# Greedy one-to-one matching between labels and clusters: repeatedly take the
# largest remaining cell of the table, record the pair, then remove its row and
# column so neither can be matched again.
M_copy = M.copy()
assigned = {}
clusters = range(1,14)

for i in clusters:
    row, column = assign_labels(M_copy)
    M_copy = M_copy.drop(columns=column).drop(index=row)
    assigned[row] = column
print(assigned)

In [None]:
# Per-label precision and recall against the cluster matched to that label,
# then support-weighted averages over all labels.
overall_precision = 0
overall_recall = 0
total_sum = 0
total_TP = 0
for label, cluster in assigned.items():
    print(label)
    TP = M.loc[label, cluster]
    FP = M[cluster].sum() - TP
    FN = M.loc[label].sum() - TP
    # An empty cluster column or label row would otherwise give a 0/0 ratio.
    precision = TP/(TP+FP) if (TP+FP) > 0 else 0.0
    recall = TP/(TP+FN) if (TP+FN) > 0 else 0.0
    print(precision)
    print(recall)
    # Number of samples carrying this label; named `support` so that the
    # builtin `sum` is not shadowed for the rest of the notebook.
    support = TP+FN
    total_sum += support
    total_TP += TP
    overall_precision += (precision*support)
    overall_recall += (recall*support)

overall_precision = overall_precision/total_sum
overall_recall = overall_recall/total_sum
f1 = ((2*overall_precision*overall_recall)/(overall_precision+overall_recall)
      if (overall_precision+overall_recall) > 0 else 0.0)
# Overall accuracy = samples that landed in the cluster matched to their label
# (sum of the per-label TP) over all samples. It used to be computed from the
# TP/FP/FN of the last label of the loop only.
n_samples = M.sum().sum()
accuracy = total_TP/n_samples
print(overall_precision)
print(overall_recall)
print(f1)
print(accuracy)

In [None]:
plot_clusters(clusters_agglo_13, 13, "agglomerative_clustering_13clusters")

# MiniBatchKmeans

In [None]:
%%time

minikmeans_13 = MiniBatchKMeans(n_clusters=13, random_state=0, batch_size=6)
pred_classes_minikmeans_13 = minikmeans_13.fit_predict(X_trainb)

In [None]:
clusters_minibatch_13 = generate_clusters(13, Y_trainb, pred_classes_minikmeans_13)

In [None]:
# visualizer = SilhouetteVisualizer(minikmeans_13, colors='yellowbrick')
# visualizer.fit_predict(X_trainb)
# visualizer.show()

In [None]:
metrics.silhouette_score(X_trainb, minikmeans_13.labels_, metric='euclidean')

In [None]:
M = matrix_clusters(clusters_minibatch_13, 13, list(index_to_label.values()))

In [None]:
M

In [None]:
# Greedy one-to-one matching between labels and clusters: repeatedly take the
# largest remaining cell of the table, record the pair, then remove its row and
# column so neither can be matched again.
M_copy = M.copy()
assigned = {}
clusters = range(1,14)

for i in clusters:
    row, column = assign_labels(M_copy)
    M_copy = M_copy.drop(columns=column).drop(index=row)
    assigned[row] = column
print(assigned)

In [None]:
# Per-label precision and recall against the cluster matched to that label,
# then support-weighted averages over all labels.
overall_precision = 0
overall_recall = 0
total_sum = 0
total_TP = 0
for label, cluster in assigned.items():
    print(label)
    TP = M.loc[label, cluster]
    FP = M[cluster].sum() - TP
    FN = M.loc[label].sum() - TP
    # An empty cluster column or label row would otherwise give a 0/0 ratio.
    precision = TP/(TP+FP) if (TP+FP) > 0 else 0.0
    recall = TP/(TP+FN) if (TP+FN) > 0 else 0.0
    print(precision)
    print(recall)
    # Number of samples carrying this label; named `support` so that the
    # builtin `sum` is not shadowed for the rest of the notebook.
    support = TP+FN
    total_sum += support
    total_TP += TP
    overall_precision += (precision*support)
    overall_recall += (recall*support)

overall_precision = overall_precision/total_sum
overall_recall = overall_recall/total_sum
f1 = ((2*overall_precision*overall_recall)/(overall_precision+overall_recall)
      if (overall_precision+overall_recall) > 0 else 0.0)
# Overall accuracy = samples that landed in the cluster matched to their label
# (sum of the per-label TP) over all samples. It used to be computed from the
# TP/FP/FN of the last label of the loop only.
n_samples = M.sum().sum()
accuracy = total_TP/n_samples
print(overall_precision)
print(overall_recall)
print(f1)
print(accuracy)

In [None]:
plot_clusters(clusters_minibatch_13, 13, "mini_batch_kmeans_13clusters")

# Spectral clustering

In [None]:
%%time

spcl_13 = SpectralClustering(n_clusters=13, assign_labels="discretize", random_state=0)
pred_classes_splc_13 = spcl_13.fit_predict(X_trainb)

In [None]:
clusters_spectral_13=generate_clusters(13, Y_trainb, pred_classes_splc_13)

In [None]:
metrics.silhouette_score(X_trainb, spcl_13.labels_, metric='euclidean')

In [None]:
M = matrix_clusters(clusters_spectral_13, 13, list(index_to_label.values()))

In [None]:
M

In [None]:
# Greedy one-to-one matching between labels and clusters: repeatedly take the
# largest remaining cell of the table, record the pair, then remove its row and
# column so neither can be matched again.
M_copy = M.copy()
assigned = {}
clusters = range(1,14)

for i in clusters:
    row, column = assign_labels(M_copy)
    M_copy = M_copy.drop(columns=column).drop(index=row)
    assigned[row] = column
print(assigned)

In [None]:
# Per-label precision and recall against the cluster matched to that label,
# then support-weighted averages over all labels.
overall_precision = 0
overall_recall = 0
total_sum = 0
total_TP = 0
for label, cluster in assigned.items():
    print(label)
    TP = M.loc[label, cluster]
    FP = M[cluster].sum() - TP
    FN = M.loc[label].sum() - TP
    # An empty cluster column or label row would otherwise give a 0/0 ratio.
    precision = TP/(TP+FP) if (TP+FP) > 0 else 0.0
    recall = TP/(TP+FN) if (TP+FN) > 0 else 0.0
    print(precision)
    print(recall)
    # Number of samples carrying this label; named `support` so that the
    # builtin `sum` is not shadowed for the rest of the notebook.
    support = TP+FN
    total_sum += support
    total_TP += TP
    overall_precision += (precision*support)
    overall_recall += (recall*support)

overall_precision = overall_precision/total_sum
overall_recall = overall_recall/total_sum
f1 = ((2*overall_precision*overall_recall)/(overall_precision+overall_recall)
      if (overall_precision+overall_recall) > 0 else 0.0)
# Overall accuracy = samples that landed in the cluster matched to their label
# (sum of the per-label TP) over all samples. It used to be computed from the
# TP/FP/FN of the last label of the loop only.
n_samples = M.sum().sum()
accuracy = total_TP/n_samples
print(overall_precision)
print(overall_recall)
print(f1)
print(accuracy)

In [None]:
plot_clusters(clusters_spectral_13, 13, "spectral_clustering_13clusters")

# Clustering with macrocategories

We group the categories into macrocategories. Each macrocategory contains categories which are semantically similar, for example "Fraud", "Scam" and "Money Laundering".

The following data structures are needed for histogram visualization.

1. The first structure associates each category with its own macrocategory, the color in which it should be highlighted in the histograms, the number of its elements in the training set, and an index indicating its position in the histogram.
2. The second structure associates each macrocategory with all of its subcategories.

In [None]:
micro_to_macro = {
             'rapina':'furto e\nrapina',
             'furto': 'furto e\nrapina',
             'spaccio': 'spaccio e\ndroga',
             'droga': 'spaccio e\ndroga',
             'truffa': 'truffa, frode\ne riciclaggio',
             'riciclaggio': 'truffa, frode\ne riciclaggio',
             'frode': 'truffa, frode\ne riciclaggio',
             'aggressione': 'maltrattamento,\naggressione e\nviolenza sessuale',
             'maltrattamento': 'maltrattamento,\naggressione e\nviolenza sessuale',
             'violenza sessuale': 'maltrattamento,\naggressione e\nviolenza sessuale',
             'sequestro': 'sequestro',
             'omicidio': 'omicidio',
             'evasione': 'evasione'
        }

In [None]:
colors = ["red", "gold", "green", "mediumblue", "orange", "darkturquoise", "lightgreen",
          "deeppink", "lightpink", "blueviolet", "purple", "gray", "darkgoldenrod"]

In [None]:
c = int(len(Y_train_SMOTE)/13)

In [None]:
def matrix_multiclusters(clusters, n_clusters):
    """Build the macrocategory-by-cluster contingency table.

    Parameters
    ----------
    clusters : list of array-likes
        Macrocategory id of every sample in each cluster. The row index uses the
        string ids '1'..'7', so the ids are expected as strings.
    n_clusters : int
        Number of cluster columns in the table; columns are numbered 1..n_clusters.

    Returns
    -------
    pandas.DataFrame
        Integer counts; cell (id, j) is the number of samples of that
        macrocategory in the j-th entry of ``clusters``. Missing combinations are 0.
    """
    macro_ids = ['1', '2', '3', '4', '5', '6', '7']
    # One column per requested cluster (instead of a fixed 1..7), pre-filled with
    # zeros so the table never goes through an object-dtype fillna.
    matrix = pd.DataFrame(0, index=macro_ids, columns=list(range(1, n_clusters + 1)))
    # `clusters` can hold fewer than n_clusters entries when some clusters were
    # empty; slicing avoids the IndexError and leaves the extra columns at zero.
    for column, members in enumerate(clusters[:n_clusters], start=1):
        for macro_id, count in Counter(members).items():
            matrix.loc[macro_id, column] = count
    # An id missing from the index is added as a new row by .loc, which introduces
    # NaN in the other columns: fill them and restore the integer dtype.
    return matrix.fillna(0).astype(int)

# Kmeans

In [None]:
%%time

# Seed the centroid initialisation so that the clusters, and every table and
# plot derived from them, are reproducible across runs; the MiniBatchKMeans and
# SpectralClustering models in this notebook are already seeded with 0.
kmeans_7clusters = KMeans(n_clusters=7, random_state=0).fit(X_trainb)
pred_classes_kmeans_7clusters = kmeans_7clusters.predict(X_trainb)

In [None]:
clusters_kmeans_7 = generate_clusters(7, Y_trainb, pred_classes_kmeans_7clusters)

In [None]:
plot_multiclusters(clusters_kmeans_7, 7, micro_to_macro, colors, c, "kmeans_7clusters")

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

visualizer = SilhouetteVisualizer(kmeans_7clusters, colors='yellowbrick')
visualizer.fit(X_trainb)
visualizer.show()

In [None]:
metrics.silhouette_score(X_trainb, kmeans_7clusters.labels_, metric='euclidean')

In [None]:
# Recode every crime label as the numeric id of its macrocategory.
# The substitutions are applied one after the other, in this order.
macro_ids = {
    'rapina': 1, 'furto': 1,
    'spaccio': 2, 'droga': 2,
    'truffa': 3, 'riciclaggio': 3, 'frode': 3,
    'aggressione': 4, 'maltrattamento': 4, 'violenza sessuale': 4,
    'sequestro': 5,
    'omicidio': 6,
    'evasione': 7,
}

new_clusters = []
for cluster in clusters_kmeans_7:
    new_cluster = cluster
    for crime, macro_id in macro_ids.items():
        # presumably the ids come out as strings ('1'..'7') because the other
        # branch of np.where is a string array -- TODO confirm
        new_cluster = np.where(new_cluster == crime, macro_id, new_cluster)

    new_clusters.append(new_cluster)

In [None]:
M = matrix_multiclusters(new_clusters, 7)
M

# AgglomerativeClustering

In [None]:
%%time

model = AgglomerativeClustering(n_clusters=7)
pred_classes_agglo7 = model.fit_predict(X_trainb)

In [None]:
clusters_agglo_7 = generate_clusters(7, Y_trainb, pred_classes_agglo7)

In [None]:
plot_multiclusters(clusters_agglo_7, 7, micro_to_macro, colors, c, "agglomerative_clustering_7clusters")

In [None]:
# Recode every crime label as the numeric id of its macrocategory.
# The substitutions are applied one after the other, in this order.
macro_ids = {
    'rapina': 1, 'furto': 1,
    'spaccio': 2, 'droga': 2,
    'truffa': 3, 'riciclaggio': 3, 'frode': 3,
    'aggressione': 4, 'maltrattamento': 4, 'violenza sessuale': 4,
    'sequestro': 5,
    'omicidio': 6,
    'evasione': 7,
}

new_clusters = []
for cluster in clusters_agglo_7:
    new_cluster = cluster
    for crime, macro_id in macro_ids.items():
        # presumably the ids come out as strings ('1'..'7') because the other
        # branch of np.where is a string array -- TODO confirm
        new_cluster = np.where(new_cluster == crime, macro_id, new_cluster)

    new_clusters.append(new_cluster)

In [None]:
M = matrix_multiclusters(new_clusters, 7)
M

In [None]:
metrics.silhouette_score(X_trainb, model.labels_, metric='euclidean')

# MiniBatchKmeans

In [None]:
%%time

minikmeans_7clusters = MiniBatchKMeans(n_clusters=7, random_state=0, batch_size=6)
pred_classes_7clusters = minikmeans_7clusters.fit_predict(X_trainb)

In [None]:
clusters_minibatch_7 = generate_clusters(7, Y_trainb, pred_classes_7clusters)

In [None]:
plot_multiclusters(clusters_minibatch_7, 7, micro_to_macro, colors, c, "mini_batch_kmeans_7clusters")

In [None]:
metrics.silhouette_score(X_trainb, minikmeans_7clusters.labels_, metric='euclidean')

In [None]:
# Recode every crime label as the numeric id of its macrocategory.
# The substitutions are applied one after the other, in this order.
macro_ids = {
    'rapina': 1, 'furto': 1,
    'spaccio': 2, 'droga': 2,
    'truffa': 3, 'riciclaggio': 3, 'frode': 3,
    'aggressione': 4, 'maltrattamento': 4, 'violenza sessuale': 4,
    'sequestro': 5,
    'omicidio': 6,
    'evasione': 7,
}

new_clusters = []
for cluster in clusters_minibatch_7:
    new_cluster = cluster
    for crime, macro_id in macro_ids.items():
        # presumably the ids come out as strings ('1'..'7') because the other
        # branch of np.where is a string array -- TODO confirm
        new_cluster = np.where(new_cluster == crime, macro_id, new_cluster)

    new_clusters.append(new_cluster)

In [None]:
M = matrix_multiclusters(new_clusters, 7)
M

# Spectral clustering

In [None]:
%%time

spcl_7clusters = SpectralClustering(n_clusters=7, assign_labels="discretize", random_state=0)
pred_classes_spcl_7clusters = spcl_7clusters.fit_predict(X_trainb)

In [None]:
clusters_splc_7= generate_clusters(7, Y_trainb, pred_classes_spcl_7clusters)

In [None]:
plot_multiclusters(clusters_splc_7, 7, micro_to_macro, colors, c, "spectral_clustering_7clusters")

In [None]:
# Recode every crime label as the numeric id of its macrocategory.
# The substitutions are applied one after the other, in this order.
macro_ids = {
    'rapina': 1, 'furto': 1,
    'spaccio': 2, 'droga': 2,
    'truffa': 3, 'riciclaggio': 3, 'frode': 3,
    'aggressione': 4, 'maltrattamento': 4, 'violenza sessuale': 4,
    'sequestro': 5,
    'omicidio': 6,
    'evasione': 7,
}

new_clusters = []
for cluster in clusters_splc_7:
    new_cluster = cluster
    for crime, macro_id in macro_ids.items():
        # presumably the ids come out as strings ('1'..'7') because the other
        # branch of np.where is a string array -- TODO confirm
        new_cluster = np.where(new_cluster == crime, macro_id, new_cluster)

    new_clusters.append(new_cluster)

In [None]:
M = matrix_multiclusters(new_clusters, 7)
M