# 1. Data preprocessing and visualization

## 1.1. Importing libraries and dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Read the semicolon-delimited file 'crimes.csv' into a DataFrame and preview
# its first rows.
dataset = pd.read_csv('crimes.csv',sep =';')
dataset.head()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
dataset.shape

## 1.2. Statistical description

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
dataset.describe()

## 1.3. Boxplot

In [None]:
# Box plots of the DataFrame passed in wide form, drawn with the 'whitegrid'
# seaborn style.
sns.set(style = 'whitegrid')
sns.boxplot(data = dataset)
plt.show()

In [None]:
# Rows whose 'Viol' value exceeds 2300.
# NOTE(review): the threshold presumably isolates outliers seen in the boxplot — confirm.
dataset[dataset['Viol'] > 2300]

## 1.4. Scatterplots of the variables

In [None]:
# Grid of pairwise scatterplots between the variables of the DataFrame.
sns.pairplot(dataset)
plt.show()

## 1.5. Correlation

In [None]:
# Pearson correlation matrix of the numeric columns.
# The DataFrame also carries a label column (column 0). If that column is
# non-numeric, pandas >= 2.0 raises in `corr` instead of silently dropping it,
# so the numeric columns are selected explicitly. This reproduces the result
# older pandas versions returned.
correlations = dataset.select_dtypes(include='number').corr(method='pearson')
correlations

In [None]:
# Annotated heatmap of the correlation matrix on its own 5x5-inch figure.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(correlations, annot=True, ax=ax)  # draw on the axes just created
plt.show()

## 1.6. Independent variables and labels

In [None]:
# Column 0 provides the row labels; columns 1 through 7 form the feature matrix.
labels = dataset.iloc[:, 0].values
X = dataset.iloc[:, 1:8].values
print(labels)

## 1.7. Principal Component Analysis (PCA) and standardization

In [None]:
from IPython.display import Image
# Display the image file 'pca.png' inline in the notebook.
Image('pca.png')

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features, then project the standardised data onto all of
# its principal components.
scaler = StandardScaler().fit(X)
X_scaler = scaler.transform(X)
pca = PCA()  # n_components defaults to None, i.e. every component is kept
X_pca = pca.fit_transform(X_scaler)

In [None]:
# Share of the total variance explained by each principal component.
print(pca.explained_variance_ratio_)

In [None]:
# Variance share captured jointly by the first two principal components.
pca.explained_variance_ratio_[:2].sum()

In [None]:
# Shape of the projected data: (observations, principal components).
X_pca.shape

In [None]:
# Coordinates of the first observation on every principal component.
X_pca[0,:]

## 1.8. Visualization

In [None]:
# Scatter the observations on the first two principal components and tag each
# point with its label.
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plt.scatter(pc1, pc2)
for idx, label in enumerate(labels):
    plt.annotate(label, xy=(pc1[idx], pc2[idx]))
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.show()

# 2. KMeans Clustering

## 2.2. Make KMeans

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Within-cluster sum of squares (inertia) of KMeans fitted on X for 1 to 10 clusters.
wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=0).fit(X)
    wcss.append(kmeans.inertia_)

In [None]:
# Inertia values collected for k = 1..10.
wcss

In [None]:
# Elbow curve: inertia against the number of clusters (blue line with 'x' markers).
plt.plot(range(1,11),wcss, 'bx-')
plt.title('The Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
from kneed import KneeLocator

In [None]:
# Locate the elbow of the WCSS curve, treated as convex and decreasing.
# `k` is rebound here from the loop counter above to the KneeLocator object.
k = KneeLocator(range(1,11),wcss, curve = 'convex', direction = 'decreasing')
k.elbow

In [None]:
# Plot the curve together with the knee point found by KneeLocator.
k.plot_knee()
plt.show()

In [None]:
# KMeans with 4 clusters fitted on the unscaled feature matrix X
# (fixed seed for reproducibility).
kmeans = KMeans(n_clusters=4,init='k-means++',random_state=0)
kmeans.fit(X)

In [None]:
# Cluster index assigned to each observation by the fit on X.
y_kmeans = kmeans.labels_
y_kmeans

In [None]:
# Preview the first rows of the dataset again.
dataset.head()

## 2.3. Clusters visualization

In [None]:
# PCA projection coloured by the clusters obtained on the unscaled features X.
cluster_colors = ['red', 'blue', 'green', 'yellow']
for cluster_id, color in enumerate(cluster_colors):
    members = y_kmeans == cluster_id
    plt.scatter(X_pca[members, 0], X_pca[members, 1], c=color, label=f'Cluster {cluster_id}')
# Tag every point with its label.
for label, x, y in zip(labels, X_pca[:, 0], X_pca[:, 1]):
    plt.annotate(label, xy=(x, y))
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('Clustering with X')
plt.legend()
plt.show()

In [None]:
# Same KMeans configuration, fitted on the standardised features X_scaler.
kmeans2 = KMeans(n_clusters=4,init='k-means++',random_state=0)
kmeans2.fit(X_scaler)

In [None]:
# Cluster index assigned to each observation by the fit on X_scaler.
y_kmeans_sc = kmeans2.labels_
y_kmeans_sc

In [None]:
# PCA projection coloured by the clusters obtained on the standardised features.
cluster_colors = ['red', 'blue', 'green', 'yellow']
for cluster_id, color in enumerate(cluster_colors):
    members = y_kmeans_sc == cluster_id
    plt.scatter(X_pca[members, 0], X_pca[members, 1], c=color, label=f'Cluster {cluster_id}')
# Tag every point with its label.
for label, x, y in zip(labels, X_pca[:, 0], X_pca[:, 1]):
    plt.annotate(label, xy=(x, y))
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('Clustering with X_scaler')
plt.legend()
plt.show()

In [None]:
# Same KMeans configuration, fitted on the PCA-transformed data X_pca.
kmeans3 = KMeans(n_clusters=4,init='k-means++',random_state=0)
kmeans3.fit(X_pca)

In [None]:
# Cluster index assigned to each observation by the fit on X_pca.
y_kmeans_pca = kmeans3.labels_
y_kmeans_pca

In [None]:
# PCA projection coloured by the clusters obtained on the PCA-transformed data.
cluster_colors = ['red', 'blue', 'green', 'yellow']
for cluster_id, color in enumerate(cluster_colors):
    members = y_kmeans_pca == cluster_id
    plt.scatter(X_pca[members, 0], X_pca[members, 1], c=color, label=f'Cluster {cluster_id}')
# Tag every point with its label.
for label, x, y in zip(labels, X_pca[:, 0], X_pca[:, 1]):
    plt.annotate(label, xy=(x, y))
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title('Clustering with X_pca')
plt.legend()
plt.show()

In [None]:
# Number of observations whose cluster index differs between the fit on
# X_scaler and the fit on X_pca (0 means the two labelings are identical).
# A sum of signed differences can cancel out (+1 and -1 add up to 0) and
# report agreement where there is none; counting mismatches cannot.
# NOTE: KMeans cluster numbering is arbitrary, so a non-zero count can still
# correspond to the same partition with permuted indices.
np.count_nonzero(y_kmeans_sc != y_kmeans_pca)

## 2.4. Profiling

In [None]:
# Shape of the centroid array of the model fitted on X_scaler.
kmeans2.cluster_centers_.shape

In [None]:
# Centroid coordinates, expressed in standardised units (the model was fitted on X_scaler).
kmeans2.cluster_centers_

In [None]:
# Column names of the dataset.
dataset.columns

In [None]:
# Names used for the 7 feature columns in the tables and tree below.
# NOTE(review): hard-coded — verify they match dataset.columns[1:8] in order and spelling.
col_names = ['Meutre', 'Rapt', 'Vol', 'Attaque', 'Viol', 'Larcin','Auto_Theft']

In [None]:
# Centroids (still in standardised units) as a labelled table.
cluster_centers = pd.DataFrame(data = kmeans2.cluster_centers_, columns = col_names)
cluster_centers

In [None]:
# Undo the standardisation so the centroids are expressed on the original feature scale.
cluster_centers = scaler.inverse_transform(cluster_centers)

In [None]:
# Wrap the rescaled centroids in a DataFrame with the feature names as columns.
cluster_centers = pd.DataFrame(data = cluster_centers, columns = col_names)
cluster_centers

In [None]:
# Summary statistics of the dataset, shown next to the centroid table for reference.
dataset.describe()

## 2.5. Rules extraction

In [None]:
# Append the cluster index from the X_scaler fit as a 'Cluster' column.
# concat with axis=1 aligns on the index, so this presumably relies on
# `dataset` having the default 0..n-1 index — verify.
dataset_cluster = pd.concat([dataset, pd.DataFrame({'Cluster' : y_kmeans_sc})], axis=1)

In [None]:
# Preview the dataset with its new 'Cluster' column.
dataset_cluster.head()

In [None]:
# Inputs for the decision tree: columns 1..7 as features, column 8 as target.
# NOTE(review): column 8 is the appended 'Cluster' column only if `dataset`
# has exactly 8 columns — confirm.
Xc = dataset_cluster.iloc[:,1:8].values
yc = dataset_cluster.iloc[:,8].values

In [None]:
import graphviz
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [None]:
# Fit a decision tree of depth at most 5 that predicts the cluster index from the features.
model = DecisionTreeClassifier(max_depth = 5)
model.fit(Xc,yc)

In [None]:
# Write the fitted tree to 'tree.dot' in Graphviz DOT format.
tree.export_graphviz(model, feature_names=col_names, out_file='tree.dot', label='all',
                     filled=True, rounded=True)
# The notebook displays 'tree.png', which nothing else in this file generates:
# render the DOT file to PNG here so the image always matches the fitted model.
# This requires the Graphviz `dot` executable to be installed.
# render() writes a temporary source file named 'tree', which cleanup=True removes.
with open('tree.dot', encoding='utf-8') as dot_file:
    graphviz.Source(dot_file.read()).render(filename='tree', format='png', cleanup=True)

In [None]:
# Display the image file 'tree.png' inline in the notebook.
Image('tree.png')

# 3. Hierarchical Clustering

## 3.1. Hierarchical clustering (HC) intuition

In [None]:
import scipy.cluster.hierarchy as sch
# Ward linkage computed on the standardised features, drawn as a dendrogram.
ward_linkage = sch.linkage(X_scaler, method='ward')
dendrogram = sch.dendrogram(ward_linkage)
plt.title('Dendrogram')
plt.xlabel('Clusters')
plt.ylabel('Euclidean distance')
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Agglomerative clustering into 2 clusters with Ward linkage on the
# standardised features. Ward linkage only accepts Euclidean distances, which
# is also the default, so no distance argument is passed. The former
# `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed `metric`); omitting it behaves identically on every version.
hc = AgglomerativeClustering(n_clusters=2, linkage='ward')
hc.fit(X_scaler)

In [None]:
# Cluster index assigned to each observation by the hierarchical clustering.
y_hc = hc.labels_
y_hc

In [None]:
# PCA projection coloured by the two hierarchical clusters.
for cluster_id, color in enumerate(['red', 'blue']):
    members = y_hc == cluster_id
    plt.scatter(X_pca[members, 0], X_pca[members, 1], c=color, label=f'Cluster {cluster_id}')
# Tag every point with its label.
for label, x, y in zip(labels, X_pca[:, 0], X_pca[:, 1]):
    plt.annotate(label, xy=(x, y))
plt.xlabel('PCA1')
plt.ylabel('PCA2')
# `hc` was fitted on X_scaler (X_pca is only used as plotting coordinates),
# so the title names the data actually clustered.
plt.title('Hierarchical clustering with X_scaler')
plt.legend()
plt.show()

# 4. Silhouette score

Reference (French Wikipedia): https://fr.wikipedia.org/wiki/Silhouette_(clustering)

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Average silhouette score of KMeans on the standardised features for every
# (number of clusters, seed) pair. Each fit contributes one
# [n_clusters, seed, score] row to `silhouette`.
cluster_range = range(2, 11)
random_range = range(0, 10)

silhouette = []
for i in cluster_range:
    for j in random_range:
        cluster = KMeans(n_clusters=i, random_state=j).fit(X_scaler)
        cluster_labels = cluster.labels_
        silhouette_avg = silhouette_score(X_scaler, cluster_labels)
        print('For n_clusters = ', i, 'and seed = ',j, 'silhouette avg = ', silhouette_avg)
        silhouette.append([i, j, silhouette_avg])

In [None]:
# Turn the collected rows into a table; `silhouette` is rebound from a list to a DataFrame.
silhouette = pd.DataFrame(silhouette, columns=['n_clusters','seed','silhouette_score'])
silhouette

In [None]:
# Reshape to one row per number of clusters and one column per seed.
# Each (n_clusters, seed) pair occurs once, so the default mean aggregation
# leaves the scores unchanged.
pivot_silhouette = pd.pivot_table(silhouette,index='n_clusters',columns ='seed', values = 'silhouette_score')
pivot_silhouette

In [None]:
# Heatmap of the silhouette scores (3 decimals) using the reversed 'rocket' colormap.
plt.figure(figsize=(10,6))
sns.heatmap(pivot_silhouette,annot=True, fmt = '.3f', cmap = sns.cm.rocket_r)
plt.show()