In [None]:
import pickle
import os
import pandas as pd
import numpy as np
import time
import math 
import seaborn as sns

In [None]:
from collections import Counter
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Font properties built from a TTF file (path is relative to the working
# directory); passed as `fontproperties` to `plt.annotate` in the plotting cells.
# NOTE(review): presumably needed so the word labels render correctly - confirm.
font = fm.FontProperties(fname= '../../Google_review_code/Handwriting.ttf')

In [None]:
# Directory that is scanned for the `.pkl` input files.
# It can be overridden with the GOOGLE_REVIEWS_DIC_DIR environment variable so
# the notebook is not tied to one machine; the previous hard-coded location is
# kept as the default, so existing setups behave exactly as before.
path = os.environ.get("GOOGLE_REVIEWS_DIC_DIR",
                      "/Users/nadia/Desktop/google reviews/dic")

In [None]:
# define function to read pickle
def read_pickle(path):
    """Load and return the object stored in a ``.pkl`` file.

    Parameters
    ----------
    path : str, bytes or os.PathLike
        Location of the pickle file. Its name must end with ``.pkl``
        (case-sensitive).

    Returns
    -------
    object
        The object that was unpickled from the file.

    Raises
    ------
    ValueError
        If ``path`` does not end with ``.pkl``.
    """
    # Normalise to ``str`` so pathlib.Path objects work too; slicing a Path
    # (as the previous ``path[-4:]`` check did) raises TypeError.
    path_str = os.fsdecode(path)
    if not path_str.endswith('.pkl'):
        raise ValueError(
            "expected a path ending in '.pkl', got {!r}".format(path_str))
    # SECURITY: pickle.load can execute arbitrary code while loading; only use
    # this on files from a trusted source.
    with open(path_str, 'rb') as fd:
        return pickle.load(fd)

In [None]:
# gather the "ws" entries of one loaded result into a fresh list
def union_ws_result(ckip):
    """Return a new list containing every item of ``ckip["ws"]``, in order.

    The items themselves are not copied; only the outer list is new.
    """
    return list(ckip["ws"])

In [None]:
def clear(ckip):
    """Filter the words of every entry by length and part-of-speech tag.

    ``ckip["ws"]`` and ``ckip["pos"]`` are walked in parallel. Spaces are
    removed from each word, and a word/tag pair is kept only when the tag is
    in the allowed set and the space-stripped word is longer than one
    character.

    Returns the result of ``ckip.copy()`` with its ``"ws"`` and ``"pos"``
    entries replaced by the filtered lists; the lists held by the input are
    not modified.
    """
    allowed_tags = {
        "A",
        "Na", "Nb", "Nc", "Ncd", "Nv",
        "VA", "VAC", "VB", "VC", "VCL", "VD", "VF", "VG", "VH", "VHC",
        "VI", "VJ", "VK", "VL", "V_2",
    }

    cleaned = ckip.copy()
    filtered_ws = []
    filtered_pos = []
    for words, tags in zip(cleaned["ws"], cleaned["pos"]):
        # Strip spaces first: the length test applies to the stripped word.
        stripped = ((word.replace(" ", ""), tag) for word, tag in zip(words, tags))
        kept = [(word, tag) for word, tag in stripped
                if tag in allowed_tags and len(word) > 1]
        filtered_ws.append([word for word, _ in kept])
        filtered_pos.append([tag for _, tag in kept])

    cleaned["ws"] = filtered_ws
    cleaned["pos"] = filtered_pos
    return cleaned

In [None]:
data = []
# read all pickle files
# os.listdir returns names in arbitrary order, so sort them: the order of
# `data` then stays the same between runs and machines, which matters for
# anything order-sensitive downstream (e.g. tie-breaking in most_common).
for filename in sorted(os.listdir(path)):
    if filename.endswith(".pkl"):
        ckip = read_pickle(os.path.join(path, filename))
        ckip = clear(ckip)
        # collect this file's filtered "ws" entries as one list
        result = union_ws_result(ckip)
        data.append(result)

In [None]:
# Flatten the per-file lists in `data` into one list of their elements.
all_data = [entry for file_entries in data for entry in file_entries]

In [None]:
# Load a previously saved gensim Word2Vec model from the project's model
# directory (path is relative to the working directory).
model = Word2Vec.load(os.path.join("../../Google_review_code/model/","word2vec_model"))

In [None]:
# fit a 2D PCA model to the vectors
vectors = model.wv.vectors
# vocabulary words, in the same order as the rows of `vectors`
words = list(model.wv.key_to_index)
# random_state is fixed because, depending on the scikit-learn version and the
# data size, PCA may select a randomized SVD solver; pinning it keeps the 2-D
# coordinates (and every clustering built on them) reproducible.
pca = PCA(n_components=2, random_state=216)
PCA_result = pca.fit_transform(vectors)

In [None]:
# prepare a dataframe: one row per vocabulary word with its two PCA coordinates.
# The frame is built directly from `words` and `PCA_result`, which are left
# untouched (they were previously overwritten with DataFrames), so re-running
# this cell always starts from the same inputs and no merge suffixes are involved.
PCA_data_complet = pd.DataFrame({
    'word': words,
    'x_values': PCA_result[:, 0],
    'y_values': PCA_result[:, 1],
})
# PCA_data_complet.to_csv(os.path.join("../../Google_review_code/processing_files/",
#                                      'PCA_data_complet.csv'),index=False)

In [None]:
# term frequency over a collection of tokenised texts
def count_words(data):
    """Count how many times each word occurs across all texts in ``data``.

    ``data`` is an iterable of iterables of words; the result is a
    ``collections.Counter`` mapping each word to its total number of
    occurrences.
    """
    return Counter(word for text in data for word in text)

In [None]:
# Count how often each word occurs across `all_data`, then keep the 100 most
# common entries as a list of (word, count) pairs.
count_all = count_words(all_data)
words_all = count_all.most_common(100)

In [None]:
# Turn the (word, count) pairs into a two-column table. The columns are named
# at construction time instead of being copied from positional columns, which
# also yields a well-formed (empty) table when there are no pairs.
words_all = pd.DataFrame(words_all, columns=['word', 'count'])
# words_all.to_csv(os.path.join("../../Google_review_code/processing_files/",
#                               'top100_words.csv'), columns=['word','count'], index=False)

In [None]:
# restriction to 100 most frequent words
top100_csv = os.path.join("../../Google_review_code/processing_files/",
                          'top100_words.csv')
if os.path.exists(top100_csv):
    top100_words = pd.read_csv(top100_csv)
else:
    # The cell that writes this CSV has its to_csv call commented out, so on a
    # fresh checkout the file may not exist; fall back to the in-memory table
    # with the same 'word' / 'count' columns instead of failing.
    top100_words = words_all
# keep only the PCA rows whose word is among the top-100 words
PCA_data_top_100 = PCA_data_complet.merge(top100_words, how='inner', left_on='word',right_on='word')
# PCA_data_top_100.to_csv(os.path.join("../../Google_review_code/processing_files/",
#                                      'PCA_data_top_100.csv'), index=False)

In [None]:
# 2-D PCA coordinates of the top-100 words as a float64 array (columns: x, y).
cluster_data = (
    PCA_data_top_100[["x_values", "y_values"]]
    .to_numpy()
    .astype(np.float64)
)

# Peek at k rows picked at random; indices are drawn independently from
# NumPy's global RNG, so the same row can appear more than once.
k = 4
I = np.random.randint(0, len(cluster_data), k)

cluster_data[I]

In [None]:
# Cluster the top-100 word coordinates into three groups (seeded run).
kmeans = KMeans(n_clusters=3, random_state=216)
kmeans.fit(cluster_data)
# one cluster id per row of `cluster_data`
kmeans_labels = np.array(kmeans.labels_)
kmeans_labels

In [None]:
# Words of the top-100 table as a NumPy array, in the same row order as the
# table that `cluster_data` (and therefore `kmeans_labels`) was taken from.
PCA_words = PCA_data_top_100["word"].to_numpy()
PCA_words

In [None]:
# Pair every word with its cluster id: stack the two 1-D arrays as rows, then
# transpose so each row reads (word, label).
stacked_rows = np.vstack([PCA_words, kmeans_labels])
words_catagory = stacked_rows.transpose()
words_catagory

In [None]:
# Attach each word's K-Means cluster id. The integer labels are assigned
# directly: taking them from the mixed word/label array `words_catagory` stored
# them with object dtype, whereas this keeps a numeric column. Row order is the
# same because `kmeans_labels` follows the rows of `PCA_data_top_100`.
PCA_data_top_100["cluster"] = kmeans_labels
# PCA_data_top_100.to_csv(os.path.join("../../Google_review_code/processing_files/", 
#                                      "PCA_data_top_100_with_cluster.csv"), index=None)

In [None]:
# Cluster centres transposed so that row 0 holds the x coordinates and row 1
# the y coordinates.
centers = kmeans.cluster_centers_.T
txt = PCA_data_top_100["word"]

X = PCA_data_top_100["x_values"]
Y = PCA_data_top_100["y_values"]

plt.figure(dpi=150)
# words coloured by cluster, then the centres drawn on top as blue crosses
plt.scatter(X, Y, c=PCA_data_top_100["cluster"], cmap="summer")
plt.scatter(centers[0], centers[1], marker="X", s=50, c="b")
# label every point with its word, placing the text at the point itself
for label, x_pos, y_pos in zip(txt, X, Y):
    plt.annotate(label, xy=(x_pos, y_pos), xytext=(x_pos, y_pos), fontproperties=font)
plt.legend(["Group", "Center"])
#plt.title("Word2vec with PCA and Kmeans", size=17)

plt.show()

---


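Inertia:
Inertia measures how well a dataset was clustered by K-Means: it is the sum of squared distances from each sample to its closest cluster center, so lower values mean tighter clusters. Because it always decreases as the number of clusters grows, look for the "elbow" in the curve rather than the minimum.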

In [None]:
# Fit one seeded K-Means model per cluster count from 1 to 9 and collect the
# inertia of each fitted model.
kmeans_list = [
    KMeans(n_clusters=n_clusters, random_state=46).fit(cluster_data)
    for n_clusters in range(1, 10)
]
inertias = [fitted.inertia_ for fitted in kmeans_list]
inertias

In [None]:
# global plot configuration (applies to the figures created from here on)
plt.rcParams['figure.figsize'] = [8,8]
sns.set_style("whitegrid")
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

# Inertia for 1..10 clusters. random_state is fixed (same seed as the sweep in
# the cell above) so the curve is identical on every run; without it each run
# starts from different random centres and the plotted values change.
inertia_list = []
for num_clusters in range(1, 11):
    kmeans_model = KMeans(n_clusters=num_clusters, init="k-means++", random_state=46)
    kmeans_model.fit(cluster_data)
    inertia_list.append(kmeans_model.inertia_)
    
# plot the inertia curve
plt.figure(dpi=150)
plt.plot(range(1,11),inertia_list)
plt.scatter(range(1,11),inertia_list)
#plt.scatter(3, inertia_list[2], marker="X", s=300, c="r")
plt.xlabel("Number of Clusters", size=13)
plt.ylabel("Inertia Value", size=13)
#plt.title("Different Inertia Values for Different Number of Clusters", size=17)

# render the figure, as the other plotting cells do
plt.show()


---

Silhouette Coefficient:

In [None]:
# Mean silhouette score for 2..10 clusters on the top-100 word coordinates
# (the score needs at least two clusters, hence the range starts at 2).
# random_state is fixed so the scores, and the curve below, are reproducible.
silhouette_avg = []
for i in range(2,11):
    kmeans_fit = KMeans(n_clusters = i, random_state=46).fit(cluster_data)
    silhouette_avg.append(silhouette_score(cluster_data, kmeans_fit.labels_))

plt.figure(dpi=150)
plt.plot(range(2,11), silhouette_avg)
plt.xlabel("Number of Clusters", size=13)
plt.ylabel("Silhouette Score", size=13)
#plt.title("Different Silhouette Coefficient Score for Different Number of Clusters", size=17)

# render the figure, as the other plotting cells do
plt.show()

---

Repeat the clustering on all words in the vocabulary (instead of only the 100 most frequent words):

In [None]:
# Cluster every word of the saved PCA table, not only the top 100.
# NOTE(review): this CSV is only written by a to_csv call that is commented out
# earlier in the notebook - confirm the file on disk matches `PCA_data_complet`.
all_PCA_data = pd.read_csv(os.path.join("../../Google_review_code/processing_files/",
                           'PCA_data_complet.csv'))
# columns 1 and 2 of the table are used as the coordinates to cluster
all_cluster_data = all_PCA_data.to_numpy()[:, 1:3].astype(np.float64)
# random_state is fixed so the cluster assignment is the same on every run
KMEANS = KMeans(n_clusters=3, random_state=216).fit(all_cluster_data)
KMEANS_labels = np.array(KMEANS.labels_)
ALL_PCA_words = all_PCA_data["word"].to_numpy()
# (word, label) rows, turned into a two-column lookup table
WORDS_catagory = np.vstack((ALL_PCA_words, KMEANS_labels)).T
df = pd.DataFrame(WORDS_catagory, columns = ['word','catagory'])

# join the labels onto the in-memory PCA table by word
all_PCA_data_with_cluster= PCA_data_complet.merge(df, how='inner', left_on='word',right_on='word')
# all_PCA_data_with_cluster.to_excel(os.path.join("../../Google_review_code/processing_files/",
#                           "all_PCA_data_with_cluster.xlsx"), index=False)

In [None]:
# Show the merged table (word, PCA coordinates, cluster label) as the cell output.
all_PCA_data_with_cluster

In [None]:
# Centres of the all-data model `KMEANS`. The top-100 model `kmeans` was used
# here before, which drew centres belonging to a different clustering than the
# one that coloured the points. Transposed so row 0 is x and row 1 is y.
centers = np.vstack(KMEANS.cluster_centers_).T
txt = all_PCA_data_with_cluster["word"]

X=all_PCA_data_with_cluster["x_values"]
Y=all_PCA_data_with_cluster["y_values"]

plt.figure(dpi=150)
# words coloured by cluster, then the centres drawn on top as blue crosses
plt.scatter(X, Y, c=all_PCA_data_with_cluster["catagory"], cmap="summer")
plt.scatter(centers[0], centers[1], marker="X", s=50, c="b")
# label every point with its word, placing the text at the point itself
for i in range(len(X)):
    plt.annotate(txt[i], xy=(X[i], Y[i]), xytext=(X[i], Y[i]), fontproperties=font)
plt.legend(["Group", "Center"])
#plt.title("Word2vec with PCA and Kmeans", size=17)

plt.show()

In [None]:
# Mean silhouette score for 2..10 clusters on the coordinates of all words
# (the score needs at least two clusters, hence the range starts at 2).
# random_state is fixed so the scores, and the curve below, are reproducible.
silhouette_avg2 = []
for i in range(2,11):
    kmeans_fit = KMeans(n_clusters = i, random_state=46).fit(all_cluster_data)
    silhouette_avg2.append(silhouette_score(all_cluster_data, kmeans_fit.labels_))

plt.figure(dpi=150)
plt.plot(range(2,11), silhouette_avg2)
plt.xlabel("Number of Clusters", size=13)
plt.ylabel("Silhouette Score", size=13)
#plt.title("Different Silhouette Coefficient Score for Different Number of Clusters", size=15)

# render the figure, as the other plotting cells do
plt.show()