In [6]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

In [None]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/felixs/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
base_cc_frame = pd.read_csv("../raw_data/base_cc_frame.csv")
word2vec_frame = base_cc_frame.copy()

In [9]:
# replace NaN with empty string
word2vec_frame["chiefcomplaint"] = word2vec_frame["chiefcomplaint"].fillna("")

In [10]:
word2vec_frame["cc"] = word2vec_frame["chiefcomplaint"].apply(word_tokenize)

In [11]:
model = Word2Vec(
    word2vec_frame["chiefcomplaint"].str.split(),
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [12]:
def sentence_vector(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(vectors, axis=0)

In [13]:
word2vec_frame["cc_vector"] = word2vec_frame["cc"].apply(
    lambda x: sentence_vector(x, model)
)

In [14]:
vectors = np.array(word2vec_frame["cc_vector"].to_list())
word2vec_frame["cc_vector"] = [vector for vector in vectors]

In [9]:
word2vec_frame.to_csv("../processed_data/word2vec_frame.csv", index=False)

In [15]:
# cluster the vectors
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=1000, random_state=0).fit(vectors)
word2vec_frame["cluster"] = kmeans.labels_

In [16]:
word2vec_frame["cluster"].value_counts()

cluster
9      1981
52      572
171     543
523     536
896     500
       ... 
951       1
716       1
841       1
893       1
761       1
Name: count, Length: 1000, dtype: int64

In [17]:
word2vec_frame = word2vec_frame.drop(columns=["cc", "cc_vector"])

In [12]:
word2vec_frame.to_csv("../processed_data/word2vec_frame_with_cluster.csv", index=False)