In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import matplotlib.cm as cm
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS

# Load the corpus. The code below reads its "title", "abstract" and
# "references" columns.
data = pd.read_json("dataset.json")

# Extend scikit-learn's English stop-word list with the year tokens and URL
# fragments listed here.
# The union is a frozenset; recent scikit-learn releases validate
# ``stop_words`` and accept only "english", a list or None, so convert it to a
# list (sorted only to make the value reproducible).
stop_words = sorted(
    ENGLISH_STOP_WORDS.union(
        ["2019", "2020", "http", "https", "www", "com", "net", "org"]
    )
)

# One independent TF-IDF model per text source.
# NOTE(review): ``max_df=5`` is an integer, i.e. an absolute document count:
# terms that occur in more than 5 documents are dropped. Confirm this is
# intended rather than a proportion such as 0.5.
t_v = TfidfVectorizer(stop_words=stop_words, max_df=5)
t = t_v.fit_transform(data["title"])

a_v = TfidfVectorizer(stop_words=stop_words, max_df=5)
a = a_v.fit_transform(data["abstract"])

# "references" holds one list per row; flatten it so that every individual
# reference becomes its own document (one row of ``r`` per reference).
r_v = TfidfVectorizer(stop_words=stop_words, max_df=5)
r = r_v.fit_transform(
    [item for sublist in data["references"] for item in sublist]
)

# Lookup tables keyed by cluster label (0, 1, 2).
# c_c: marker colour used for each cluster.
c_c = dict(enumerate(("#e7298a", "#1b9e77", "#7570b3")))
# c_n: legend caption for each cluster ("Küme" appears to be Turkish for
# "cluster").
c_n = dict(enumerate(("Küme 1", "Küme 2", "Küme 3")))

# For each text source: cluster its TF-IDF rows with k-means, project the
# documents to 2-D with MDS on cosine distance, and save/show a scatter plot
# coloured by cluster ("t.png", "a.png", "r.png").

# Flattened the same way as the input the reference vectorizer was fitted on,
# so entry i corresponds to row i of ``r``. Built once rather than on every
# loop iteration.
flat_references = [item for sublist in data["references"] for item in sublist]

# Explicit (name, feature matrix, texts) table. This replaces looking the
# matrix up through globals() and guessing the text column from the first
# column whose name starts with the same letter, which could silently pick an
# unrelated column. Texts are passed as plain lists so the plotting frame gets
# a fresh 0..n-1 index regardless of how ``data`` is indexed.
plots = (
    ("t", t, data["title"].tolist()),
    ("a", a, data["abstract"].tolist()),
    ("r", r, flat_references),
)

for name, matrix, texts in plots:
    # n_init=1 with no random_state: cluster assignments can differ per run.
    km = KMeans(n_clusters=3, init="k-means++", max_iter=500, n_init=1).fit(matrix)

    # MDS receives a precomputed dissimilarity matrix: 1 - cosine similarity.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    dist = 1 - cosine_similarity(matrix)
    pos = mds.fit_transform(dist)
    xs, ys = pos[:, 0], pos[:, 1]

    dat = pd.DataFrame(dict(x=xs, y=ys, label=km.labels_.tolist(), title=texts))

    fig, ax = plot.subplots(figsize=(20, 15))
    for k, group in dat.groupby("label"):
        # linestyle="" draws markers only; without it the default solid line
        # joins all points of a cluster in row order.
        ax.plot(
            group.x,
            group.y,
            marker="o",
            linestyle="",
            ms=15,
            label=c_n[k],
            color=c_c[k],
            mec="none",
        )
    ax.set_aspect("auto")
    ax.legend(numpoints=1, loc="upper left", fontsize="medium")
    # Hides ticks, tick labels and spines, so no separate tick_params calls
    # are needed.
    ax.axis("off")

    # Individual references are not labelled; titles and abstracts are,
    # truncated to 100 characters.
    if name != "r":
        for x, y, title in zip(dat["x"], dat["y"], dat["title"]):
            # Skip missing/non-text entries (e.g. None or NaN) instead of
            # failing on them.
            if isinstance(title, str):
                ax.text(x, y, title[:100], size=10)

    fig.savefig("{}.png".format(name), dpi=72, bbox_inches="tight")
    plot.show()