## Import bibliotek

In [None]:
import os
import json
import string
import morfeusz2
import numpy as np
import pandas as pd
import operator as op
import itertools as it
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import PlaintextCorpusReader
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances, cosine_similarity


### Korpus dokumentów

In [None]:
# Directory that holds the source texts of the corpus.
corpus_dir = "./Literatura - original"
# The dot before "txt" is escaped so it matches a literal "." and not any character
# (the unescaped pattern would also accept names such as "notes_txt").
corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt')
# File ids found by the reader; they are used as document keys in the following cells.
files_names = corpus.fileids()
files_names

## Wstępne przygotowanie dokumentów

In [None]:
# Map every corpus file id to the raw text returned by the corpus reader.
documents = {file_id: corpus.raw(file_id) for file_id in files_names}
# Pretty-print the mapping; ensure_ascii=False keeps non-ASCII characters unescaped.
print(json.dumps(documents, indent=4, ensure_ascii=False))

In [None]:
# Read the stop-word file line by line; the context manager closes the file
# even when reading raises.
with open("./stopwords_pl.txt", "r", encoding="UTF-8") as stoplist_file:
    stoplist = stoplist_file.read().splitlines()
# Skip the first four lines of the file (presumably a header - confirm against the file).
stoplist = stoplist[4:]
stoplist

In [None]:
def lemmatize(text):
    """Replace every word of *text* with a single lemma chosen from Morfeusz.

    The analyses returned by ``morf.analyse`` are grouped by their first two
    fields (presumably the start and end position of a segment - confirm
    against the Morfeusz documentation). For each group the interpretation
    with the shortest description string is chosen; ties are broken by the
    lexicographically smallest lemma because ``(length, lemma)`` tuples are
    compared.

    Args:
        text: The text to lemmatize.

    Returns:
        The chosen lemmas joined with single spaces. Lemmas that are not
        purely alphabetic are dropped.
    """
    morf = morfeusz2.Morfeusz()
    segments = it.groupby(morf.analyse(text), op.itemgetter(0, 1))

    def disambiguate(group):
        # Rank the competing interpretations by the length of their description.
        pairs = ((len(descr), lemma)
                 for _, _, (_, lemma, descr, _, _) in group)
        _, lemma = min(pairs)
        # Keep only the part before the first ":"; returning the whole list
        # from split() made the str.isalpha filter below raise TypeError.
        return lemma.split(":")[0]

    lemmas = (disambiguate(group) for _, group in segments)
    return " ".join(filter(str.isalpha, lemmas))

In [None]:
# Translation table that deletes every character of string.punctuation in one pass.
_punctuation_table = str.maketrans("", "", string.punctuation)
# A set gives constant-time membership tests; scanning the stop list for every
# token is needlessly quadratic. The list itself is left untouched.
_stopwords = set(stoplist)

for key, text in documents.items():
    # Lower-case, strip punctuation, lemmatize, then drop stop words.
    text = text.lower().translate(_punctuation_table)
    text = lemmatize(text)
    documents[key] = " ".join(
        word for word in word_tokenize(text) if word not in _stopwords
    )

print(json.dumps(documents, indent=4, ensure_ascii=False))

In [None]:
# Sanity check of the analyser: display the raw analysis of a sample sentence
# (the last expression of the cell is shown as its output).
morf = morfeusz2.Morfeusz()
morf.analyse("Ala ma kota")

### Utworzenie macierzy częstości

In [None]:
# Turn the {file name: preprocessed text} mapping into a table indexed by
# file name, with the text in a single column named "content".
docs = pd.DataFrame.from_dict(documents, orient="index")
docs.columns = ['content']
docs

In [None]:
# Term-frequency representation: learn the vocabulary from the preprocessed
# texts and build the document-term count matrix in one step.
count_vectorizer = CountVectorizer()
counts_tf = count_vectorizer.fit_transform(docs['content'])
# Dense copy for inspection only; counts_tf itself is what later cells use.
counts_tf.toarray()


In [None]:
# TF-IDF representation of the same texts, built with the vectorizer defaults.
tfidf_vectorizer = TfidfVectorizer()
counts_tfidf = tfidf_vectorizer.fit_transform(docs['content'])
# Dense copy for inspection only; counts_tfidf itself is what later cells use.
counts_tfidf.toarray()

#### Katalogi na wyniki

In [None]:
# Create the output directories used by the plotting cells. exist_ok=True makes
# the call idempotent and avoids the check-then-create race of
# os.path.exists() followed by os.mkdir().
for _output_dir in ("./wordclouds", "./topics", "./clusters", "./ngrams"):
    os.makedirs(_output_dir, exist_ok=True)

### Chmury tagów

In [None]:
# One generator object, reused for every document in the next cell.
# NOTE(review): the contour_* options presumably only take effect when a mask
# is supplied - confirm against the wordcloud documentation.
wordcloud = WordCloud(
    background_color= "white",
    max_words = 5000,
    contour_width=3,
    contour_color="steelblue"
)

In [None]:
# Render one word cloud per document and save it as ./wordclouds/<name>.png.
for index, row in docs.iterrows():
    wordcloud.generate(row['content'])
    # Start from a fresh figure so images from earlier documents are not
    # stacked on the same axes.
    plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(index.replace(".txt", ""))
    plt.savefig("./wordclouds/{}".format(index.replace(".txt", ".png")))
    # Release the figure once it is on disk, as the other plotting cells do.
    plt.close()

### Analiza tematyk

In [None]:
def plot_top_words(model, feature_names, n_top_words, title, subplots):
    """Plot the highest-weighted words of every topic and save the figure.

    One horizontal bar chart is drawn per row of ``model.components_``. The
    figure is written to ``./topics/<title with spaces replaced by "_">.png``
    and then closed.

    Args:
        model: Fitted topic model exposing ``components_``.
        feature_names: Vocabulary terms, indexable by the column indices of
            ``model.components_``.
        n_top_words: Number of words shown per topic.
        title: Figure title; also used to build the output file name.
        subplots: ``(rows, columns)`` of the subplot grid. It must provide at
            least as many cells as there are topics.

    Raises:
        IndexError: If the grid has fewer cells than the model has topics.
    """
    colors = [
        'forestgreen', 'lightskyblue', 'hotpink', 'turquoise', 'steelblue',
        'crimson', 'seagreen', 'orange', 'purple', 'brown',
    ]
    # Fancy indexing below needs an array, not a plain list.
    feature_names = np.asarray(feature_names)
    # squeeze=False always yields a 2-D array of axes, even for a 1x1 grid.
    fig, axes = plt.subplots(
        *subplots, figsize=(30, 15), sharex=True, squeeze=False
    )
    axes = axes.flatten()
    fig.suptitle(title, fontsize=40)
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the last n_top_words indices are the largest.
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        # Cycle through the palette when there are more topics than colours.
        ax.barh(top_features, weights, height=0.7,
                color=colors[topic_idx % len(colors)])
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.savefig("./topics/{}.png".format(title.replace(" ", "_")))
    plt.close(fig)

In [None]:
def plot_documents(model, count, n_components, title):
    """Plot the topic composition of every document as stacked horizontal bars.

    The document-topic matrix is obtained from ``model.transform(count)``.
    Rows are labelled with the module-level ``files_names`` (without the
    ``.txt`` suffix). The figure is saved to ``./topics/<title>_docs.png``
    and then closed.

    Args:
        model: Fitted topic model exposing ``transform``.
        count: Document-term matrix passed to ``model.transform``.
        n_components: Number of topics, i.e. columns of the transformed matrix.
        title: Figure title; also used to build the output file name.
    """
    colors = [
        'forestgreen', 'lightskyblue', 'hotpink', 'turquoise', 'steelblue',
        'crimson', 'seagreen', 'orange', 'purple', 'brown',
    ]
    docs_topics = pd.DataFrame(
        model.transform(count),
        columns=[f'Topic {i+1}' for i in range(n_components)],
    )
    docs_topics.index = [file_name.replace(".txt", "") for file_name in files_names]
    plt.figure(figsize=(7, 4))
    # Running left edge of the next bar segment for every document.
    left = np.zeros(len(docs_topics))
    for i, col in enumerate(docs_topics.columns):
        plt.barh(docs_topics.index, docs_topics[col], left=left,
                 color=colors[i % len(colors)], label=col)
        left = left + docs_topics[col].to_numpy()
    # The bars carry labels, so show them; the legend sits outside the axes.
    plt.legend(bbox_to_anchor=(1.02, 1), loc="upper left")
    plt.title(title)
    plt.savefig("./topics/{}_docs.png".format(title), bbox_inches='tight')
    plt.close()


In [None]:
# Shared settings for the topic-modelling cells below.
n_topics = 6
n_top_words = 20
# Vocabulary of the count vectorizer. The name must be feature_names because
# that is what the model cells pass to plot_top_words.
feature_names = count_vectorizer.get_feature_names_out()
# Subplot grid for plot_top_words: 2 x 3 cells, one per topic.
subplots = (2, 3)

In [None]:
# Latent Dirichlet Allocation on the raw term counts.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=100,
    # The keyword is learning_method; "learn_method" is rejected by the constructor.
    learning_method='online',
    learning_offset=50,
    random_state=0
)
lda.fit(counts_tf)
plot_top_words(lda, feature_names, n_top_words, "Tematy w modelu LDA", subplots)
plot_documents(lda, counts_tf, n_topics, "Tematy w modelu LDA")


In [None]:
# NMF with the default (Frobenius norm) objective, fitted on the TF-IDF matrix.
nmf_fn = NMF(
    n_components=n_topics,
    random_state=1,
    alpha_H=.00005,
    alpha_W=.00005,
    l1_ratio=.5
)
nmf_fn.fit(counts_tfidf)
# Take the vocabulary from the vectorizer that produced the matrix the model
# was fitted on.
plot_top_words(nmf_fn, tfidf_vectorizer.get_feature_names_out(), n_top_words, "Tematy w modelu NMF Norma macierzowa", subplots)
# Plot this model's document-topic weights on its own input
# (the original passed the LDA model and the count matrix).
plot_documents(nmf_fn, counts_tfidf, n_topics, "Tematy w modelu NMF Norma Macierzowa")


In [None]:
# NMF with the Kullback-Leibler objective, fitted on the TF-IDF matrix.
nmf_kl = NMF(
    n_components=n_topics,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha_H=.00005,
    alpha_W=.00005,
    l1_ratio=.5
)
# Fit and plot the model defined above; the original re-fitted and re-plotted
# nmf_fn, so nmf_kl was never used.
nmf_kl.fit(counts_tfidf)
plot_top_words(nmf_kl, tfidf_vectorizer.get_feature_names_out(), n_top_words, "Tematy w modelu NMF Kullback-Leibler", subplots)
plot_documents(nmf_kl, counts_tfidf, n_topics, "Tematy w modelu NMF Kullback-Leibler")

### Analiza skupień 

In [None]:
def plot_dendogram(model, title, **kwargs):
    """Draw and save the dendrogram of a fitted agglomerative clustering model.

    A linkage matrix is assembled from ``model.children_``,
    ``model.distances_`` and the number of samples under each merge, then
    passed to ``dendrogram``. The figure is saved to
    ``./clusters/<title with spaces replaced by "_">.png`` and closed.

    Args:
        model: Fitted model exposing ``children_``, ``distances_`` and
            ``labels_``.
        title: Plot title; also used to build the output file name.
        **kwargs: Forwarded unchanged to ``dendrogram``.
    """
    # counts[i] is the number of original samples merged at step i.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                # Indices below n_samples refer to original samples (leaves).
                current_count += 1
            else:
                # Larger indices refer to the cluster formed at an earlier merge.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)
    dendrogram(linkage_matrix, **kwargs)
    plt.title(title)
    plt.savefig("./clusters/{}.png".format(title.replace(" ", "_")), bbox_inches="tight")
    plt.close()


In [None]:
# Pairwise document matrices, each forced to an (n_documents, n_documents) shape.
_square = (len(files_names), len(files_names))
# Cosine similarity between TF-IDF vectors.
cs = cosine_similarity(counts_tfidf, counts_tfidf).flatten().reshape(_square)
# Euclidean distance between raw count vectors.
ed = euclidean_distances(counts_tf, counts_tf).flatten().reshape(_square)

In [None]:
# Complete-linkage clustering on the precomputed Euclidean distance matrix.
clustering_ed_complete = AgglomerativeClustering(
    n_clusters=3,
    metric='precomputed',
    linkage='complete',
    # Needed so that distances_ is available to plot_dendogram.
    compute_distances=True
).fit(ed)
plot_dendogram(
    clustering_ed_complete,
    title="Dendogram metryka euklidesowa, metoda pełnego wiązania",
    # Strip the extension from each file name; the original called .replace
    # on the list itself.
    labels=[file_name.replace(".txt", "") for file_name in files_names],
    orientation='left',
)
clustering_ed_complete.labels_

In [None]:
# Ward linkage is only defined for Euclidean distances, so scikit-learn rejects
# it together with metric='precomputed' (and cs holds similarities, not
# distances). TfidfVectorizer L2-normalises every row by default, and for unit
# vectors the squared Euclidean distance equals 2 * (1 - cosine similarity), so
# Ward clustering on the TF-IDF vectors still groups documents by angle.
clustering_cs_ward = AgglomerativeClustering(
    n_clusters=3,
    metric='euclidean',
    linkage='ward',
    # Needed so that distances_ is available to plot_dendogram.
    compute_distances=True
).fit(counts_tfidf.toarray())  # the estimator needs a dense array
plot_dendogram(
    clustering_cs_ward,
    title="Dendogram metryka kątowa, metoda Warda",
    labels=[file_name.replace(".txt", "") for file_name in files_names],
    orientation='left',
)
# Show the labels of the model fitted in this cell.
clustering_cs_ward.labels_

### N-gramy

In [None]:
# Split every preprocessed document into a list of tokens.
documents_tokenized = {
    name: word_tokenize(text, language='polish')
    for name, text in documents.items()
}
print(json.dumps(documents_tokenized, indent=4, ensure_ascii=False))

In [None]:
# One figure per n-gram order (1 to 3), with one chart per document showing its
# five most frequent n-grams. The grid keeps five columns and grows by rows, so
# no document is dropped when the corpus has more than 20 of them; with 20
# documents the layout and figure size match the original 4 x 5 grid.
_n_cols = 5
_n_docs = len(documents_tokenized)
_n_rows = max(1, (_n_docs + _n_cols - 1) // _n_cols)

for n in range(1, 4):
    fig, axes = plt.subplots(
        _n_rows, _n_cols, figsize=(30, 3.75 * _n_rows), squeeze=False
    )
    axes = axes.flatten()
    for ax, (key, tokens) in zip(axes, documents_tokenized.items()):
        n_grams = pd.Series(list(ngrams(tokens, n)), dtype=object).value_counts()
        if n_grams.empty:
            # A document shorter than n tokens has nothing to plot.
            ax.axis('off')
            continue
        n_grams[:5].plot.barh(ax=ax, title="{}-gramy w {}".format(n, key.replace(".txt", "")))
    # Hide the unused cells of the grid (also safe for an empty corpus).
    for ax in axes[_n_docs:]:
        ax.axis('off')

    plt.tight_layout()
    plt.savefig("./ngrams/{}-grams.png".format(n), bbox_inches="tight")
    plt.close(fig)

In [None]:
# Concatenate all documents into one token stream and count its trigrams.
texts = " ".join(docs["content"]).split(" ")
n_grams = pd.Series(ngrams(texts, 3)).value_counts()
# Chart the 15 most frequent trigrams, most frequent at the top.
ax = n_grams.iloc[:15].plot.barh()
ax.invert_yaxis()
ax.bar_label(ax.containers[0], label_type="edge")
plt.show()