In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import itertools
from pathlib import Path
from sklearn.decomposition import NMF
import openTSNE
import pickle

In [None]:
# Folder (relative to the working directory) that the corpus files are read
# from and the pickled results are written to.
DATA_FOLDER = Path("data")

Loading all corpus files:

In [None]:
# Sort the matches: globbing makes no ordering guarantee, and the row order of
# `keys` (and of every matrix computed from `corpus`) follows the file order.
corpus_files = sorted(DATA_FOLDER.glob("*_ids_corpus_resolution_4_6*.txt"))

# Each non-blank line is expected to look like "<key>\t<document>"; only the
# first two tab-separated fields are used. Both columns are materialised as
# lists so that every file is closed as soon as it has been read and so that
# the cells consuming `corpus` can be re-run (a generator would be exhausted
# after the first pass). This costs no extra memory compared with teeing a
# generator and draining one branch first, which buffers every line anyway.
keys = []
corpus = []
for corpus_file in corpus_files:
    with open(corpus_file) as handle:
        for line_number, line in enumerate(handle, start=1):
            fields = line.strip().split("\t")
            if fields == [""]:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            if len(fields) < 2:
                # Fail here, with context, instead of with a bare IndexError
                # later on while the vectorizer is consuming the corpus.
                raise ValueError(
                    f"{corpus_file}:{line_number}: expected '<key>\\t<document>', got {line!r}"
                )
            keys.append(fields[0])
            corpus.append(fields[1])

Calculating the TFIDF matrix:

In [None]:
# min_df=2: terms that occur in fewer than two documents are left out of the
# vocabulary.
vectorizer = TfidfVectorizer(min_df=2)

# Progress message first, since vectorising the whole corpus may take a while.
print(f"Getting TFIDF matrix for {len(keys)} proteins...")
tfidf_matrix = vectorizer.fit_transform(corpus)

Fitting NMF model:

In [None]:
num_topics = 250

# Regularisation strength, expressed in the units of NMF's legacy `alpha`
# keyword (the same constant multiplies the penalty on both W and H).
nmf_alpha = 0.1

nmf_kwargs = dict(
    n_components=num_topics,
    random_state=42,
    solver="cd",
    tol=0.0005,
    max_iter=500,
    l1_ratio=0.5,
    verbose=1,
)

# scikit-learn deprecated `alpha` in 1.0 and removed it in 1.2 in favour of
# `alpha_W` / `alpha_H`. The new penalties are additionally multiplied by
# n_features (for W) and n_samples (for H), so the legacy value is divided by
# those factors to keep optimising the same objective. Older releases that
# only know `alpha` keep receiving it unchanged.
n_samples, n_features = tfidf_matrix.shape
if "alpha_W" in NMF().get_params():
    nmf_kwargs.update(
        alpha_W=nmf_alpha / n_features,
        alpha_H=nmf_alpha / n_samples,
    )
else:
    nmf_kwargs["alpha"] = nmf_alpha

topic_model = NMF(**nmf_kwargs)

# W holds one row per document and one column per topic.
w_matrix = topic_model.fit_transform(tfidf_matrix)

Standardizing the columns of the $W$ matrix (zero mean, unit variance) for plotting:

In [None]:
# Learn the per-column mean and standard deviation of W, then apply them.
# Fitting and transforming are written as two explicit steps on the same data.
scaler = StandardScaler().fit(w_matrix)
w_matrix_norm = scaler.transform(w_matrix)

Fitting a t-SNE model, initialized with PCA, on the standardized $W$ matrix:

In [None]:
# t-SNE configuration, grouped by purpose.
tsne_reducer = openTSNE.TSNE(
    # How the embedding is built: cosine distances between rows, starting
    # from a PCA layout.
    metric="cosine",
    initialization="pca",
    perplexity=50,
    # Optimisation length and reproducibility.
    n_iter=1000,
    random_state=42,
    # Execution settings: worker count and progress logging.
    n_jobs=14,
    verbose=True,
)

# Embed the standardized topic weights; the fitted embedding is kept in
# `reduced`.
reduced = tsne_reducer.fit(w_matrix_norm)

Saving everything:

In [None]:
# Persist every stage of the pipeline as a single tuple. Whoever loads the
# file has to unpack the items in exactly the order they are listed here.
with (DATA_FOLDER / "topic_modelling_data.pkl").open("wb") as pickle_file:
    pickle.dump(
        (
            keys,
            vectorizer,
            tfidf_matrix,
            topic_model,
            w_matrix,
            scaler,
            w_matrix_norm,
            tsne_reducer,
            reduced,
        ),
        pickle_file,
    )