In [None]:
!pip -q install pandas numpy scikit-learn spacy gensim pyLDAvis tqdm sentence-transformers
!python -m spacy download en_core_web_sm -q

In [None]:
import pandas as pd

# Read rows.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv("rows.csv")

# Report the frame's shape and its column index.
print("Form der Daten:", df.shape)
print("Spaltennamen:", df.columns, sep="\n")

# Preview the first rows as the cell output.
df.head()

In [5]:
text_col = "Consumer complaint narrative"

# Keep only rows whose narrative is present, reduce the frame to that single
# column, and expose it under the name "text".
df = df.loc[df[text_col].notna(), [text_col]].rename(columns={text_col: "text"})

print(df.shape)
df.head()

(315298, 1)


Unnamed: 0,text
29904,The Summer of XX/XX/2018 I was denied a mortga...
30629,There are many mistakes appear in my report wi...
30735,There are many mistakes appear in my report wi...
30795,There are many mistakes appear in my report wi...
30807,There are many mistakes appear in my report wi...


In [6]:
# Draw a reproducible random subset of at most 10000 rows.
# The size is capped at len(df) because DataFrame.sample raises a ValueError
# when asked for more rows than exist (sampling is without replacement).
sample_size = min(10000, len(df))
df = df.sample(sample_size, random_state=42)
print("Neue Größe:", df.shape)

Neue Größe: (10000, 1)


In [7]:
import re
import spacy
from tqdm import tqdm

# Load the small English spaCy pipeline with the parser and NER components
# disabled; the tokenizer function below only reads lemma_ and is_stop.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

def clean_to_tokens(text):
    """Normalize a narrative string and return its filtered lemmas.

    The text is lower-cased, URLs are replaced by a space, every character
    outside ``a-z``/whitespace is replaced by a space, and whitespace runs are
    collapsed. The result is passed through the notebook-level ``nlp``
    pipeline; a lemma is kept when its token is not a stop word and the lemma
    is longer than two characters.

    Args:
        text: Raw narrative string.

    Returns:
        List of lemma strings in document order.
    """
    normalized = text.lower()
    # First strip URLs, then everything that is not a lowercase letter or whitespace.
    for pattern in (r"http\S+|www\.\S+", r"[^a-z\s]"):
        normalized = re.sub(pattern, " ", normalized)
    normalized = re.sub(r"\s+", " ", normalized).strip()

    kept = []
    for tok in nlp(normalized):
        if tok.is_stop or len(tok.lemma_) <= 2:
            continue
        kept.append(tok.lemma_)
    return kept

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Tokenize every narrative (with a progress bar), then join each token list
# back into a single space-separated string.
df["tokens"] = df["text"].progress_apply(clean_to_tokens)
df["clean_text"] = df["tokens"].apply(" ".join)

# Compare raw and cleaned text side by side.
df[["text", "clean_text"]].head()

100%|██████████| 10000/10000 [02:00<00:00, 82.83it/s]


Unnamed: 0,text,clean_text
374887,I have a big problem with freedom mortgage im ...,big problem freedom mortgage try save home ask...
302576,I pulled my credit on XXXX XXXX and saw a hard...,pull credit xxxx xxxx see hard inquiry citi ba...
511776,I asked them if I could pay this debt or a set...,ask pay debt settlement pay deletion say yes w...
742168,"XX/XX/XXXX/XX/XX/XXXX, I submitted an IBR re-r...",xxxx xxxx submit ibr request loan account loan...
496043,1. My FCRA rights being violated by TransUnion...,fcra right violate transunion transunion remov...


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams of the cleaned text. Terms must occur in
# at least 5 documents (min_df) and in at most 90% of documents (max_df).
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)
X_tfidf = tfidf.fit_transform(df["clean_text"])

print("Form der TF-IDF Matrix:", X_tfidf.shape)

Form der TF-IDF Matrix: (10000, 26413)


In [9]:
from sentence_transformers import SentenceTransformer

# Encode the raw (uncleaned) narratives with the all-MiniLM-L6-v2 model.
model = SentenceTransformer("all-MiniLM-L6-v2")
X_emb = model.encode(
    df["text"].tolist(),
    show_progress_bar=True,
)

print("Form der Embedding-Matrix:", X_emb.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Form der Embedding-Matrix: (10000, 384)


In [10]:
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

# Build the gensim vocabulary from the token lists, then convert every
# document into its bag-of-words representation.
dictionary = corpora.Dictionary(df["tokens"])
corpus = list(map(dictionary.doc2bow, df["tokens"]))

def lda_coherence(k):
    """Fit an LDA model with ``k`` topics and score it with c_v coherence.

    Reads the notebook-level ``corpus``, ``dictionary`` and ``df["tokens"]``.

    Args:
        k: Number of topics passed to ``LdaModel`` as ``num_topics``.

    Returns:
        Tuple ``(model, coherence)`` with the fitted ``LdaModel`` and the
        value returned by ``CoherenceModel.get_coherence()``.
    """
    topic_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=42,
        passes=10,
    )
    scorer = CoherenceModel(
        model=topic_model,
        texts=df["tokens"],
        dictionary=dictionary,
        coherence="c_v",
    )
    return topic_model, scorer.get_coherence()

# Sweep k = 5..15: keep every fitted model by its k and record (k, coherence).
models = {}
scores = []

for k in range(5, 16):
    lda_model, score = lda_coherence(k)
    models[k] = lda_model
    scores.append((k, score))
    print(f"k={k}, coherence={score}")

k=5, coherence=0.4142737518407249
k=6, coherence=0.4267637747803221
k=7, coherence=0.4222401736075092
k=8, coherence=0.451770008829862
k=9, coherence=0.46617014977711363
k=10, coherence=0.46157171990929224
k=11, coherence=0.454492511271655
k=12, coherence=0.4634602651756219
k=13, coherence=0.4653480113158768
k=14, coherence=0.47278265763032123
k=15, coherence=0.4707527032113118


In [11]:
# Pick the k whose (k, coherence) pair has the highest coherence; on a tie
# max() returns the first such pair in `scores`.
best_k = max(scores, key=lambda pair: pair[1])[0]
print("Bestes k laut Coherence:", best_k)

Bestes k laut Coherence: 14


In [12]:
# Show the 10 highest-weighted words of every topic of the selected model.
lda = models[best_k]

for i, topic in lda.print_topics(num_topics=best_k, num_words=10):
    print(f"\nTopic {i}:", topic, sep="\n")


Topic 0:
0.061*"account" + 0.053*"report" + 0.052*"credit" + 0.034*"collection" + 0.028*"letter" + 0.026*"send" + 0.026*"debt" + 0.023*"company" + 0.021*"xxxx" + 0.018*"receive"

Topic 1:
0.097*"payment" + 0.047*"loan" + 0.045*"pay" + 0.023*"month" + 0.021*"late" + 0.019*"xxxx" + 0.017*"interest" + 0.015*"balance" + 0.014*"time" + 0.013*"year"

Topic 2:
0.032*"xxxx" + 0.022*"charge" + 0.022*"card" + 0.017*"account" + 0.016*"receive" + 0.012*"service" + 0.012*"fraud" + 0.011*"email" + 0.010*"customer" + 0.010*"purchase"

Topic 3:
0.136*"car" + 0.091*"finance" + 0.085*"vehicle" + 0.058*"citibank" + 0.051*"lease" + 0.040*"auto" + 0.036*"purchase" + 0.026*"financial" + 0.022*"ally" + 0.022*"dealership"

Topic 4:
0.228*"fargo" + 0.218*"wells" + 0.034*"pmi" + 0.013*"sale" + 0.013*"like" + 0.013*"well" + 0.013*"appraisal" + 0.011*"cash" + 0.011*"match" + 0.011*"copy"

Topic 5:
0.182*"inquiry" + 0.097*"credit" + 0.065*"report" + 0.044*"remove" + 0.033*"authorize" + 0.031*"hard" + 0.028*"ident

In [13]:
from sklearn.cluster import KMeans

k2 = 8  # number of KMeans clusters

# Cluster the sentence embeddings and store each row's label on the frame.
kmeans = KMeans(n_clusters=k2, n_init="auto", random_state=42)
clusters = kmeans.fit_predict(X_emb)
df["cluster"] = clusters

# Cluster sizes, ordered by cluster id.
df["cluster"].value_counts().sort_index()

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
0,1259
1,944
2,1411
3,1130
4,1475
5,1853
6,1350
7,578


In [14]:
# For each cluster id, print the first 200 characters of its first three
# narratives with newlines flattened to spaces.
for c in range(k2):
    print("\nCLUSTER", c)
    members = df.loc[df["cluster"] == c, "text"]
    for t in members.head(3):
        snippet = t[:200].replace("\n", " ")
        print("-", snippet, "...")


CLUSTER 0
- I opened an auto loan with XXXX XXXX in XX/XX/XXXX.  XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX, CA XXXX I paid every month and was not late once, EVER. As of this year I filed bankruptcy on XX/XX/XXXX  ...
- I was reading my credit report from XXXX and found that my credit score is affected by an account of a credit card that is not mine. The account is now closed, the balance is higher that the credit li ...
- My BLOOMINGDALES credit card account had a {$420.00} credit that was stolen evidently by a Bloomingdale employee. I 've contacted customer service and had no results.  We were in XXXX XXXX XXXX/XXXX/X ...

CLUSTER 1
- I accepted a loan from AvailBlue for {$350.00}. I was told when they would drafting the first payment, amounts etc. Prior to accepting the loan I spoke with the Agent and asked him about early pay off ...
- I attended to XXXX college for six months.. XXXX made me take out a private loan called American student financial group ; direct lending program ..

In [15]:
# Show the topics of the model selected by coherence. `best_k` is used instead
# of a hard-coded 14 so the cell stays correct if the sweep is re-run with a
# different outcome.
lda = models[best_k]

for i, topic in lda.print_topics(num_topics=best_k, num_words=10):
    print(f"\nTopic {i}:")
    print(topic)


Topic 0:
0.061*"account" + 0.053*"report" + 0.052*"credit" + 0.034*"collection" + 0.028*"letter" + 0.026*"send" + 0.026*"debt" + 0.023*"company" + 0.021*"xxxx" + 0.018*"receive"

Topic 1:
0.097*"payment" + 0.047*"loan" + 0.045*"pay" + 0.023*"month" + 0.021*"late" + 0.019*"xxxx" + 0.017*"interest" + 0.015*"balance" + 0.014*"time" + 0.013*"year"

Topic 2:
0.032*"xxxx" + 0.022*"charge" + 0.022*"card" + 0.017*"account" + 0.016*"receive" + 0.012*"service" + 0.012*"fraud" + 0.011*"email" + 0.010*"customer" + 0.010*"purchase"

Topic 3:
0.136*"car" + 0.091*"finance" + 0.085*"vehicle" + 0.058*"citibank" + 0.051*"lease" + 0.040*"auto" + 0.036*"purchase" + 0.026*"financial" + 0.022*"ally" + 0.022*"dealership"

Topic 4:
0.228*"fargo" + 0.218*"wells" + 0.034*"pmi" + 0.013*"sale" + 0.013*"like" + 0.013*"well" + 0.013*"appraisal" + 0.011*"cash" + 0.011*"match" + 0.011*"copy"

Topic 5:
0.182*"inquiry" + 0.097*"credit" + 0.065*"report" + 0.044*"remove" + 0.033*"authorize" + 0.031*"hard" + 0.028*"ident

In [16]:
# Print the top 6 words per topic for the models with 10, 11 and 12 topics.
for k in range(10, 13):
    lda = models[k]
    print("\n==== k =", k, "====")
    topic_rows = lda.print_topics(num_topics=k, num_words=6)
    for i, topic in topic_rows:
        print(i, topic)


==== k = 10 ====
0 0.082*"credit" + 0.075*"report" + 0.059*"account" + 0.024*"dispute" + 0.021*"remove" + 0.018*"xxxx"
1 0.088*"payment" + 0.045*"loan" + 0.042*"pay" + 0.021*"month" + 0.021*"late" + 0.017*"xxxx"
2 0.048*"account" + 0.048*"card" + 0.027*"credit" + 0.026*"bank" + 0.023*"xxxx" + 0.023*"charge"
3 0.100*"car" + 0.062*"vehicle" + 0.045*"citibank" + 0.037*"lease" + 0.034*"bonus" + 0.032*"purchase"
4 0.063*"information" + 0.036*"equifax" + 0.033*"identity" + 0.032*"report" + 0.032*"bankruptcy" + 0.028*"theft"
5 0.212*"inquiry" + 0.119*"credit" + 0.046*"report" + 0.041*"hard" + 0.035*"authorize" + 0.032*"remove"
6 0.034*"xxxx" + 0.032*"call" + 0.029*"tell" + 0.020*"say" + 0.016*"ask" + 0.015*"time"
7 0.046*"debt" + 0.019*"consumer" + 0.017*"law" + 0.017*"provide" + 0.014*"collection" + 0.014*"information"
8 0.730*"xxxx" + 0.009*"account" + 0.008*"date" + 0.005*"credit" + 0.004*"address" + 0.004*"request"
9 0.054*"xxxx" + 0.029*"mortgage" + 0.026*"loan" + 0.015*"home" + 0.010*"

In [18]:
import re
import spacy
from tqdm import tqdm

# (Re)load the small English spaCy pipeline without the parser and NER
# components, using the same arguments as the earlier load in this notebook.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

def clean_to_tokens(text, custom_stopwords=frozenset({"xxxx", "xxxxxxxx"}), pipeline=None):
    """Normalize a narrative string and return its filtered lemmas.

    The text is lower-cased, URLs are replaced by a space, every character
    outside ``a-z``/whitespace is replaced by a space, and whitespace runs are
    collapsed. The cleaned string is run through a spaCy pipeline; a lemma is
    kept when its token is not a stop word, the lemma is longer than two
    characters, and the lemma is not listed in ``custom_stopwords``.

    Args:
        text: Raw narrative string.
        custom_stopwords: Collection of lemmas to drop in addition to the
            pipeline's stop words. Defaults to the lower-cased placeholder
            tokens ``"xxxx"`` and ``"xxxxxxxx"``. The default is an immutable
            frozenset built once instead of a set rebuilt on every call.
        pipeline: Callable that turns a string into an iterable of tokens
            exposing ``lemma_`` and ``is_stop``. When ``None`` (the default)
            the notebook-level ``nlp`` is used.

    Returns:
        List of lemma strings in document order.
    """
    if pipeline is None:
        # Resolved at call time so that a reloaded `nlp` is picked up.
        pipeline = nlp

    text = text.lower()
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    return [
        t.lemma_ for t in pipeline(text)
        if not t.is_stop
        and len(t.lemma_) > 2
        and t.lemma_ not in custom_stopwords
    ]

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Re-tokenize every narrative with the current clean_to_tokens and rebuild
# the space-joined cleaned text from the new token lists.
df["tokens"] = df["text"].progress_apply(clean_to_tokens)
df["clean_text"] = df["tokens"].apply(" ".join)

100%|██████████| 10000/10000 [02:25<00:00, 68.71it/s]


In [19]:
from gensim import corpora

# Rebuild the dictionary from the current token lists
dictionary = corpora.Dictionary(df["tokens"])

# Build the bag-of-words corpus, one entry per document
corpus = list(map(dictionary.doc2bow, df["tokens"]))

print("Anzahl einzigartiger Wörter:", len(dictionary))

Anzahl einzigartiger Wörter: 13883


In [20]:
from gensim.models import LdaModel, CoherenceModel

def compute_coherence_values(start=11, limit=14):
    """Fit and score one LDA model per topic count in ``range(start, limit)``.

    Reads the notebook-level ``corpus``, ``dictionary`` and ``df["tokens"]``,
    and prints one ``k=..., coherence=...`` line per fitted model.

    Args:
        start: First topic count to try (inclusive).
        limit: End of the topic-count range (exclusive).

    Returns:
        Tuple ``(models, scores)``: a dict mapping each k to its fitted
        ``LdaModel`` and a list of ``(k, coherence)`` tuples in increasing k.
    """
    fitted = {}
    results = []

    for k in range(start, limit):
        fitted[k] = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=k,
            random_state=42,
            passes=10,
        )
        # c_v coherence of the model just fitted, computed on the token lists.
        coherence_score = CoherenceModel(
            model=fitted[k],
            texts=df["tokens"],
            dictionary=dictionary,
            coherence='c_v',
        ).get_coherence()
        results.append((k, coherence_score))

        print(f"k={k}, coherence={coherence_score}")

    return fitted, results

# Run the sweep over k = 11..13 (the function's defaults, spelled out here);
# this replaces the notebook-level `models` and `scores`.
models, scores = compute_coherence_values(start=11, limit=14)

k=11, coherence=0.47930440681546693
k=12, coherence=0.4491337823837361
k=13, coherence=0.46998513378014006


In [None]:
# Pick the k whose (k, coherence) pair has the highest coherence; on a tie
# max() returns the first such pair in `scores`.
best_k = max(scores, key=lambda pair: pair[1])[0]
print("Bestes k laut Coherence:", best_k)

In [21]:
# Select the topic count with the highest coherence from the current `scores`
# instead of hard-coding 11. Deriving it here keeps the key in sync with
# `models`, which only holds the k values fitted by the latest sweep.
best_k = max(scores, key=lambda x: x[1])[0]
lda = models[best_k]

for i, topic in lda.print_topics(num_topics=best_k, num_words=8):
    print(f"\nTopic {i}:")
    print(topic)


Topic 0:
0.101*"report" + 0.099*"credit" + 0.070*"account" + 0.031*"remove" + 0.022*"dispute" + 0.015*"information" + 0.013*"file" + 0.011*"show"

Topic 1:
0.083*"payment" + 0.052*"loan" + 0.039*"pay" + 0.020*"month" + 0.016*"interest" + 0.015*"year" + 0.014*"late" + 0.013*"time"

Topic 2:
0.072*"credit" + 0.070*"card" + 0.035*"inquiry" + 0.025*"account" + 0.010*"charge" + 0.009*"chase" + 0.009*"capital" + 0.008*"score"

Topic 3:
0.043*"call" + 0.040*"tell" + 0.028*"say" + 0.024*"ask" + 0.020*"time" + 0.019*"phone" + 0.016*"send" + 0.015*"speak"

Topic 4:
0.045*"mortgage" + 0.033*"loan" + 0.030*"home" + 0.020*"property" + 0.018*"modification" + 0.015*"request" + 0.014*"sale" + 0.012*"document"

Topic 5:
0.096*"debt" + 0.047*"collection" + 0.025*"owe" + 0.024*"company" + 0.020*"pay" + 0.018*"letter" + 0.018*"receive" + 0.017*"send"

Topic 6:
0.018*"court" + 0.016*"complaint" + 0.015*"file" + 0.013*"case" + 0.012*"law" + 0.012*"document" + 0.010*"claim" + 0.010*"attorney"

Topic 7:
0.03

In [22]:
# Cluster sizes ordered by cluster id. The labels come from KMeans on the
# embeddings of the raw text, so re-tokenizing does not change them.
df["cluster"].value_counts(sort=False).sort_index()

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
0,1259
1,944
2,1411
3,1130
4,1475
5,1853
6,1350
7,578


In [23]:
# For every cluster id present in the frame, print a banner followed by the
# first 300 characters of its first two narratives (newlines flattened).
for c in sorted(df["cluster"].unique()):
    print("\n====================")
    print("CLUSTER", c)
    print("====================")

    members = df.loc[df["cluster"] == c, "text"]

    for t in members.head(2):
        snippet = t[:300].replace("\n", " ")
        print("-", snippet, "...\n")


CLUSTER 0
- I opened an auto loan with XXXX XXXX in XX/XX/XXXX.  XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX, CA XXXX I paid every month and was not late once, EVER. As of this year I filed bankruptcy on XX/XX/XXXX Due to the issues I continually had with the car, my lawyer and myself thought it would be best if I ...

- I was reading my credit report from XXXX and found that my credit score is affected by an account of a credit card that is not mine. The account is now closed, the balance is higher that the credit limit, the account does n't receive a payment since may and my credit is suffering. It says that I am  ...


CLUSTER 1
- I accepted a loan from AvailBlue for {$350.00}. I was told when they would drafting the first payment, amounts etc. Prior to accepting the loan I spoke with the Agent and asked him about early pay off. He told me I could in fact pay the loan off early and just paying a small amount above the {$350.0 ...

- I attended to XXXX college for six months.. XXXX mad