# CFPB Student Loan Complaints: HDP (Hierarchical Dirichlet Process) Topic Model

Data source: CFPB Consumer Complaint Database, https://www.consumerfinance.gov/data-research/consumer-complaints/#download-the-data

In [1]:
import numpy as np
import spacy
# Load the spaCy pipeline used for tokenisation, POS tagging and lemmatisation.
nlp = spacy.load('en_core_web_sm')
file_student='../data/consumercomplaints/student_comp_narrative.txt'
# Read the whole narrative file into memory. The context manager closes the
# handle even when read() raises (e.g. on a decoding error), which the manual
# open()/close() pair did not guarantee.
with open(file_student,'r',encoding='utf-8') as f:
    student_text=f.read()

In [3]:
# Run the spaCy pipeline once over the full text of the complaint file.
doc_student=nlp(student_text)
# Only tokens whose coarse POS tag is listed here are kept.
student_pos_list=['NOUN']
student_preproc_text=[]  # one list of lemmas per input line
student_preproc_sent=[]  # lemmas collected for the line currently being read

for token in doc_student:
    # spaCy folds a run of whitespace into a single token, so a line break can
    # appear as '\n\n' or '\n ' as well as a bare '\n'. Treat any token that
    # contains a newline as the end of the current line; an exact comparison
    # with '\n' would silently merge neighbouring lines in those cases.
    if '\n' in token.text:
        student_preproc_text.append(student_preproc_sent)
        student_preproc_sent=[]
    elif not(token.is_stop) and not(token.is_punct) and token.pos_ in student_pos_list:
        student_preproc_sent.append(token.lemma_)

# Flush the last line. When the text ends with a newline the buffer is already
# empty, so skip it rather than appending a spurious empty document.
if student_preproc_sent:
    student_preproc_text.append(student_preproc_sent)

# Show a bounded preview instead of dumping the entire corpus into the output.
print(len(student_preproc_text), 'documents')
print(student_preproc_text[:3])



In [7]:
import tomotopy as tp

# Fit an HDP topic model on the preprocessed complaints, one document per
# entry of student_preproc_text. The seed is fixed for repeatable runs.
mdl = tp.HDPModel(alpha=0.1,seed=0)

for line in student_preproc_text:
    mdl.add_doc(line)

mdl.train(50)
# ll_per_word is the per-word log-likelihood, not a log perplexity (perplexity
# is the exponential of its negation), so the label says what is printed.
print('Log-likelihood per word=', mdl.ll_per_word)

for k in range(mdl.k):
    # mdl.k also counts topic slots that HDP created and later emptied; only
    # live topics carry a meaningful word distribution.
    if not mdl.is_live_topic(k):
        continue
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=10))


Log Perplexity= -5.74896161113102
Top 10 words of topic #0
[('payment', 0.07293784618377686), ('loan', 0.07257163524627686), ('account', 0.02941964566707611), ('time', 0.024841995909810066), ('month', 0.02063055709004402), ('credit', 0.017945000901818275), ('information', 0.017883965745568275), ('interest', 0.015991870313882828), ('student', 0.015259445644915104), ('day', 0.014099773950874805)]
Top 10 words of topic #1
[('loan', 0.11652614921331406), ('payment', 0.06274520605802536), ('student', 0.03194756805896759), ('interest', 0.029189569875597954), ('time', 0.028500070795416832), ('year', 0.02788718231022358), ('month', 0.021451856940984726), ('school', 0.019996246322989464), ('repayment', 0.013943973928689957), ('rate', 0.013943973928689957)]
Top 10 words of topic #2
[('loan', 0.04834238067269325), ('entity', 0.02423146553337574), ('time', 0.02423146553337574), ('physician', 0.02423146553337574), ('interest', 0.02423146553337574), ('accredition', 0.02423146553337574), ('repayment'

In [8]:
# Persist the current model (the HDP model trained above) to disk.
hdp_model_path = '../data/consumercomplaints/hdp_model.bin'
mdl.save(hdp_model_path)

In [9]:
# Rebind mdl to the HDP model read back from the saved file.
hdp_model_path = '../data/consumercomplaints/hdp_model.bin'
mdl = tp.HDPModel.load(hdp_model_path)

In [13]:
# Pool the lemmas of every preprocessed complaint into one flat bag of words.
bag_of_words=[]
for sent in student_preproc_text:
    bag_of_words.extend(sent)

# Wrap the pooled words as a document the model has not seen and infer on it.
doc_inst = mdl.make_doc(bag_of_words)
# Element 0 of infer()'s result holds the per-topic scores for the document.
topic_scores = np.array(mdl.infer(doc_inst)[0])
# Topic ids ordered from highest to lowest score (displayed as the cell output).
np.argsort(topic_scores)[::-1]

array([ 5,  7,  4,  6,  0,  1, 11,  9, 14,  2, 18, 17, 10,  8, 12, 13, 15,
        3, 16], dtype=int64)

In [14]:
# Top 7 words of topic 5, the first id in the ranking printed above.
top_words = mdl.get_topic_words(5, top_n=7)
print(top_words)

[('school', 0.05379803851246834), ('aid', 0.05379803851246834), ('password', 0.03592493385076523), ('username', 0.03592493385076523), ('information', 0.03592493385076523), ('direction', 0.03592493385076523), ('bus', 0.03592493385076523)]


In [15]:
# Top 7 words of topic 7, the second id in the ranking printed above.
top_words = mdl.get_topic_words(7, top_n=7)
print(top_words)

[('graduate', 0.061739806085824966), ('program', 0.061739806085824966), ('assistance', 0.04634334146976471), ('loan', 0.03094688430428505), ('placement', 0.03094688430428505), ('school', 0.03094688430428505), ('world', 0.03094688430428505)]


In [16]:
# Top 7 words of topic 4, the third id in the ranking printed above.
top_words = mdl.get_topic_words(4, top_n=7)
print(top_words)

[('employer', 0.03343059867620468), ('graduation', 0.03343059867620468), ('book', 0.03343059867620468), ('diploma', 0.025093790143728256), ('debt', 0.025093790143728256), ('education', 0.025093790143728256), ('college', 0.025093790143728256)]


# CFPB Student Loan Complaints: LDA (Latent Dirichlet Allocation) Topic Model

In [18]:
# Number of topics the LDA model is asked to fit.
NUM_TOPICS=20

# Fit an LDA model on the same preprocessed complaints, one document per
# entry of student_preproc_text. The seed is fixed for repeatable runs.
mdl = tp.LDAModel(k=NUM_TOPICS,alpha=0.1,seed=0)

for line in student_preproc_text:
    mdl.add_doc(line)

mdl.train(50)
# ll_per_word is the per-word log-likelihood, not a log perplexity (perplexity
# is the exponential of its negation), so the label says what is printed.
print('Log-likelihood per word=', mdl.ll_per_word)

# Print the ten highest-probability words of every topic.
for k in range(mdl.k):
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=10))


Log Perplexity= -6.423249988462239
Top 10 words of topic #0
[('forbearance', 0.1309245228767395), ('time', 0.08555688709020615), ('month', 0.07453903555870056), ('payment', 0.03630059212446213), ('situation', 0.033708155155181885), ('income', 0.03111572004854679), ('year', 0.024634627625346184), ('option', 0.022690299898386), ('paperwork', 0.018801646307110786), ('end', 0.016857318580150604)]
Top 10 words of topic #1
[('credit', 0.14882682263851166), ('loan', 0.0883079543709755), ('score', 0.059536680579185486), ('account', 0.04961555451154709), ('report', 0.03671808913350105), ('letter', 0.033741749823093414), ('status', 0.029773302376270294), ('customer', 0.021836400032043457), ('day', 0.018860062584280968), ('service', 0.01687583699822426)]
Top 10 words of topic #2
[('payment', 0.09068136662244797), ('customer', 0.05741218104958534), ('service', 0.047627128660678864), ('time', 0.0469747893512249), ('request', 0.04567011445760727), ('rep', 0.04175609350204468), ('day', 0.040451418608

In [19]:
# Persist the current model (the LDA model trained above) to disk.
lda_model_path = '../data/consumercomplaints/lda_model.bin'
mdl.save(lda_model_path)

In [20]:
# Rebind mdl to the LDA model read back from the saved file.
lda_model_path = '../data/consumercomplaints/lda_model.bin'
mdl = tp.LDAModel.load(lda_model_path)

In [22]:
# Pool the lemmas of every preprocessed complaint into one flat bag of words.
bag_of_words=[]
for sent in student_preproc_text:
    bag_of_words.extend(sent)

# Wrap the pooled words as a document the model has not seen and infer on it.
doc_inst = mdl.make_doc(bag_of_words)
# Element 0 of infer()'s result holds the per-topic scores for the document.
topic_scores = np.array(mdl.infer(doc_inst)[0])
# Topic ids ordered from highest to lowest score (displayed as the cell output).
np.argsort(topic_scores)[::-1]

array([17,  7,  6,  8, 12,  0,  2,  4, 10,  5, 18, 14, 13, 11, 16, 15,  9,
        3,  1, 19], dtype=int64)

In [23]:
# Top 7 words of topic 17, the first id in the ranking printed above.
top_words = mdl.get_topic_words(17, top_n=7)
print(top_words)

[('interest', 0.20065094530582428), ('loan', 0.16345429420471191), ('payment', 0.152724489569664), ('rate', 0.07046262919902802), ('balance', 0.04184982180595398), ('year', 0.0314776748418808), ('principal', 0.025755111128091812)]


In [24]:
# Top 7 words of topic 7, the second id in the ranking printed above.
top_words = mdl.get_topic_words(7, top_n=7)
print(top_words)

[('loan', 0.14698922634124756), ('year', 0.09230735898017883), ('repayment', 0.08487062156200409), ('payment', 0.08312080055475235), ('plan', 0.07349679619073868), ('income', 0.05074914172291756), ('month', 0.03981276974081993)]


In [25]:
# Top 7 words of topic 6, the third id in the ranking printed above.
top_words = mdl.get_topic_words(6, top_n=7)
print(top_words)

[('loan', 0.24387744069099426), ('time', 0.06379450112581253), ('student', 0.051527272909879684), ('m', 0.05103658139705658), ('money', 0.04514831304550171), ('payment', 0.03239039331674576), ('collection', 0.027974190190434456)]
