# 01: Data Loading & Inspection

This notebook loads the loan document text files, attaches their labels, and combines them with the Reddit posts and consumer complaint narratives into a single DataFrame that is saved to CSV for modeling.

In [8]:
import pandas as pd
import os

In [16]:
# Load the label metadata. Column names are supplied explicitly, so if the file
# starts with a header line, that line is parsed as an ordinary first data row.
labels_df = pd.read_csv("../data/metadata/data_labels.csv", names=["filename", "source", "label"])

# Drop the first row when it is the header. .copy() makes the result an
# independent frame rather than a slice of the parsed one, so the column
# assignments below do not raise SettingWithCopyWarning on pandas versions
# without copy-on-write.
if labels_df.iloc[0, 0] == "filename":
    labels_df = labels_df.iloc[1:].copy()

# Trim stray whitespace from every text column, not just the label: filenames
# are later used to build file paths and labels are compared against
# "predatory", so padding in either would silently break the lookup.
for column in ("filename", "source", "label"):
    labels_df[column] = labels_df[column].str.strip()

labels_df

Unnamed: 0,filename,source,label
1,speedy_cash_payday_desc.txt,Speedy Cash,predatory
2,big_picture_loans_info.txt,Big Picture Loans,predatory
3,ace_cash_express_loan_page.txt,ACE Cash Express,predatory
4,check_city_payday_loan_info.txt,Check City,predatory
5,cashnetusa_alternative_marketing.txt,CashNetUSA,predatory
6,check_n_go_payday_loan_page.txt,Check 'n Go,predatory
7,ficus_bank_loan_estimate.txt,FICUS,non_predatory
8,idbglobal_personal_loan_disclosure.txt,IDB Global,non_predatory
9,msu_disclosure_loan_agreement.txt,MSU,non_predatory
10,psecu_consumer_agreement.txt,PSECU,non_predatory


In [20]:
# Read the raw text of every labelled document, in the row order of labels_df.
# Rows labelled "predatory" are looked up under the predatory/ folder; any other
# label falls back to non_predatory/. A missing file is reported and stored as
# an empty string so the list stays aligned with the rows.
texts = []

for doc_name, doc_label in zip(labels_df["filename"], labels_df["label"]):
    subfolder = "predatory" if doc_label == "predatory" else "non_predatory"
    doc_path = os.path.join("..", "data", "raw", "loan_documents", subfolder, doc_name)

    try:
        with open(doc_path, "r", encoding="utf-8") as handle:
            contents = handle.read()
    except FileNotFoundError:
        print(f"File not found: {doc_path}")
        contents = ""

    texts.append(contents)



In [23]:
# Add the loaded document contents as a 'text' column. `texts` was built by
# walking labels_df row by row, so it is already in the same order as the rows.
labels_df['text'] = texts
labels_df

Unnamed: 0,filename,source,label,text
1,speedy_cash_payday_desc.txt,Speedy Cash,predatory,PAYDAY LOANS\nConvenient.\nQuick. Reliable.\na...
2,big_picture_loans_info.txt,Big Picture Loans,predatory,\nBig Picture Loans\nHomeRequirements\nRequire...
3,ace_cash_express_loan_page.txt,ACE Cash Express,predatory,Payday Loans at ACE Cash Express\nPayday loans...
4,check_city_payday_loan_info.txt,Check City,predatory,Payday Loans Online Apply Today!\nNeed cash be...
5,cashnetusa_alternative_marketing.txt,CashNetUSA,predatory,Menu\nLog In\nCashNetUSA\nPayday Loan Alternat...
6,check_n_go_payday_loan_page.txt,Check 'n Go,predatory,Skip to main content\nCheck N Go\n\nPayday Loa...
7,ficus_bank_loan_estimate.txt,FICUS,non_predatory,Loan Terms Can this amount increase after clos...
8,idbglobal_personal_loan_disclosure.txt,IDB Global,non_predatory,PERSONAL LOANS UNSECURED VARIABLE RATE DISCLOS...
9,msu_disclosure_loan_agreement.txt,MSU,non_predatory,FINANCE CHARGES HOW TO COMPUTE THE FINANCE CHA...
10,psecu_consumer_agreement.txt,PSECU,non_predatory,ONSUMER CREDIT CARD AGREEMENT AND DISCLOSURE C...


In [22]:
# Count missing values per column. Note: documents that could not be found were
# stored as empty strings by the loader, and empty strings are not counted here.
labels_df.isna().sum()

filename    0
source      0
label       0
text        0
dtype: int64

In [None]:
# Load the Reddit posts and reshape them to the shared text/label/source layout.
reddit_raw = pd.read_csv("../data/processed/reddit_predatory_loan_posts.csv")

reddit_df = (
    reddit_raw[['post_text', 'label']]        # keep only the relevant columns
    .rename(columns={'post_text': 'text'})
    .assign(source='reddit')                  # tag every row with its origin
)
reddit_df.head()


In [None]:
# Load the consumer complaint sample and reshape it to the shared
# text/label/source layout used by the other sources.
complaints_raw = pd.read_csv("../data/processed/complaints(small sample)-2025-04-25_15_02.csv")

complaints_df = (
    complaints_raw[['Consumer complaint narrative']]   # keep only the relevant column
    .rename(columns={'Consumer complaint narrative': 'text'})
    # A complaint with no narrative has no text to model; without this step it
    # would enter the combined dataset as a NaN-text row labelled 'predatory'.
    # NOTE(review): confirm whether this sample actually contains blank narratives.
    .dropna(subset=['text'])
    # Every complaint is treated as a 'predatory' example.
    .assign(label='predatory', source='complaints')
)
complaints_df.head()


In [None]:
# Combine all data: stack the loan documents (labels_df), the Reddit posts and
# the complaints on the three columns they share, renumbering rows from zero.
shared_columns = ['text', 'label', 'source']

full_df = pd.concat(
    [frame[shared_columns] for frame in (labels_df, reddit_df, complaints_df)],
    ignore_index=True,
)

print(full_df.shape)
full_df.head()


In [None]:
# Save the combined dataset, without the row index, so it can be reused for modeling.
output_csv = "../data/metadata/full_loan_texts.csv"
full_df.to_csv(output_csv, index=False)
