# 01: Data Loading & Inspection

This notebook loads the loan document text files, applies labels, and combines everything into a single DataFrame for modeling.

In [11]:
import pandas as pd
import os

In [26]:
# Load labels CSV
labels_df = pd.read_csv("../data/metadata/data_labels.csv", names=["filename", "source", "label"])
# Remove the first row if it contains header information
if labels_df.iloc[0, 0] == "filename":
    labels_df = labels_df.iloc[1:]
labels_df.head()

Unnamed: 0,filename,source,label
1,speedy_cash_payday_desc.txt,Speedy Cash,predatory
2,big_picture_loans_info.txt,Big Picture Loans,predatory
3,ace_cash_express_loan_page.txt,ACE Cash Express,predatory
4,check_city_payday_loan_info.txt,Check City,predatory
5,cashnetusa_alternative_marketing.txt,CashNetUSA,predatory


In [31]:
texts = []

for idx, row in labels_df.iterrows():
    fname = row["filename"]
    label = row["label"]
    folder = "predatory" if label == "predatory" else "non_predatory"
    file_path = os.path.join("..", "data", "raw", "loan_documents", folder, fname)
    
    print(file_path)  # <--- ADD THIS

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        text = ""
    texts.append(text)



../data/raw/loan_documents/non_predatory/speedy_cash_payday_desc.txt
File not found: ../data/raw/loan_documents/non_predatory/speedy_cash_payday_desc.txt
../data/raw/loan_documents/non_predatory/big_picture_loans_info.txt
File not found: ../data/raw/loan_documents/non_predatory/big_picture_loans_info.txt
../data/raw/loan_documents/non_predatory/ace_cash_express_loan_page.txt
File not found: ../data/raw/loan_documents/non_predatory/ace_cash_express_loan_page.txt
../data/raw/loan_documents/non_predatory/check_city_payday_loan_info.txt
File not found: ../data/raw/loan_documents/non_predatory/check_city_payday_loan_info.txt
../data/raw/loan_documents/non_predatory/cashnetusa_alternative_marketing.txt
File not found: ../data/raw/loan_documents/non_predatory/cashnetusa_alternative_marketing.txt
../data/raw/loan_documents/non_predatory/check_n_go_payday_loan_page.txt
File not found: ../data/raw/loan_documents/non_predatory/check_n_go_payday_loan_page.txt
../data/raw/loan_documents/non_predato

In [28]:
# Attach the text to your dataframe
labels_df['text'] = texts
labels_df.head(2)

Unnamed: 0,filename,source,label,text
1,speedy_cash_payday_desc.txt,Speedy Cash,predatory,
2,big_picture_loans_info.txt,Big Picture Loans,predatory,


In [29]:
labels_df.isnull().sum()

filename    0
source      0
label       0
text        0
dtype: int64

In [30]:
# Save to CSV for later modeling
labels_df.to_csv("../data/metadata/loan_texts.csv", index=False)
