In [None]:
import pandas as pd
import numpy as np
from scipy.io import mmread

# Convert the Cornell Matrix Market files into labelled CSV files.
#
# For every matrix the script loads `cornell_<name>.mtx`, densifies it, appends
# a "label" column holding the textual class of each row, and writes
# `cornell_<name>.csv` to the current working directory.

# === Step 1: Load matrices ===
# Each file is read with mmread and converted to CSR. In the recorded run of
# this cell the printed shapes were (195, 1703) for content and (195, 195) for
# the three link matrices.
MATRIX_NAMES = ("content", "inbound", "outbound", "cites")

matrices = {name: mmread(f"cornell_{name}.mtx").tocsr() for name in MATRIX_NAMES}

# Keep the individual names bound so later cells can still refer to them.
content = matrices["content"]
inbound = matrices["inbound"]
outbound = matrices["outbound"]
cites = matrices["cites"]

print("Shapes:")
for name, matrix in matrices.items():
    print(f"{name}:", matrix.shape)

# === Step 2: Load labels ===
# First whitespace-separated column of cornell_act.txt: one label index per row.
labels = pd.read_csv("cornell_act.txt", header=None, sep=r"\s+")[0].values
# First column of labels.txt: the label names, looked up by (index - 1).
label_names = pd.read_csv("labels.txt", header=None)[0].values

# The lookup below treats label indices as 1-based. An index of 0 (or below)
# would not fail on its own: Python's negative indexing would silently pick a
# name from the end of label_names. Reject such input explicitly.
if np.any((labels < 1) | (labels > len(label_names))):
    raise ValueError(
        "cornell_act.txt contains label indices outside "
        f"1..{len(label_names)}; cannot map them to names in labels.txt"
    )

# Map numeric labels to text
labels_text = [label_names[i - 1] for i in labels]

# === Step 3: Convert to DataFrames ===
# pandas raises ValueError here if the number of labels differs from the
# number of matrix rows.
frames = {}
for name, matrix in matrices.items():
    frame = pd.DataFrame(matrix.toarray())
    frame["label"] = labels_text
    frames[name] = frame

# Keep the individual DataFrame names bound for later cells.
df_content = frames["content"]
df_inbound = frames["inbound"]
df_outbound = frames["outbound"]
df_cites = frames["cites"]

# === Step 4: Save to CSV ===
for name, frame in frames.items():
    frame.to_csv(f"cornell_{name}.csv", index=False)

print("✅ Conversion complete! CSV files saved:")
print("cornell_content.csv, cornell_inbound.csv, cornell_outbound.csv, cornell_cites.csv")


Shapes:
content: (195, 1703)
inbound: (195, 195)
outbound: (195, 195)
cites: (195, 195)
✅ Conversion complete! CSV files saved:
cornell_content.csv, cornell_inbound.csv, cornell_outbound.csv, cornell_cites.csv
