In [1]:
import sys
import pandas as pd
import numpy as np
import re
import spacy
import nltk
from nltk.corpus import stopwords

print("pandas:", pd.__version__)
print("numpy:", np.__version__)
print("spacy:", spacy.__version__)
print("nltk:", nltk.__version__)

nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "lemmatizer"])
print("spaCy model loaded:", nlp.pipe_names)
print("Python version:", sys.version)

pandas: 2.3.3
numpy: 2.4.0
spacy: 3.8.11
nltk: 3.9.2
spaCy model loaded: ['tok2vec', 'attribute_ruler', 'ner']
Python version: 3.13.7 (tags/v3.13.7:bcee1c3, Aug 14 2025, 14:15:11) [MSC v.1944 64 bit (AMD64)]


In [2]:
input_csv = sys.argv[1]
output_csv = sys.argv[2]

df = pd.read_csv(input_csv)

# Display basic information about the dataset
# print(df.head())

IndexError: list index out of range

In [None]:
# Run once; safe to leave but it will check/download each time.
# nltk.download("stopwords", quiet=True)

In [None]:
EMAIL_RE = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w+\b")
PHONE_RE = re.compile(r"(\+?\d{1,2}\s*)?(\(?\d{3}\)?[\s.-]?)\d{3}[\s.-]?\d{4}")
URL_RE   = re.compile(r"https?://\S+|www\.\S+")

STOPWORDS = set(stopwords.words("english"))

In [None]:
STOPWORDS = set(stopwords.words("english"))

# IMPORTANT for sentiment: keep negations
for w in ["no", "nor", "not", "never", "n't"]:
    STOPWORDS.discard(w)

In [None]:
def anonymize_text(text: str) -> str:
    text = EMAIL_RE.sub("[EMAIL]", text)
    text = PHONE_RE.sub("[PHONE]", text)
    text = URL_RE.sub("[URL]", text)

    doc = nlp(text)
    redacted = text

    for ent in sorted(doc.ents, key=lambda e: e.start_char, reverse=True):
        if ent.label_ in {"PERSON", "ORG", "GPE", "LOC"}:
            redacted = redacted[:ent.start_char] + f"[{ent.label_}]" + redacted[ent.end_char:]

    return redacted

def clean_text(text: str) -> str:
    text = text.lower()
    # keep placeholders like [email] by allowing brackets
    text = re.sub(r"[^\w\s\[\]]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    tokens = [
        t for t in text.split()
        if t not in STOPWORDS and len(t) > 2
    ]
    return " ".join(tokens)

def preprocess(val) -> str:
    # Handles NaN and non-strings safely
    if not isinstance(val, str):
        return ""
    if not val.strip():
        return ""
    text = anonymize_text(val)
    return clean_text(text)

In [None]:
# -------- DETECT FREE-TEXT COLUMNS --------
text_cols = [
    c for c in df.columns
    if any(k in c.lower() for k in ["comment", "comments", "what", "other", "appreciate", "improve"])
]

rows = []

for _, row in df.iterrows():
    instructor_name = str(row.get("crs_dir", "") or "").strip()
    course_number = str(row.get("crsnum", "") or "").strip()
    course_name = str(row.get("crsname", "") or "").strip()

    for col in text_cols:
        raw_text = row.get(col, "")
        cleaned = preprocess(raw_text)
        
        # Skip very short cleaned strings
        if len(cleaned.split()) < 3:
            continue

        col_l = col.lower()
        target = "Instructor" if any(k in col_l for k in ["instructor", "professor", "teacher", "faculty"]) else "Course"

        rows.append({
            "TargetType": target,
            "InstructorName": instructor_name,
            "CourseNumber": course_number,
            "CourseName": course_name,
            "RawText": anonymize_text(raw_text) if isinstance(raw_text, str) else "",
            "TextClean": cleaned
        })

out_df = pd.DataFrame(rows)
out_df.to_csv(output_csv, index=False)

print(f"Exported {len(out_df)} cleaned text rows")

Exported 2331 cleaned text rows
