In [None]:
# ============================================================
# FAQ Dataset Preprocessing Pipeline
# Input:  dataset/single_qna.csv  (raw Amazon product Q&A)
# Output: dataset/faq_data.csv    (clean question + answer pairs)
# ============================================================

%pip install beautifulsoup4 --quiet

import os
import re
import warnings

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')

# Read the raw Q&A export, then report its dimensions and column names
raw_df = pd.read_csv("dataset/single_qna.csv")
print(
    f"Raw dataset shape: {raw_df.shape}",
    f"\nColumns: {raw_df.columns.tolist()}",
    sep="\n",
)
raw_df.head(3)  # last expression, so it is rendered as the cell output

In [None]:
# ============================================================
# STEP 1: Keep only relevant columns & initial inspection
# ============================================================

# Narrow the raw frame to the three fields this notebook inspects or keeps
df = raw_df.loc[:, ['Question', 'Answer', 'Category']].copy()

# Quick profile: size, missing values, exact duplicate rows, category spread
print(
    f"Shape after selecting columns: {df.shape}",
    f"\nNull values:\n{df.isnull().sum()}",
    f"\nDuplicate rows: {df.duplicated().sum()}",
    f"\nCategories: {df['Category'].nunique()}",
    f"\nSample categories: {df['Category'].value_counts().head(10)}",
    sep="\n",
)

In [None]:
# ============================================================
# STEP 2: Drop nulls and duplicates
# ============================================================

# A usable FAQ row needs both a question and an answer
qa_cols = ['Question', 'Answer']

df = df.dropna(subset=qa_cols)
print(f"After dropping nulls: {df.shape}")

# Exact (question, answer) repeats add nothing; the first occurrence is kept
df = df.drop_duplicates(subset=qa_cols)
print(f"After dropping duplicates: {df.shape}")

In [None]:
# ============================================================
# STEP 3: Text cleaning function
# ============================================================

def clean_text(text):
    """Normalize one raw text field into a single-line, tag-free string.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (for example the
            float NaN pandas uses for missing values) is treated as empty.

    Returns:
        str: The text with HTML markup removed, every run of whitespace
        collapsed to a single space, and no leading/trailing whitespace.
        Returns ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Strip markup. A space separator is passed so that text on either side
    # of a tag (e.g. "line one<br>line two") is not fused into one word;
    # get_text() joins the pieces with an empty string by default.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Collapse whitespace runs (the separators added above plus any
    # newlines/tabs) to a single space, then trim both ends.
    return re.sub(r'\s+', ' ', text).strip()

# Spot-check the cleaner on one raw answer: first 200 chars, before vs. after
sample = raw_df['Answer'].iloc[6]
print("BEFORE:", sample[:200])

cleaned_sample = clean_text(sample)
print("\nAFTER:", cleaned_sample[:200])

In [None]:
# ============================================================
# STEP 4: Apply cleaning to Question and Answer columns
# ============================================================

# Run both text columns through clean_text. assign() returns a new frame
# with the two columns replaced, leaving column order unchanged.
df = df.assign(
    Question=df['Question'].apply(clean_text),
    Answer=df['Answer'].apply(clean_text),
)

print("Cleaning complete!")
df.head(5)

In [None]:
# ============================================================
# STEP 5: Filter out junk / too-short answers
# ============================================================

# Build every keep-condition up front, then filter the frame in one pass
answers = df['Answer']
questions = df['Question']

keep = (
    (answers.str.strip() != '?')       # answer is not a bare "?"
    & (answers.str.strip() != '')      # answer is not blank
    & (questions.str.strip() != '')    # question is not blank
    & (answers.str.len() >= 10)        # answer has at least 10 characters
    & (questions.str.len() >= 10)      # question has at least 10 characters
)
df = df[keep]

print(f"After filtering junk: {df.shape}")
print(f"\nSample questions:")
for question in df['Question'].head(5):
    print(f"  - {question[:100]}")

In [None]:
# ============================================================
# STEP 6: Remove duplicate questions (keep best answer — longest)
# ============================================================

# One row per question. Rows are ranked by answer length (longest first)
# before de-duplicating, so the row kept for each question is the one that
# sorted first, i.e. the one with the longest answer. The helper column is
# dropped again at the end of the chain.
df = (
    df.assign(answer_len=df['Answer'].str.len())
      .sort_values('answer_len', ascending=False)
      .drop_duplicates(subset=['Question'], keep='first')
      .drop(columns=['answer_len'])
)

print(f"After dedup by question: {df.shape}")

In [None]:
# ============================================================
# STEP 7: Truncate very long answers to avoid SBERT token limits
# ============================================================

MAX_ANSWER_LEN = 512  # cap, measured in characters, applied to every answer

# Hard cut at the cap; answers at or under the cap pass through unchanged
df = df.assign(Answer=df['Answer'].str.slice(stop=MAX_ANSWER_LEN))

answer_lengths = df['Answer'].str.len()
print(f"Answer length stats after truncation:")
print(answer_lengths.describe())

In [None]:
# ============================================================
# STEP 8: Sample to a manageable size (optional — for faster indexing)
# ============================================================
# The full dataset is ~1M+ rows. 
# For a production FAQ demo, 50K-100K is plenty.
# Adjust SAMPLE_SIZE as needed.

SAMPLE_SIZE = 50_000

# Down-sample only when there are more rows than the target; the fixed
# random_state makes the draw repeatable across runs.
if len(df) <= SAMPLE_SIZE:
    print(f"Dataset small enough ({df.shape[0]} rows), no sampling needed")
else:
    df = df.sample(n=SAMPLE_SIZE, random_state=42)
    print(f"Sampled down to: {df.shape[0]} rows")

In [None]:
# ============================================================
# STEP 9: Rename columns to standard FAQ format & reset index
# ============================================================

# Project to the two FAQ fields, lower-case their names, renumber rows from 0
faq_df = (
    df[['Question', 'Answer']]
    .rename(columns={'Question': 'question', 'Answer': 'answer'})
    .reset_index(drop=True)
)

print(f"Final FAQ dataset shape: {faq_df.shape}")
print(f"Nulls: {faq_df.isnull().sum().sum()}")
faq_df.head(10)

In [None]:
# ============================================================
# STEP 10: Save the final FAQ dataset
# ============================================================

# Persist the FAQ pairs; index=False keeps the positional index out of the file
faq_df.to_csv("dataset/faq_data.csv", index=False)
print(f"Saved to dataset/faq_data.csv")
print(f"Total FAQ pairs: {len(faq_df)}")

# Report the actual on-disk size. os.path.getsize returns bytes, converted
# here to megabytes for readability.
file_size_mb = os.path.getsize("dataset/faq_data.csv") / (1024 * 1024)
print(f"\nFile size: {file_size_mb:.2f} MB")

In [None]:
# ============================================================
# STEP 11: Quick sanity check — verify the saved file
# ============================================================

# Read the saved file back from disk to confirm it loads with the expected
# shape and column names
check = pd.read_csv("dataset/faq_data.csv")
print(f"Loaded back: {check.shape}")
print(f"Columns: {check.columns.tolist()}")
print(f"\nRandom samples:")

# Fixed seed so the same five rows are shown on every run; each field is
# previewed up to its first 80 characters
preview = check.sample(5, random_state=1)
for question, answer in zip(preview['question'], preview['answer']):
    print(f"\n  Q: {question[:80]}...")
    print(f"  A: {answer[:80]}...")