In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.corpus import stopwords
import re

In [None]:
# Path to the raw TREC-ToT corpus (JSON Lines: one JSON document per line).
# Update it if the Kaggle input folder structure differs.
jsonl_file_path = "/kaggle/input/trec2023/TREC2023 Data/TREC-ToT/TREC-TOT/corpus.jsonl"

# Parse only the first record of the file. `json` is already imported in the
# notebook's first cell, so it is not imported again here.
with open(jsonl_file_path, 'r', encoding='utf-8') as f:
    first_line = json.loads(f.readline())

# Display the structure of the first document
print(first_line)


# **DATA PRE-PROCESSING**

In [None]:
# Destination of the preprocessed corpus, written as JSON Lines by
# preprocess_corpus() and read back by the later cells.
OUTPUT_PATH = "/kaggle/working/preprocessed_corpus.jsonl"

In [None]:
# Innermost {{...}} template: neither "{{" nor "}}" may start between the
# braces, so removing matches repeatedly peels nested templates inside-out.
_TEMPLATE_RE = re.compile(r'\{\{(?:(?!\{\{|\}\}).)*\}\}', re.DOTALL)
# [[target|label]] or [[target]]; group 2 is the text that stays visible.
_WIKILINK_RE = re.compile(r'\[\[([^\|\]]+\|)?([^\]]+)\]\]')
# Self-closing reference such as <ref name="x" />; it has no matching </ref>.
_REF_SELF_CLOSING_RE = re.compile(r'<ref\b[^>]*/\s*>', re.IGNORECASE)
# Paired reference <ref ...>...</ref>, which may span several lines.
_REF_PAIRED_RE = re.compile(r'<ref\b[^>]*>.*?</ref\s*>', re.DOTALL | re.IGNORECASE)


def clean_wikipedia_markup(text):
    """Strip Wikipedia markup from ``text`` and return it normalised.

    Steps, in order:
      1. remove ``{{templates}}``, including nested and multi-line ones;
      2. replace ``[[target|label]]`` / ``[[target]]`` links by their label;
      3. remove ``<ref .../>`` and ``<ref ...>...</ref>`` references;
      4. collapse every whitespace run to one space and trim the ends;
      5. lowercase the result.

    Args:
        text: Raw document text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned, lowercased string; ``""`` when ``text`` is empty or None.
    """
    if not text:
        return ""

    # Each pass removes only innermost templates; loop until none are left so
    # that "{{outer {{inner}} tail}}" disappears completely instead of leaving
    # " tail}}" behind.
    while True:
        text, removed = _TEMPLATE_RE.subn('', text)
        if not removed:
            break

    text = _WIKILINK_RE.sub(r'\2', text)  # Convert [[link|text]] to "text"

    # Self-closing refs must go first: otherwise the paired pattern would pair
    # one with a later </ref> and delete the article text in between.
    text = _REF_SELF_CLOSING_RE.sub('', text)
    text = _REF_PAIRED_RE.sub('', text)

    text = re.sub(r'\s+', ' ', text).strip()  # Remove excessive whitespace
    return text.lower()  # Convert text to lowercase

In [None]:
def preprocess_corpus(file_path, output_path):
    """Clean every document of a JSONL corpus and save the result as JSONL.

    Each non-blank input line is parsed as one JSON object. Its "text" is
    passed through clean_wikipedia_markup() and its "page_title" is lowercased
    and stripped. Documents whose cleaned text is empty are dropped.

    Args:
        file_path: Path of the input corpus (UTF-8, one JSON object per line).
        output_path: Path of the JSONL file to write; it is overwritten.

    Returns:
        A list of the kept documents in input order, each a dict with the keys
        "doc_id", "page_title" and "text". The same dicts are written to
        ``output_path``, one per line. Note that the whole cleaned corpus is
        held in memory by this list.

    Raises:
        KeyError: If a document has no "doc_id".
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    processed_corpus = []

    with open(file_path, 'r', encoding='utf-8') as f, open(output_path, 'w', encoding='utf-8') as out_f:
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the file) is
            # not a document; json.loads would raise on it.
            if not line.strip():
                continue

            doc = json.loads(line)
            # `or ""` also covers an explicit JSON null, which .get() returns
            # as None instead of falling back to the default.
            clean_text = clean_wikipedia_markup(doc.get("text") or "")
            page_title = (doc.get("page_title") or "").lower().strip()  # Convert title to lowercase

            if not clean_text:  # Ensure we don't keep empty documents
                continue

            cleaned_doc = {
                "doc_id": doc["doc_id"],
                "page_title": page_title,  # Include lowercase page title
                "text": clean_text  # Store cleaned and lowercase text
            }
            processed_corpus.append(cleaned_doc)
            out_f.write(json.dumps(cleaned_doc) + "\n")  # Save line-by-line

    return processed_corpus

In [None]:
# Run preprocessing on the raw corpus: preprocess_corpus() writes the cleaned
# documents to OUTPUT_PATH and returns them as the list `preprocessed_corpus`.
CORPUS_PATH = "/kaggle/input/trec2023/TREC2023 Data/TREC-ToT/TREC-TOT/corpus.jsonl"  # Adjust file path as needed
preprocessed_corpus = preprocess_corpus(CORPUS_PATH, OUTPUT_PATH)

# Echo the output file path as the cell's result
OUTPUT_PATH

In [None]:
# Preview up to five cleaned documents. The file is opened as UTF-8 like every
# other open() in this notebook, and zip() stops at the end of the file, so a
# corpus with fewer than five documents no longer feeds "" to json.loads.
with open(OUTPUT_PATH, 'r', encoding='utf-8') as f:
    for _, line in zip(range(5), f):
        print(json.loads(line))


In [None]:
csv_file_path = "/kaggle/working/preprocessed_corpus.csv"  # New path to save CSV

# Read the preprocessed JSONL back as a list of dicts, one per document.
# `pd` and `json` come from the notebook's first import cell, so they are not
# re-imported here.
with open(OUTPUT_PATH, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

df = pd.DataFrame(data)
df.to_csv(csv_file_path, index=False)  # Save CSV without the index column

# Display first few rows
print(df.head())

print(f"CSV file successfully saved at: {csv_file_path}")


# **EXPLORATORY DATA ANALYSIS (EDA)**

In [None]:

# Download NLTK's stopword corpus, then keep the English stopwords in a set
# so the membership tests in the word-frequency cell are fast.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


In [None]:
# Path of the preprocessed corpus (JSONL) read by the EDA cells below; it is
# the same location as OUTPUT_PATH in the preprocessing section.
preprocessed_corpus_path = "/kaggle/working/preprocessed_corpus.jsonl"

In [None]:
# Parse every line of the preprocessed JSONL file and build a DataFrame with
# one row per document.
with open(preprocessed_corpus_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

df = pd.DataFrame(data)

In [None]:
# Basic dataset overview

# Whitespace-delimited token count per document, computed once and reused for
# the three length statistics below.
text_lengths = df['text'].apply(lambda x: len(x.split()))

dataset_info = {
    "Total Documents": len(df),
    # Preprocessing stores a missing title as "" rather than null, so count
    # empty/blank titles as missing too; isna() alone would always report 0.
    "Missing Titles": df['page_title'].fillna("").str.strip().eq("").sum(),
    "Missing Texts": df['text'].isna().sum(),
    "Average Text Length": text_lengths.mean(),
    "Max Text Length": text_lengths.max(),
    "Min Text Length": text_lengths.min()
}

In [None]:
# Print the summary statistics collected in dataset_info
print(dataset_info)

In [None]:
# Add text length column (whitespace-delimited tokens per document)
df["text_length"] = df["text"].apply(lambda x: len(x.split()))

# Extract most frequent words (excluding stopwords).
# Tokenise document by document: this yields exactly the tokens of
# " ".join(df["text"]).split() without first building one string as large as
# the whole corpus.
all_words = [word for text in df["text"] for word in text.split()]
filtered_words = [word for word in all_words if word.lower() not in stop_words]
word_counts = Counter(filtered_words)
most_common_words = word_counts.most_common(20)  # Top 20 words


In [None]:
# Summarise the counter instead of printing it whole: it holds one entry per
# distinct token in the corpus, which would flood the cell output.
print(f"Distinct non-stopword tokens: {len(word_counts):,}")
print(f"Total non-stopword tokens: {sum(word_counts.values()):,}")

In [None]:
# Show the 20 most frequent non-stopword tokens as (word, count) pairs
print(most_common_words)