In [None]:
!pip install PyPDF2
!pip install scikit-learn
!pip install tqdm

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import os
import PyPDF2
import pandas as pd
import re
import warnings
import numpy as np
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import zipfile
from google.colab import files
from transformers import BertTokenizer, BertModel
import joblib

In [None]:
# Load the pretrained BERT tokenizer and encoder once, from the same checkpoint.
# Both objects are read as module-level globals by get_bert_embeddings.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def sanitize_text(text):
    """Mask phone-number-like digit runs in ``text``.

    Any ten digits grouped 3-3-4, with an optional ``-`` or ``.`` between
    the groups and word boundaries on both sides, are replaced by the
    literal ``[REDACTED]``. All other text is returned unchanged.
    """
    phone_pattern = re.compile(r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b")
    return phone_pattern.sub("[REDACTED]", text)

def get_bert_embeddings(text):
    """Return the BERT [CLS] embedding for ``text`` as a NumPy vector.

    Relies on the module-level ``tokenizer`` and ``model``. The input is
    truncated to at most 512 tokens, the forward pass runs without gradient
    tracking, and warnings raised during the forward pass are suppressed.
    Only the first item of the batch is returned.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        with torch.no_grad():
            model_output = model(**encoded)
    # Position 0 of the sequence axis is the [CLS] token.
    cls_vectors = model_output.last_hidden_state[:, 0, :]
    return cls_vectors.cpu().numpy()[0]

In [None]:
def process_file(filepath):
    """Read the text content of a ``.txt`` or ``.pdf`` file.

    The extension check is case-insensitive, so ``REPORT.PDF`` and
    ``notes.TXT`` are handled the same as their lowercase forms.

    Args:
        filepath: Path to the file to read.

    Returns:
        The file's text as a string (``.txt`` files are decoded as UTF-8;
        for PDFs, the extracted text of every page concatenated in page
        order), or ``None`` if the extension is unsupported or the file
        could not be read. Read errors are reported with ``print`` rather
        than raised.
    """
    try:
        lowered_path = str(filepath).lower()

        if lowered_path.endswith(".txt"):
            with open(filepath, "r", encoding="utf-8") as file:
                return file.read()

        if lowered_path.endswith(".pdf"):
            with open(filepath, "rb") as f:
                pdf_reader = PyPDF2.PdfReader(f)
                # extract_text() can come back empty/None for a page; treat that as "".
                return "".join(page.extract_text() or "" for page in pdf_reader.pages)

        return None  # unsupported file type

    except Exception as e:
        # Deliberately broad best-effort: a missing file, a decode error, or a
        # malformed PDF should skip this file rather than abort the whole run.
        print(f"Error reading {filepath}: {e}")
        return None


In [None]:
def process_folder(folder_path):
    """Build a labelled dataset from the ``.pdf`` and ``.txt`` files in a folder.

    The label comes from the file extension (matched case-insensitively):
    ``.pdf`` files get label 1 (coherent) and ``.txt`` files get label 0
    (incoherent). Files whose text is missing or whitespace-only are skipped.
    Only the folder's direct entries are considered.

    Args:
        folder_path: Directory to scan.

    Returns:
        A DataFrame with columns ``text``, ``label`` and ``filename``, one row
        per usable file, ordered by filename. The frame is empty (but still has
        those columns) when the folder is invalid or nothing usable was found.
    """
    columns = ["text", "label", "filename"]

    # Check if folder exists
    if not os.path.isdir(folder_path):
        print(f"Error: {folder_path} is not a valid directory")
        return pd.DataFrame(columns=columns)

    all_data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # (and therefore any seeded train/test split downstream) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        lowered_name = filename.lower()
        if lowered_name.endswith(".pdf"):
            label = 1
        elif lowered_name.endswith(".txt"):
            label = 0
        else:
            continue  # unsupported file type

        text = process_file(os.path.join(folder_path, filename))
        # Only keep files that yielded meaningful text
        if text and text.strip():
            all_data.append({"text": text, "label": label, "filename": filename})

    df = pd.DataFrame(all_data, columns=columns)

    if df.empty:
        print("No valid files found or processed")
    else:
        # Print dataset statistics
        coherent_count = int(df["label"].sum())
        incoherent_count = len(df) - coherent_count
        print(f"Created dataset with {len(df)} examples ({coherent_count} coherent, {incoherent_count} incoherent)")

    return df

In [None]:
def extract_features(df):
    """Turn the ``text`` column of ``df`` into a feature array.

    Each document is embedded with ``get_bert_embeddings`` (progress is shown
    with tqdm) and the per-document vectors are stacked into one NumPy array,
    in row order.
    """
    print("Extracting BERT embeddings...")
    documents = tqdm(df['text'], desc="Processing documents")
    vectors = [get_bert_embeddings(document) for document in documents]
    return np.array(vectors)

In [None]:
def evaluate_model(model, X_test, y_test):
    """Evaluate a fitted classifier on held-out data and print a summary.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix for the test examples.
        y_test: True labels for the test examples.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``. Precision, recall and
        F1 are those of the positive class (label 1); they are 0 when the
        report contains no entry for that class.
    """
    y_pred = model.predict(X_test)

    # Compare as plain arrays so the element-wise match does not depend on the
    # container type (or index) of y_test.
    accuracy = np.mean(np.asarray(y_pred) == np.asarray(y_test))

    # With output_dict=True the per-class entries are keyed by the label's
    # string form, so an int label 1 appears as "1" and a float label as "1.0".
    report = classification_report(y_test, y_pred, output_dict=True)
    positive_metrics = next(
        (report[key] for key in ("1", "1.0") if key in report),
        None,
    )
    if positive_metrics is None:
        precision, recall, f1 = 0, 0, 0
    else:
        precision = positive_metrics['precision']
        recall = positive_metrics['recall']
        f1 = positive_metrics['f1-score']

    # Print results
    print("Model Evaluation Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred))

    return accuracy, precision, recall, f1

In [None]:
def train_and_evaluate_model(X, y):
    """Fit the coherence classifier and report its held-out performance.

    The data is split 80/20 with stratification on ``y`` and a fixed seed.
    A class-balanced logistic regression is trained on the larger part and
    evaluated on the smaller part via ``evaluate_model``.

    Returns:
        ``(fitted_model, X_test, y_test)``.
    """
    split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    X_train, X_test, y_train, y_test = split_parts

    print("Training model...")
    model_coherence = LogisticRegression(max_iter=2000, class_weight='balanced')
    model_coherence.fit(X_train, y_train)

    print("Coherence Model Evaluation:")
    evaluate_model(model_coherence, X_test, y_test)

    return model_coherence, X_test, y_test

In [None]:
def predict_coherence(model, text):
    """Predict the coherence label of a new text.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        text: Raw document text.

    Returns:
        ``(label, confidence)`` where ``label`` is the predicted class and
        ``confidence`` is the probability the model assigns to that class.
    """
    # Embed the text and shape it as a single-row feature matrix.
    features = get_bert_embeddings(text).reshape(1, -1)

    predicted_label = model.predict(features)[0]
    probabilities = model.predict_proba(features)[0]

    # predict_proba columns follow model.classes_, not the label values, so
    # find the column for the predicted label instead of indexing by the label.
    class_labels = getattr(model, "classes_", None)
    if class_labels is None:
        # No class listing available: fall back to treating the label as the column index.
        column = predicted_label
    else:
        column = list(class_labels).index(predicted_label)

    return predicted_label, probabilities[column]

In [None]:
# Upload the dataset archive through the Google Colab file picker and unpack
# every uploaded .zip into /content/sample_pdfs. Non-zip uploads are ignored.
uploaded = files.upload()
for filename in uploaded:
    if not filename.endswith('.zip'):
        continue
    sample_dir = '/content/sample_pdfs'
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(sample_dir)
    print(f"Extracted {filename} to {sample_dir}")

Saving sample.zip to sample.zip
Extracted sample.zip to /content/sample_pdfs


In [None]:
if __name__ == "__main__":
    # Train the coherence classifier from a user-supplied folder and save it.
    #
    # Early exits use `raise SystemExit` rather than `exit()`: inside a notebook
    # kernel `exit()` requests a kernel shutdown instead of just ending this
    # cell, whereas SystemExit stops the cell (and still ends a plain script).

    # train_and_evaluate_model holds out 20% of the data with stratification.
    # scikit-learn needs at least 2 samples of every class for that, and the
    # held-out part must have room for one sample per class; with two classes
    # and a 20% test share that means at least 6 files overall.
    MIN_FILES_PER_CLASS = 2
    MIN_FILES_TOTAL = 6

    # Get folder path
    folder_path = input("Enter the path to the folder containing files: ")
    if not os.path.isdir(folder_path):
        print("Invalid folder path")
        raise SystemExit

    # Process folder - PDFs are coherent, TXTs are incoherent
    df = process_folder(folder_path)

    # Check that both classes are represented well enough to split and train.
    # An empty result may come back without a 'label' column, so guard for that.
    has_labels = 'label' in df.columns
    coherent_total = int((df['label'] == 1).sum()) if has_labels else 0
    incoherent_total = int((df['label'] == 0).sum()) if has_labels else 0
    if (len(df) < MIN_FILES_TOTAL
            or min(coherent_total, incoherent_total) < MIN_FILES_PER_CLASS):
        print("Not enough valid files found for training")
        print(
            f"Need at least {MIN_FILES_TOTAL} readable files with at least "
            f"{MIN_FILES_PER_CLASS} per class; found {coherent_total} coherent "
            f"and {incoherent_total} incoherent."
        )
        raise SystemExit

    # Extract features
    X = extract_features(df)
    y = df['label']

    # Train and evaluate model
    model_coherence, X_test, y_test = train_and_evaluate_model(X, y)

    # Save the model
    model_filename = "coherence_model.joblib"
    joblib.dump(model_coherence, model_filename)
    print(f"Model saved to {model_filename}")

Enter the path to the folder containing files: /content/sample_pdfs/QA_test_dataset




Created dataset with 49 examples (25 coherent, 24 incoherent)
Extracting BERT embeddings...


Processing documents: 100%|██████████| 49/49 [00:57<00:00,  1.17s/it]

Training model...
Coherence Model Evaluation:
Model Evaluation Results:
Accuracy: 0.7000
Precision: 0.6667
Recall: 0.8000
F1 Score: 0.7273

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.60      0.67         5
           1       0.67      0.80      0.73         5

    accuracy                           0.70        10
   macro avg       0.71      0.70      0.70        10
weighted avg       0.71      0.70      0.70        10

Model saved to coherence_model.joblib





In [None]:
# Example: classify individual files with the trained model.
# Keeps prompting for a path until the user types 'quit' (case-insensitive).
while True:
    test_file_path = input("Enter path to a test file (or 'quit' to exit): ")
    if test_file_path.lower() == 'quit':
        break

    if not os.path.exists(test_file_path):
        print("File not found.")
        continue

    test_text = process_file(test_file_path)
    if not test_text:
        # Unsupported type, read failure, or an empty document.
        print("Could not read the file.")
        continue

    prediction, confidence = predict_coherence(model_coherence, test_text)
    verdict = 'Coherent' if prediction == 1 else 'Incoherent'
    print(f"Text coherence prediction: {verdict}")
    print(f"Confidence: {confidence:.2f}")

Enter path to a test file (or 'quit' to exit): /content/sample_pdfs/sample/2004.14646.pdf
Text coherence prediction: Coherent
Confidence: 0.53
Enter path to a test file (or 'quit' to exit): /content/sample_pdfs/sample/incoherent_1909.12200.txt
Text coherence prediction: Incoherent
Confidence: 0.66
Enter path to a test file (or 'quit' to exit): /content/sample_pdfs/sample/2005.07186.pdf
Text coherence prediction: Coherent
Confidence: 0.84
Enter path to a test file (or 'quit' to exit): /content/sample_pdfs/sample/incoherent_1909.13739v1.txt
Text coherence prediction: Incoherent
Confidence: 0.78
Enter path to a test file (or 'quit' to exit): quit
