<a href="https://colab.research.google.com/github/TamannaAhmad/research-paper-optimizer/blob/main/quality_assurance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import PyPDF2
import pandas as pd
import re
import warnings
import numpy as np
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import streamlit as st
import tempfile
# from google.colab import files
# from google.colab import drive
from transformers import BertTokenizer, BertModel
import joblib
from huggingface_hub import hf_hub_download

In [None]:
# drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# Load the pretrained "bert-base-uncased" tokenizer and encoder once at module
# level; get_bert_embeddings() reads both of these names as globals.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

In [None]:
def sanitize_text(text):
    """Mask phone-number-like digit runs in *text* with "[REDACTED]".

    A match is ten digits grouped 3-3-4, with an optional "-" or "." between
    the groups, delimited by word boundaries on both sides.
    """
    phone_pattern = re.compile(r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b")
    return phone_pattern.sub("[REDACTED]", text)

def get_bert_embeddings(text):
    """Return the BERT [CLS] embedding of *text* as a NumPy vector.

    Relies on the module-level ``tokenizer`` and ``model``. The input is
    truncated to at most 512 tokens, and only the first row of the resulting
    batch is returned.
    """
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    )
    # Inference only: silence library warnings and skip gradient tracking.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        with torch.no_grad():
            result = model(**encoded)
    # Position 0 of each sequence holds the [CLS] token.
    cls_batch = result.last_hidden_state[:, 0, :]
    return cls_batch.cpu().numpy()[0]

In [19]:
def process_file(filepath):
    """Read the text content of a ``.txt`` or ``.pdf`` file.

    Args:
        filepath: Path to the file. The extension, matched
            case-insensitively, selects the reader.

    Returns:
        The file's text as a string (possibly empty), or ``None`` when the
        extension is unsupported or the file could not be read.
    """
    lowered_path = filepath.lower()
    try:
        if lowered_path.endswith(".txt"):
            with open(filepath, "r", encoding="utf-8") as file:
                return file.read()
        if lowered_path.endswith(".pdf"):
            with open(filepath, "rb") as f:
                pdf_reader = PyPDF2.PdfReader(f)
                # `or ""` guards against a falsy result from extract_text().
                return "".join(
                    page.extract_text() or "" for page in pdf_reader.pages
                )
        return None  # unsupported file type
    except Exception as e:
        # Best-effort reader: any failure (missing file, bad encoding,
        # malformed PDF, ...) is reported and mapped to None for the caller.
        print(f"Error reading {filepath}: {e}")
        return None


In [20]:
def process_folder(folder_path):
    """Build a labelled dataset from the ``.txt`` and ``.pdf`` files in a folder.

    The label comes from the file extension (matched case-insensitively):
    PDF files get label 1 (counted as coherent) and TXT files get label 0
    (counted as incoherent). Files whose text cannot be read, or is blank,
    are skipped.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A DataFrame with columns ``text``, ``label`` and ``filename``. It has
        zero rows (but the same columns) when the folder is invalid or no
        usable file was found.
    """
    columns = ["text", "label", "filename"]

    # Check if folder exists
    if not os.path.isdir(folder_path):
        print(f"Error: {folder_path} is not a valid directory")
        return pd.DataFrame(columns=columns)

    all_data = []
    # Sorted so row order does not depend on the filesystem's listing order.
    for filename in sorted(os.listdir(folder_path)):
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            label = 1
        elif lowered.endswith(".txt"):
            label = 0
        else:
            continue  # process only supported file types

        text = process_file(os.path.join(folder_path, filename))
        # Only keep files that yielded meaningful (non-blank) text.
        if text and text.strip():
            all_data.append({"text": text, "label": label, "filename": filename})

    df = pd.DataFrame(all_data, columns=columns)
    if df.empty:
        print("No valid files found or processed")
        return df

    # Print dataset statistics
    coherent_count = int(df["label"].sum())
    incoherent_count = len(df) - coherent_count
    print(f"Created dataset with {len(df)} examples ({coherent_count} coherent, {incoherent_count} incoherent)")
    return df

In [21]:
def extract_features(df):
    """Compute one BERT embedding per entry of ``df['text']``.

    Returns a NumPy array built from the per-document embeddings, in the
    same order as the rows of ``df``.
    """
    print("Extracting BERT embeddings...")
    documents = tqdm(df['text'], desc="Processing documents")
    return np.array([get_bert_embeddings(document) for document in documents])

In [22]:
def evaluate_model(model, X_test, y_test):
    """Print and return evaluation metrics for a fitted classifier.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix of the evaluation examples.
        y_test: True labels aligned with ``X_test``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        refer to the positive class (label 1) and are 0 when that class does
        not appear in the classification report.
    """
    y_pred = model.predict(X_test)

    # Fraction of predictions that match the true labels.
    accuracy = np.mean(y_pred == y_test)

    report = classification_report(y_test, y_pred, output_dict=True)

    # The dict report is keyed by the string form of each label, so the
    # positive class appears as "1" (integer labels) or "1.0" (float labels).
    precision, recall, f1 = 0, 0, 0
    for key in ("1", "1.0"):
        if key in report:
            positive = report[key]
            precision = positive['precision']
            recall = positive['recall']
            f1 = positive['f1-score']
            break

    # Print results
    print("Model Evaluation Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred))

    return accuracy, precision, recall, f1

In [23]:
def train_and_evaluate_model(X, y):
    """Fit a coherence classifier on a stratified 80/20 split and evaluate it.

    Returns the fitted model together with the held-out features and labels
    that were used for the evaluation.
    """
    features_fit, features_holdout, labels_fit, labels_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print("Training model...")
    classifier = LogisticRegression(max_iter=2000, class_weight='balanced')
    classifier.fit(features_fit, labels_fit)

    print("Coherence Model Evaluation:")
    evaluate_model(classifier, features_holdout, labels_holdout)

    return classifier, features_holdout, labels_holdout

In [24]:
def predict_coherence(model, text):
    """Classify *text* and report the probability of the predicted class.

    Returns a ``(label, confidence)`` pair, where ``confidence`` is the
    model's probability entry at the position given by the predicted label.
    """
    # Single-sample feature matrix: one row holding the BERT embedding.
    features = get_bert_embeddings(text).reshape(1, -1)

    predicted_label = model.predict(features)[0]
    class_probabilities = model.predict_proba(features)[0]

    # The predicted label doubles as the index into the probability row.
    return predicted_label, class_probabilities[predicted_label]

In [26]:
# Script entry point: build the training set, fit the coherence classifier,
# evaluate it on the separate test folder, and save the fitted model.
if __name__ == "__main__":
    train_folder_path = '/content/drive/MyDrive/Research_Paper_Optimizer/datasets/QA_training_dataset'
    test_folder_path = '/content/drive/MyDrive/Research_Paper_Optimizer/datasets/QA_test_dataset'

    # Process training folder - PDFs are coherent, TXTs are incoherent
    print("Processing training dataset...")
    train_df = process_folder(train_folder_path)

    # Need at least two coherent and two incoherent examples. Count per class
    # rather than total rows, so a single-class folder is rejected up front.
    label_counts = train_df['label'].value_counts() if len(train_df) > 0 else {}
    if label_counts.get(1, 0) < 2 or label_counts.get(0, 0) < 2:
        print("Not enough valid files found for training")
        # SystemExit instead of exit(): exit() is an interactive helper that
        # is not guaranteed to exist, and this is a failure, so exit non-zero.
        raise SystemExit(1)

    # Extract features
    X_train = extract_features(train_df)
    y_train = train_df['label']

    print("Training model...")
    model_coherence = LogisticRegression(max_iter=2000, class_weight='balanced')
    model_coherence.fit(X_train, y_train)

    print("Processing test dataset...")
    test_df = process_folder(test_folder_path)

    if len(test_df) > 0:
        # Extract features from test data
        X_test = extract_features(test_df)
        y_test = test_df['label']

        # Evaluate model on test data
        print("Evaluating model on test dataset:")
        evaluate_model(model_coherence, X_test, y_test)
    else:
        print("No test data found for evaluation")

    # Save the model to Google Drive
    model_filename = "/content/drive/MyDrive/Research_Paper_Optimizer/Quality_Assurance/coherence_model.joblib"
    # Create the target directory first so the dump cannot fail on a missing
    # folder after the embedding and training work has already been done.
    os.makedirs(os.path.dirname(model_filename), exist_ok=True)
    joblib.dump(model_coherence, model_filename)
    print(f"Model saved to {model_filename}")

Processing training dataset...
Created dataset with 200 examples (100 coherent, 100 incoherent)
Extracting BERT embeddings...


Processing documents: 100%|██████████| 200/200 [05:43<00:00,  1.72s/it]


Training model...
Processing test dataset...




Created dataset with 49 examples (25 coherent, 24 incoherent)
Extracting BERT embeddings...


Processing documents: 100%|██████████| 49/49 [01:26<00:00,  1.76s/it]

Evaluating model on test dataset:
Model Evaluation Results:
Accuracy: 0.6327
Precision: 0.6296
Recall: 0.6800
F1 Score: 0.6538

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.58      0.61        24
           1       0.63      0.68      0.65        25

    accuracy                           0.63        49
   macro avg       0.63      0.63      0.63        49
weighted avg       0.63      0.63      0.63        49

Model saved to /content/drive/MyDrive/Research_Paper_Optimizer/Quality_Assurance/coherence_model.joblib





In [None]:
def run_quality_assurance(uploaded_file = None):
    """Render the Streamlit "Quality Assurance" page for one uploaded document.

    Downloads the trained coherence classifier from the Hugging Face Hub,
    extracts the text of ``uploaded_file`` and, when the user presses the
    button, shows the predicted coherence label with its confidence.

    Args:
        uploaded_file: Upload object exposing ``name`` and ``getbuffer()``,
            or ``None`` when nothing has been uploaded yet.

    Returns:
        None. All results and errors are reported through Streamlit widgets.
    """
    try:
        repo_id = "TamannaAhmad/logical_coherence_model"
        filename = "coherence_model.joblib"
        model_path = hf_hub_download(repo_id=repo_id, filename=filename)
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only load artifacts from a repository you trust.
        coherence_model = joblib.load(model_path)
    except Exception as e:
        st.error(f"Failed to load model from Hugging Face: {e}")
        return

    # Process the file
    st.title("Quality Assurance")
    if not uploaded_file:
        st.info("Please upload a PDF file in the sidebar to begin")
        return

    # Write the upload to a private temporary file instead of a path built
    # from the client-supplied name. Only the lower-cased extension is kept,
    # because process_file() picks its reader from it.
    suffix = os.path.splitext(uploaded_file.name)[1].lower()
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getbuffer())
        temp_file_path = temp_file.name
    try:
        text = process_file(temp_file_path)
    finally:
        # Best-effort cleanup; a leftover temp file must not break the page.
        try:
            os.remove(temp_file_path)
        except OSError:
            pass

    if text is None:
        st.error("Failed to process the file. Please check if it's a valid PDF or TXT file.")
        return
    if not text.strip():
        # Nothing to analyse; a prediction on blank text would be meaningless.
        st.warning("No extractable text was found in the file.")
        return
    st.info("Processed the file.")

    try:
        if st.button("Perform Coherence Analysis"):
            with st.spinner("Analysing..."):
                prediction, confidence = predict_coherence(coherence_model, text)

            # Display results
            st.subheader("Coherence Analysis Results")

            if prediction == 1:
                st.success("✅ This document appears COHERENT (well-structured)")
            else:
                st.warning("⚠️ This document appears INCOHERENT (poorly structured)")

            st.write(f"Confidence: {confidence:.2%}")

            # Show explanation
            if prediction == 1:
                st.info("""
                **Coherent documents typically:**
                - Have clear logical flow between sections
                - Maintain consistent terminology
                - Follow standard academic structure
                - Contain well-formed paragraphs and transitions
                """)
            else:
                st.info("""
                **Incoherent documents may:**
                - Lack clear organization
                - Have abrupt topic changes
                - Contain repetitive or disjointed content
                - Miss important structural elements
                """)

    except Exception as e:
        st.error(f"Error during prediction: {e}")