<a href="https://colab.research.google.com/github/VedantKothari01/DocInsight/blob/main/DocInsight_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


## DocInsight: Demo Version 0.1


In [None]:

!pip install -q sentence-transformers faiss-cpu transformers datasets spacy textstat python-docx pymupdf docx2txt nltk streamlit
!python -m spacy download en_core_web_sm

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m87.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Install complete. If Colab prompts to restart the runtime after spaCy install, please DO restart and then continue.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:

import os, json, math, tempfile, html, time
from pathlib import Path
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import faiss
import spacy
import textstat
import nltk
from nltk.tokenize import sent_tokenize
import docx2txt
import fitz  #PyMuPDF

print('Imports ready.')


Imports ready.


In [None]:

# --- Utilities: parsing PDFs and DOCX into text and sentence-splits ---

def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at ``path``, joined by newlines.

    The file is opened with PyMuPDF (``fitz``) and closed again as soon as the
    pages have been read, so repeated calls do not leak file handles.
    """
    # A PyMuPDF Document is a context manager; leaving the block closes it.
    with fitz.open(path) as doc:
        return "\n".join(page.get_text() for page in doc)

def extract_text(path):
    """Load the plain text of a .pdf, .docx or .txt file.

    The format is picked from the lower-cased file extension. Any other
    extension raises ValueError.
    """
    path = str(path)
    suffix = Path(path).suffix.lower()

    if suffix == '.pdf':
        return extract_text_from_pdf(path)
    if suffix == '.docx':
        return docx2txt.process(path)
    if suffix not in ('.txt',):
        raise ValueError('Unsupported file type: ' + suffix)

    # Plain text is read as UTF-8.
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()


def split_sentences(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    Every sentence is whitespace-stripped; anything that is 3 characters or
    shorter after stripping is dropped.
    """
    kept = []
    for raw in sent_tokenize(text):
        cleaned = raw.strip()
        if len(cleaned) > 3:
            kept.append(cleaned)
    return kept
print('Parsing utilities ready.')

Parsing utilities ready.


In [None]:

# --- Demo corpus and sample documents ---

# Reference sentences that the student text is compared against.
demo_corpus = [
    "Climate change is a critical global issue that affects agriculture and health.",
    "The effects of global warming include rising sea levels and more extreme weather.",
    "Machine learning improves many real world tasks such as image recognition and language modeling.",
    "Neural networks can approximate complex functions and are widely used in deep learning.",
    "The French Revolution began in 1789 and led to major political changes in Europe.",
    "Photosynthesis is the process by which green plants convert sunlight into energy.",
    "The mitochondrion is the powerhouse of the cell.",
    "In 1969, Neil Armstrong became the first person to walk on the Moon.",
    "The capital of France is Paris.",
    "SQL stands for Structured Query Language and is used to manage relational databases."
]

# Sample "student" submission: paraphrases of some corpus sentences plus one
# sentence that embeds a corpus sentence verbatim.
student_doc = """Global warming causes severe weather and sea level rise, which impacts agriculture.
In this essay I discuss how neural nets are used for image recognition tasks.
A commonly known fact: The capital of France is Paris.
I also talk about photosynthesis as the mechanism plants use to store solar energy.
"""

# Write both to disk so the file-based pipeline can be exercised later on.
demo_dir = Path('/mnt/data/demo')
demo_dir.mkdir(parents=True, exist_ok=True)
(demo_dir / 'student1.txt').write_text(student_doc, encoding='utf-8')
# The corpus file holds one sentence per line.
(demo_dir / 'corpus.txt').write_text('\n'.join(demo_corpus), encoding='utf-8')

print('Demo files created in /mnt/data/demo')

Demo files created in /mnt/data/demo


In [None]:

# --- Load SBERT (bi-encoder) and build FAISS index ---
# Build the retrieval side of the pipeline: embed every corpus sentence with
# the SBERT bi-encoder and store the vectors in a flat FAISS index.
print('Loading SBERT (this may take a minute)...')
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
print('Encoding corpus...')
# corpus_sentences is the list semantic_search maps FAISS row ids back to.
corpus_sentences = demo_corpus
corpus_embeddings = sbert_model.encode(corpus_sentences, convert_to_numpy=True, show_progress_bar=True)
import numpy as np
# normalize_L2 rescales the rows in place; with unit-length vectors the inner
# product computed by IndexFlatIP equals cosine similarity. This must happen
# before the vectors are added to the index.
faiss.normalize_L2(corpus_embeddings)
# Embedding dimensionality, taken from the encoded matrix.
d = corpus_embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(corpus_embeddings)
print('FAISS index built with', index.ntotal, 'sentences')

Loading SBERT (this may take a minute)...
Encoding corpus...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

FAISS index built with 10 sentences


In [None]:

# --- Search function and example ---
def semantic_search(sentence, top_k=5):
    """Return up to ``top_k`` corpus sentences closest to ``sentence``.

    The query is embedded with ``sbert_model``, L2-normalised and looked up in
    the notebook-level FAISS ``index``. Each hit is a dict holding the matched
    'sentence' and its 'score' as a Python float, in the order FAISS returns.
    """
    query_vec = sbert_model.encode([sentence], convert_to_numpy=True)
    faiss.normalize_L2(query_vec)
    scores, positions = index.search(query_vec, top_k)
    # Negative row ids do not point at a corpus sentence, so they are skipped.
    return [
        {'sentence': corpus_sentences[pos], 'score': float(score)}
        for score, pos in zip(scores[0], positions[0])
        if pos >= 0
    ]

# Sanity check: this query rewords the "effects of global warming" corpus
# sentence, so that sentence should come back with the highest score.
query = 'Rising sea levels and extreme storms are a result of global warming.'
print('Query:', query)
# `query` and `res` stay at notebook scope; the rerank example reuses them.
res = semantic_search(query, top_k=5)
for r in res:
    print(f"Score: {r['score']:.3f} -> {r['sentence']}")

Query: Rising sea levels and extreme storms are a result of global warming.
Score: 0.891 -> The effects of global warming include rising sea levels and more extreme weather.
Score: 0.511 -> Climate change is a critical global issue that affects agriculture and health.
Score: 0.197 -> Machine learning improves many real world tasks such as image recognition and language modeling.
Score: 0.179 -> Photosynthesis is the process by which green plants convert sunlight into energy.
Score: 0.080 -> Neural networks can approximate complex functions and are widely used in deep learning.


In [None]:

# --- Cross-encoder reranker: rerank top-k candidates for higher precision ---

print('Loading cross-encoder (this may take another minute)...')

#CrossEncoder from sentence_transformers
# Kept at notebook scope: rerank() reads this global to score (query, candidate) pairs.
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')  # compact reranker

def rerank(query, candidates):
    """Score every candidate against ``query`` with the cross-encoder and sort.

    ``candidates`` is a list of strings. Returns a list of dicts with keys
    'sentence' and 'rerank_score' (a Python float), highest score first.
    """
    scores = reranker.predict([[query, cand] for cand in candidates])
    scored = list(zip(candidates, scores))
    # Higher cross-encoder score means a better match.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [{'sentence': sent, 'rerank_score': float(val)} for sent, val in scored]

#Example rerank
# Re-score the bi-encoder hits for the same `query`; both `res` and `query`
# were defined by the semantic-search example.
cands = [r['sentence'] for r in res]
print('Reranked results:')
for item in rerank(query, cands):
    print(f"{item['rerank_score']:.3f} -> {item['sentence']}")

Loading cross-encoder (this may take another minute)...
Reranked results:
6.755 -> The effects of global warming include rising sea levels and more extreme weather.
-6.128 -> Climate change is a critical global issue that affects agriculture and health.
-10.636 -> Photosynthesis is the process by which green plants convert sunlight into energy.
-10.706 -> Machine learning improves many real world tasks such as image recognition and language modeling.
-10.735 -> Neural networks can approximate complex functions and are widely used in deep learning.


In [None]:

# spaCy pipeline used by stylometry_features for tokenisation, punctuation
# flags and part-of-speech tags.
nlp = spacy.load('en_core_web_sm')

def stylometry_features(sentence):
    """Compute a small set of stylometric features for one sentence.

    Returns a dict with, in this order:
      num_tokens          - number of alphabetic tokens
      avg_word_len        - mean character length of the alphabetic tokens
      flesch_reading_ease - textstat's Flesch reading-ease score
      ttr                 - distinct lower-cased words / alphabetic tokens
      punct_density       - punctuation tokens / all tokens
      noun_ratio          - tokens tagged NOUN / all tokens
      verb_ratio          - tokens tagged VERB / all tokens
    Every denominator is floored at 1, so empty input gives zeros, not an error.
    """
    doc = nlp(sentence)
    alpha_tokens = [tok for tok in doc if tok.is_alpha]
    n_alpha = max(1, len(alpha_tokens))
    n_all = max(1, len(doc))

    feats = {}
    feats['num_tokens'] = len(alpha_tokens)
    feats['avg_word_len'] = sum(len(tok.text) for tok in alpha_tokens) / n_alpha
    feats['flesch_reading_ease'] = textstat.flesch_reading_ease(sentence)

    # Type-token ratio approximation over the alphabetic tokens.
    vocabulary = {tok.text.lower() for tok in alpha_tokens}
    feats['ttr'] = len(vocabulary) / n_alpha
    feats['punct_density'] = sum(1 for tok in doc if tok.is_punct) / n_all

    # Share of tokens tagged as nouns and as verbs.
    feats['noun_ratio'] = sum(1 for tok in doc if tok.pos_ == 'NOUN') / n_all
    feats['verb_ratio'] = sum(1 for tok in doc if tok.pos_ == 'VERB') / n_all
    return feats

#compute stylometry for sentences in student doc
# Read the student document inside a `with` block so the file handle is
# closed right away instead of being left open for the garbage collector.
with open('/mnt/data/demo/student1.txt','r',encoding='utf-8') as f:
    text = f.read()
# `sents` stays at notebook scope; the fusion example uses its first sentence.
sents = split_sentences(text)
print('Student doc sentences:')
for s in sents:
    print('----\n', s)
    print(stylometry_features(s))

Student doc sentences:
----
 Global warming causes severe weather and sea level rise, which impacts agriculture.
{'num_tokens': 12, 'avg_word_len': 5.833333333333333, 'flesch_reading_ease': 39.55500000000001, 'ttr': 1.0, 'punct_density': 0.14285714285714285, 'noun_ratio': 0.42857142857142855, 'verb_ratio': 0.14285714285714285}
----
 In this essay I discuss how neural nets are used for image recognition tasks.
{'num_tokens': 14, 'avg_word_len': 4.5, 'flesch_reading_ease': 65.72500000000001, 'ttr': 1.0, 'punct_density': 0.06666666666666667, 'noun_ratio': 0.3333333333333333, 'verb_ratio': 0.13333333333333333}
----
 A commonly known fact: The capital of France is Paris.
{'num_tokens': 10, 'avg_word_len': 4.3, 'flesch_reading_ease': 69.78500000000001, 'ttr': 1.0, 'punct_density': 0.16666666666666666, 'noun_ratio': 0.16666666666666666, 'verb_ratio': 0.08333333333333333}
----
 I also talk about photosynthesis as the mechanism plants use to store solar energy.
{'num_tokens': 14, 'avg_word_len'

In [None]:

# Fusion weights for the semantic (bi-encoder), rerank (cross-encoder) and
# stylometry signals, in that order.
alpha, beta, gamma = 0.6, 0.3, 0.1

def compute_fused_score(query_sent, top_candidates):
    """Fuse semantic, rerank and stylometry signals for each candidate.

    Parameters
    ----------
    query_sent : str
        The sentence being checked.
    top_candidates : list of dict
        Output of ``semantic_search``: dicts with 'sentence' and 'score'.

    Returns
    -------
    list of dict
        One dict per candidate with keys 'candidate', 'semantic', 'rerank',
        'styl_score' and 'fused', sorted by 'fused' in descending order.
        An empty input gives an empty list.

    Notes
    -----
    fused = alpha*semantic + beta*(rerank - min rerank) + gamma*styl_score.
    The rerank term is only shifted so that it is non-negative; it is not
    rescaled. ``styl_score`` is 1 minus the Flesch reading-ease gap divided
    by 50, clamped at 0 so a large gap cannot produce a negative score.
    """
    # Nothing to score; also avoids handing the cross-encoder an empty batch.
    if not top_candidates:
        return []

    candidates = [c['sentence'] for c in top_candidates]
    semantic_scores = [c['score'] for c in top_candidates]
    rerank_results = rerank(query_sent, candidates)

    # rerank() returns its results sorted, so map sentence -> rerank_score to
    # line the scores back up with the original candidate order.
    rerank_map = {r['sentence']: r['rerank_score'] for r in rerank_results}
    rerank_scores_list = [r['rerank_score'] for r in rerank_results]
    rer_min = min(rerank_scores_list) if rerank_scores_list else 0.0

    # The query's readability does not depend on the candidate: compute it once.
    styl_q = stylometry_features(query_sent)['flesch_reading_ease']

    fused = []
    for cand, sem in zip(candidates, semantic_scores):
        rer = rerank_map.get(cand, 0.0)

        # Stylometry: closeness of Flesch reading ease as a tiny signal (demo).
        styl_c = stylometry_features(cand)['flesch_reading_ease']
        styl_score = max(0.0, 1.0 - abs((styl_q - styl_c)/50.0))

        fused_score = alpha*sem + beta*(rer - rer_min) + gamma*styl_score
        fused.append({'candidate': cand, 'semantic': float(sem), 'rerank': float(rer), 'styl_score': float(styl_score), 'fused': float(fused_score)})
    return sorted(fused, key=lambda x: x['fused'], reverse=True)


# Try the fusion on the first sentence of the student document; `sents` was
# produced by the stylometry example. This rebinds the notebook-level `query`.
query = sents[0]
print('Query:', query)
topk = semantic_search(query, top_k=5)
merged = compute_fused_score(query, topk)
for m in merged:
    print(m)

Query: Global warming causes severe weather and sea level rise, which impacts agriculture.
{'candidate': 'The effects of global warming include rising sea levels and more extreme weather.', 'semantic': 0.7982355356216431, 'rerank': 3.615565061569214, 'styl_score': 0.6515307692307687, 'fused': 4.875059082782452}
{'candidate': 'Climate change is a critical global issue that affects agriculture and health.', 'semantic': 0.71343594789505, 'rerank': 1.6991386413574219, 'styl_score': 0.8589999999999998, 'fused': 4.269998327159882}
{'candidate': 'Photosynthesis is the process by which green plants convert sunlight into energy.', 'semantic': 0.24387386441230774, 'rerank': -10.227087020874023, 'styl_score': 1.0, 'fused': 0.4244933784008026}
{'candidate': 'Machine learning improves many real world tasks such as image recognition and language modeling.', 'semantic': 0.21470870077610016, 'rerank': -10.82098388671875, 'styl_score': 0.9600285714285712, 'fused': 0.2248280776085172}
{'candidate': 'The

In [None]:

def generate_report(doc_path, out_json='/mnt/data/demo/report.json', out_html='/mnt/data/demo/report.html'):
    """Analyse a document sentence by sentence and save JSON and HTML reports.

    For every sentence the top-5 semantic matches are fused with
    ``compute_fused_score`` and the best-scoring candidate is recorded with
    its individual scores and the sentence's stylometry features. A sentence
    with no candidates gets an empty match and 0.0 scores.

    Returns the report dict: {'document': str, 'sentences': [entry, ...]}.
    """
    sentences = split_sentences(extract_text(doc_path))

    report = {'document': str(doc_path), 'sentences': []}
    for sentence in sentences:
        ranked = compute_fused_score(sentence, semantic_search(sentence, top_k=5))
        best = ranked[0] if ranked else {}
        report['sentences'].append({
            'sentence': sentence,
            'best_match': best.get('candidate',''),
            'semantic_score': best.get('semantic',0.0),
            'rerank_score': best.get('rerank',0.0),
            'stylometry_score': best.get('styl_score',0.0),
            'fused_score': best.get('fused',0.0),
            'stylometry_features': stylometry_features(sentence)
        })

    # Machine-readable copy.
    with open(out_json,'w',encoding='utf-8') as json_file:
        json.dump(report, json_file, indent=2)

    # Human-readable copy: one bordered box per sentence, all text escaped.
    parts = [f"<h1>DocInsight Report for {html.escape(str(doc_path))}</h1>"]
    for e in report['sentences']:
        parts.extend([
            '<div style="border:1px solid #ddd;padding:8px;margin:6px;">',
            f"<b>Sentence:</b> {html.escape(e['sentence'])}<br>",
            f"<b>Best match:</b> {html.escape(e['best_match'])} (semantic={e['semantic_score']:.3f}, rerank={e['rerank_score']:.3f}, styl={e['stylometry_score']:.3f}, fused={e['fused_score']:.3f})<br>",
            f"<b>Stylometry features:</b> {html.escape(str(e['stylometry_features']))}<br>",
            '</div>',
        ])
    with open(out_html,'w',encoding='utf-8') as html_file:
        html_file.write('\n'.join(parts))

    print('Report saved to', out_json, 'and', out_html)
    return report

# Run the whole pipeline on the demo student file (default output paths) and
# show the entry for its first sentence.
report = generate_report('/mnt/data/demo/student1.txt')
print(report['sentences'][0])

Report saved to /mnt/data/demo/report.json and /mnt/data/demo/report.html
{'sentence': 'Global warming causes severe weather and sea level rise, which impacts agriculture.', 'best_match': 'The effects of global warming include rising sea levels and more extreme weather.', 'semantic_score': 0.7982355356216431, 'rerank_score': 3.615565061569214, 'stylometry_score': 0.6515307692307687, 'fused_score': 4.875059082782452, 'stylometry_features': {'num_tokens': 12, 'avg_word_len': 5.833333333333333, 'flesch_reading_ease': 39.55500000000001, 'ttr': 1.0, 'punct_density': 0.14285714285714285, 'noun_ratio': 0.42857142857142855, 'verb_ratio': 0.14285714285714285}}


In [None]:

# Source of the Streamlit front end, written to app.py next to demo_utils.py.
# The uploaded file keeps its extension on disk because extract_text picks the
# parser from it.
streamlit_code = r"""
import streamlit as st, json, os
from demo_utils import extract_text, split_sentences, generate_report  # demo_utils you'd save from notebook
st.title('DocInsight — Mini Demo')
uploaded = st.file_uploader('Upload .txt/.pdf/.docx', type=['txt','pdf','docx'])
if uploaded:
    # The file name comes from the client: keep only its last path component
    # so a name such as '../../x.txt' cannot be written outside /tmp.
    path = os.path.join('/tmp', os.path.basename(uploaded.name))
    with open(path,'wb') as f:
        f.write(uploaded.getbuffer())
    report = generate_report(path, out_json='/tmp/report.json', out_html='/tmp/report.html')
    st.markdown('### Report (top sentences)')
    for s in report['sentences'][:10]:
        st.write(s['sentence'])
        st.write('**Best match**:', s['best_match'], '| fused:', round(s['fused_score'],3))
    st.markdown('Download report files below:')
    with open('/tmp/report.html','r',encoding='utf-8') as f:
        st.download_button('Download HTML', f.read(), file_name='docinsight_report.html', mime='text/html')
"""

with open('/mnt/data/demo/app.py','w',encoding='utf-8') as f:
    f.write(streamlit_code)
print('Streamlit app saved to /mnt/data/demo/app.py (run: streamlit run app.py)')

Streamlit app saved to /mnt/data/demo/app.py (run: streamlit run app.py)


In [None]:
# Install ngrok
!pip install -q ngrok

In [None]:
# Install pyngrok
!pip install -q pyngrok

In [None]:
# Save utility functions to demo_utils.py
# Source of demo_utils.py, the module imported by the Streamlit app.
# The app runs in its own Python process and cannot see notebook variables,
# so the module loads its own corpus, models and FAISS index at import time
# rather than expecting them to exist already.
utility_code = """
import os, json, math, tempfile, html, time
from pathlib import Path
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import faiss
import spacy
import textstat
import nltk
from nltk.tokenize import sent_tokenize
import docx2txt
import fitz  # PyMuPDF
import numpy as np


def extract_text_from_pdf(path):
    # the context manager closes the PDF once its pages have been read
    with fitz.open(path) as doc:
        return "\\n".join(page.get_text() for page in doc)

def extract_text(path):
    path = str(path)
    ext = Path(path).suffix.lower()
    if ext == '.pdf':
        return extract_text_from_pdf(path)
    elif ext == '.docx':
        return docx2txt.process(path)
    elif ext in ['.txt']:
        with open(path,'r',encoding='utf-8') as f:
            return f.read()
    else:
        raise ValueError('Unsupported file type: ' + ext)

def split_sentences(text):
    # basic sentence tokenizer using nltk's punkt
    sents = sent_tokenize(text)
    # strip and filter short sentences
    sents = [s.strip() for s in sents if len(s.strip())>3]
    return sents

# --- Corpus, SBERT (bi-encoder) and FAISS index ---
# Everything is loaded once, when the module is first imported.
# The corpus is a text file with one sentence per line. By default it is the
# corpus.txt that sits next to this module; set DOCINSIGHT_CORPUS to override.
_HERE = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
CORPUS_PATH = os.environ.get('DOCINSIGHT_CORPUS', os.path.join(_HERE, 'corpus.txt'))


def load_corpus(path=CORPUS_PATH):
    # a missing file gives an empty corpus (and a warning) instead of a crash
    if not os.path.exists(path):
        print('Warning: corpus file not found: ' + str(path))
        return []
    with open(path,'r',encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def build_index(model, sentences):
    # L2-normalised embeddings + inner product index = cosine similarity
    if not sentences:
        return None
    embeddings = model.encode(sentences, convert_to_numpy=True)
    faiss.normalize_L2(embeddings)
    idx = faiss.IndexFlatIP(embeddings.shape[1])
    idx.add(embeddings)
    return idx


corpus_sentences = load_corpus()
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
index = build_index(sbert_model, corpus_sentences)


def semantic_search(sentence, top_k=5):
    if index is None or not corpus_sentences:
        return [] # nothing to search against
    emb = sbert_model.encode([sentence], convert_to_numpy=True)
    faiss.normalize_L2(emb)
    D, I = index.search(emb, top_k)
    results = []
    for score, idx in zip(D[0], I[0]):
        if idx < 0: continue
        results.append({'sentence': corpus_sentences[idx], 'score': float(score)})
    return results

# --- Cross-encoder reranker ---
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


def rerank(query, candidates):
    if not candidates:
        return [] # do not hand the cross-encoder an empty batch
    pairs = [[query, c] for c in candidates]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
    return [{'sentence': s, 'rerank_score': float(sc)} for s,sc in ranked]

# --- Stylometry feature extraction ---
nlp = spacy.load('en_core_web_sm')


def stylometry_features(sentence):
    doc = nlp(sentence)
    feats = {}
    feats['num_tokens'] = len([t for t in doc if t.is_alpha])
    feats['avg_word_len'] = sum(len(t.text) for t in doc if t.is_alpha)/max(1,len([t for t in doc if t.is_alpha]))
    feats['flesch_reading_ease'] = textstat.flesch_reading_ease(sentence)
    # type-token ratio approximation
    words = [t.text.lower() for t in doc if t.is_alpha]
    feats['ttr'] = len(set(words))/max(1,len(words))
    feats['punct_density'] = len([t for t in doc if t.is_punct]) / max(1, len(doc))
    # POS ratios (noun, verb)
    pos_counts = {}
    for t in doc:
        pos_counts[t.pos_] = pos_counts.get(t.pos_, 0) + 1
    feats['noun_ratio'] = pos_counts.get('NOUN',0)/max(1,len(doc))
    feats['verb_ratio'] = pos_counts.get('VERB',0)/max(1,len(doc))
    return feats

# --- Simple fusion of semantic + reranker + stylometry signals ---
alpha, beta, gamma = 0.6, 0.3, 0.1  # weights for semantic, reranker, stylometry (scaled)

def compute_fused_score(query_sent, top_candidates):
    # top_candidates: output from semantic_search (list dicts)
    candidates = [c['sentence'] for c in top_candidates]
    semantic_scores = [c['score'] for c in top_candidates]
    rerank_results = rerank(query_sent, candidates)
    # map sentence->rerank_score
    rerank_map = {r['sentence']: r['rerank_score'] for r in rerank_results}
    fused = []
    # get min rerank score for normalization safely
    rerank_scores_list = [r['rerank_score'] for r in rerank_results] if rerank_results else [0.0]
    rer_min = min(rerank_scores_list) if rerank_scores_list else 0.0
    for i, cand in enumerate(candidates):
        sem = semantic_scores[i]
        rer = rerank_map.get(cand, 0.0)
        # stylometry: compute difference of flesch_reading_ease as a tiny signal (demo)
        styl_q = stylometry_features(query_sent).get('flesch_reading_ease', 0.0) # Use get with default
        styl_c = stylometry_features(cand).get('flesch_reading_ease', 0.0) # Use get with default
        styl_score = 1.0 - abs((styl_q - styl_c)/50.0)  # normalized rough measure
        fused_score = alpha*sem + beta*(rer - rer_min) + gamma*styl_score
        fused.append({'candidate': cand, 'semantic': float(sem), 'rerank': float(rer), 'styl_score': float(styl_score), 'fused': float(fused_score)})
    merged = sorted(fused, key=lambda x: x['fused'], reverse=True)
    return merged


# --- Report generation ---
def generate_report(doc_path, out_json='/tmp/report.json', out_html='/tmp/report.html'):
    text = extract_text(doc_path)
    sents = split_sentences(text)
    report = {'document': str(doc_path), 'sentences': []}
    for s in sents:
        topk = semantic_search(s, top_k=5)
        fused = compute_fused_score(s, topk)
        best = fused[0] if fused else {}
        entry = {
            'sentence': s,
            'best_match': best.get('candidate',''),
            'semantic_score': best.get('semantic',0.0),
            'rerank_score': best.get('rerank',0.0),
            'stylometry_score': best.get('styl_score',0.0),
            'fused_score': best.get('fused',0.0),
            'stylometry_features': stylometry_features(s)
        }
        report['sentences'].append(entry)
    # save JSON
    with open(out_json,'w',encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    # create simple HTML
    parts = [f"<h1>DocInsight Report for {html.escape(str(doc_path))}</h1>"]
    for e in report['sentences']:
        parts.append('<div style="border:1px solid #ddd;padding:8px;margin:6px;">')
        parts.append(f"<b>Sentence:</b> {html.escape(e['sentence'])}<br>")
        parts.append(f"<b>Best match:</b> {html.escape(e['best_match'])} (semantic={e['semantic_score']:.3f}, rerank={e['rerank_score']:.3f}, styl={e['stylometry_score']:.3f}, fused={e['fused_score']:.3f})<br>")
        parts.append(f"<b>Stylometry features:</b> {html.escape(str(e['stylometry_features']))}<br>")
        parts.append('</div>')
    html_text = '\\n'.join(parts)
    with open(out_html,'w',encoding='utf-8') as f:
        f.write(html_text)
    print('Report saved to', out_json, 'and', out_html)
    return report
"""

with open('/mnt/data/demo/demo_utils.py','w',encoding='utf-8') as f:
    f.write(utility_code)

print('demo_utils.py created in /mnt/data/demo')

demo_utils.py created in /mnt/data/demo


In [None]:
# Run Streamlit with ngrok using pyngrok
from pyngrok import ngrok
import os
from google.colab import userdata
import time

# Launch the Streamlit app and expose it through an ngrok tunnel.
# NOTE(review): kill() presumably clears an ngrok process left over from an
# earlier run of this cell - confirm against the pyngrok docs.
time.sleep(2)
ngrok.kill()

# The token is read from Colab's secret store rather than hard-coded here.
NGROK_AUTH_TOKEN = userdata.get('NGROK_AUTH_TOKEN')
if NGROK_AUTH_TOKEN is None:
    print("NGROK_AUTH_TOKEN not found in Colab Secrets. Please add it.")
else:
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)

    print("Starting Streamlit app in the background...")
    # Run the Streamlit app in the background: the trailing `&` detaches it
    # and all of its output is discarded, so startup errors are not shown here.
    !streamlit run /mnt/data/demo/app.py > /dev/null 2>&1 &

    # Fixed pause before tunnelling; presumably to give Streamlit time to start.
    time.sleep(10)

    print("Attempting to establish ngrok tunnel...")
    try:
        # No port is passed to `streamlit run` above; 8501 is assumed to be
        # the port Streamlit listens on by default - TODO confirm.
        public_url = ngrok.connect(addr="8501", proto="http")
        print(f"Streamlit app available at: {public_url}")
    except Exception as e:
        print(f"Error starting ngrok tunnel: {e}")

Starting Streamlit app in the background...
Attempting to establish ngrok tunnel...
Streamlit app available at: NgrokTunnel: "https://734ee730ecaa.ngrok-free.app" -> "http://localhost:8501"



### Notes & next steps
  - Fine-tune SBERT on an academic paraphrase dataset (PAWS/Quora + synthetic adversaries).  
  - Train a cross-encoder and stylometry classifier with labeled data.  
  - Replace heuristic fusion with a learned logistic regressor trained on validation data.  
  - Build a larger corpus and use FAISS indexing strategies (IVF/PQ) for scale.
---
