# Install Required Libraries #

In [None]:
!pip install faiss-cpu wget transformers datasets torch

# Import Required Libraries #

In [1]:
import random
import wget
import faiss
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer, AutoTokenizer, AutoModelForCausalLM
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
import warnings

def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Silence warnings for the rest of the notebook in two ways: install an
# "ignore" filter, and swap ``warnings.warn`` for the no-op defined above.
warnings.filterwarnings("ignore")
warnings.warn = warn


  from .autonotebook import tqdm as notebook_tqdm


# Defining Helper Functions #

In [None]:
def tsne_plot(data):
    """Embed `data` in three dimensions with t-SNE and show a 3-D scatter plot.

    Every row of `data` becomes one point. Points are coloured from the
    ``tab20`` colormap and appear in the legend under their row index.
    The t-SNE perplexity is set to ``data.shape[0] - 1`` and the random
    state is fixed at 42.

    Parameters
    ----------
    data
        Array-like object exposing ``.shape`` that ``TSNE.fit_transform``
        accepts (one row per sample).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    n_rows = data.shape[0]
    embedding = TSNE(n_components=3, perplexity=n_rows - 1, random_state=42).fit_transform(data)

    # One colour per embedded point, spread evenly over the tab20 colormap.
    palette = plt.cm.tab20(np.linspace(0, 1, len(embedding)))

    fig = plt.figure(figsize=(12, 10))
    axes = fig.add_subplot(111, projection='3d')

    # Scatter each point separately so every point gets its own legend entry.
    for label, ((x, y, z), colour) in enumerate(zip(embedding, palette)):
        axes.scatter(x, y, z, color=colour, label=str(label))

    axes.set_xlabel('TSNE component 1')
    axes.set_ylabel('TSNE component 2')
    axes.set_zlabel('TSNE component 3')
    plt.legend(title='Input Points')
    plt.title('TSNE Visualization')
    plt.show()

# Load and Preprocess Data #

### Convert PDF to TXT ###

In [None]:
# Install PyMuPDF, the package that provides the `fitz` module imported below
%pip install pymupdf

import fitz  # PyMuPDF
import os  # For file handling

def pdf_to_text(pdf_path, txt_path):
    """Extract the plain text of every page of a PDF and save it to a text file.

    Parameters
    ----------
    pdf_path
        Path of the PDF to read; passed to ``fitz.open``.
    txt_path
        Path of the UTF-8 text file to write. An existing file at this path
        is overwritten.

    Returns
    -------
    None
        The text of each page, followed by a newline, is written to
        `txt_path` and a confirmation message is printed.
    """
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text("text") + "\n" for page in doc)

    # Mode "w" truncates any existing file, so no separate delete is needed.
    # Writing only after the PDF has been read also means a failed read leaves
    # a previously generated text file intact.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"Text extracted and saved to {txt_path}")

# Convert the stroke-rehabilitation PDF into a plain-text file with the same base name
pdf_to_text(pdf_path="StrokeRehabilitation.pdf", txt_path="StrokeRehabilitation.txt")


Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl.metadata (48 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20231228->pdfplumber)
  Downloading cryptography-44.0.2-cp39-abi3-win_amd64.whl.metadata (5.7 kB)
Collecting cffi>=1.12 (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber)
  Using cached cffi-1.17.1-cp311-cp311-win_amd64.whl.metadata (1.6 kB)
Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl (16.5 MB)
   ---------------------------------------- 0.0/16.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/16.5 MB ? eta -:--:--
   - -------------------------------------- 0.8/16.5 MB 4.2 MB/s eta 0:00