<a href="https://colab.research.google.com/github/PKPandey-DU/Sanskrit_Translation/blob/main/notebooks/01_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ============================================
#  Sanskrit_Translation Project – Setup Cell
#  Run this cell once at the start of every session
#  Sanskrit_Translation has been made Public for ease of use.
#  When it contains confidential/sensitive/Pre-published data, than,
#  make it Private and use Token
# ============================================

# --- 1️⃣ Mount Google Drive (persistent storage) ---
from google.colab import drive
drive.mount('/content/drive')

# --- 2️⃣ Create project directories in Drive ---
import os

DRIVE_PROJECT_PATH = "/content/drive/MyDrive/sanskrit_translation_data"
os.makedirs(DRIVE_PROJECT_PATH, exist_ok=True)

print(f"✅ Google Drive mounted and project directory ready at: {DRIVE_PROJECT_PATH}")

# --- 3️⃣ Install project dependencies directly from GitHub ---
!pip install -q -r https://raw.githubusercontent.com/PKPandey-DU/Sanskrit_Translation/main/requirements.txt

print("✅ Requirements installed successfully")

# --- 4️⃣ Verify GitHub access (check a sample file) ---
import requests

url = "https://raw.githubusercontent.com/PKPandey-DU/Sanskrit_Translation/main/data/example_input.txt"
r = requests.get(url)
if r.status_code == 200:
    print("✅ GitHub connection successful. Sample data:")
    print("-" * 40)
    print(r.text.strip())
else:
    print("⚠️ Could not fetch sample file from GitHub.")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive mounted and project directory ready at: /content/drive/MyDrive/sanskrit_translation_data
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Requirements installed successfully
✅ GitHub connection successful. Sample data:
----------------------------------------
﻿धर्मो रक्षति रक्षितः ।
विद्या ददाति विनयं ।
कर्मण्येवाधिकारस्ते ।


In [6]:
# ============================================
#  Sanskrit_Translation Project – Data Verification Cell
#  Loads sample corpus from GitHub and previews it
# ============================================

import pandas as pd
import requests
from io import StringIO

# --- 1️⃣ Download the small sample corpus directly from GitHub ---
url = "https://raw.githubusercontent.com/PKPandey-DU/Sanskrit_Translation/main/data/corpus_sample.csv"
response = requests.get(url)

if response.status_code == 200:
    print("✅ Successfully fetched corpus_sample.csv from GitHub.")
    data = StringIO(response.text)
    df = pd.read_csv(data)

    # --- 2️⃣ Display first few rows ---
    print("\nSample Sanskrit–English Pairs:")
    display(df.head())

    # --- 3️⃣ Verify column structure ---
    print("\nColumns:", list(df.columns))
    print(f"Total samples: {len(df)}")
else:
    print("⚠️ Could not fetch corpus_sample.csv. Check that the file exists in your GitHub repo.")


✅ Successfully fetched corpus_sample.csv from GitHub.

Sample Sanskrit–English Pairs:


Unnamed: 0,sanskrit,english
0,शिक्षार्थं प्रयत्नः आवश्यकः।,Effort is necessary for learning.
1,सत्यं वद।,Speak the truth.



Columns: ['sanskrit', 'english']
Total samples: 2


In [7]:
# ============================================
#  Sanskrit_Translation Project – Text Preprocessing Cell
#  Cleans and tokenizes the Sanskrit text column
# ============================================

import pandas as pd
import re
import unicodedata

# (Optional) install transliteration helper
!pip install -q indic-transliteration

from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# --- 1️⃣ Load the sample corpus again (or reuse df from previous cell) ---
url = "https://raw.githubusercontent.com/PKPandey-DU/Sanskrit_Translation/main/data/corpus_sample.csv"
df = pd.read_csv(url)

# --- 2️⃣ Basic normalization function ---
def normalize_sanskrit(text):
    # Normalize Unicode to NFC form (combining diacritics properly)
    text = unicodedata.normalize("NFC", text)
    # Remove punctuation marks and special symbols
    text = re.sub(r"[^\u0900-\u097F\s]", "", text)
    # Collapse multiple spaces and trim
    text = re.sub(r"\s+", " ", text).strip()
    return text

# --- 3️⃣ Tokenize (very simple whitespace-based) ---
def tokenize(text):
    return text.split()

# --- 4️⃣ Apply normalization and tokenization ---
df["sanskrit_clean"] = df["sanskrit"].apply(normalize_sanskrit)
df["tokens"] = df["sanskrit_clean"].apply(tokenize)

# --- 5️⃣ (Optional) Add IAST transliteration column ---
df["sanskrit_iast"] = df["sanskrit_clean"].apply(
    lambda x: transliterate(x, sanscript.DEVANAGARI, sanscript.ITRANS)
)

# --- 6️⃣ Display results ---
display(df[["sanskrit", "sanskrit_clean", "tokens", "sanskrit_iast"]])


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/159.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m153.6/159.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.6/159.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h

Unnamed: 0,sanskrit,sanskrit_clean,tokens,sanskrit_iast
0,शिक्षार्थं प्रयत्नः आवश्यकः।,शिक्षार्थं प्रयत्नः आवश्यकः।,"[शिक्षार्थं, प्रयत्नः, आवश्यकः।]",shikShArthaM prayatnaH AvashyakaH|
1,सत्यं वद।,सत्यं वद।,"[सत्यं, वद।]",satyaM vada|
