In [None]:
from google.colab import drive
import os
import shutil

# --- 1. Mount Drive (with a clean-mountpoint workaround) ---
# Sequence: unmount any previous session, clear stale leftovers at the
# mountpoint, recreate the empty directory, then mount.
MOUNT_PATH = '/content/drive'

try:
    drive.flush_and_unmount()
except Exception as unmount_error:
    # Best effort only: a failed unmount must not stop the notebook, but the
    # reason is reported instead of being silently discarded.
    print(f"Could not unmount Drive cleanly ({unmount_error}); continuing.")

# SAFETY: only delete the directory when it is NOT a live mount point.
# If the unmount above failed, MOUNT_PATH may still expose the real Drive
# contents, and rmtree would then delete the user's files.
# NOTE(review): assumes Colab mounts the Drive filesystem directly at
# MOUNT_PATH so that os.path.ismount() detects it - confirm.
if os.path.ismount(MOUNT_PATH):
    print(f"{MOUNT_PATH} is still mounted; skipping cleanup of the mountpoint.")
elif os.path.exists(MOUNT_PATH):
    shutil.rmtree(MOUNT_PATH, ignore_errors=True)

os.makedirs(MOUNT_PATH, exist_ok=True)

print("Attempting to mount Google Drive...")
drive.mount(MOUNT_PATH)




Drive not mounted, so nothing to flush and unmount.
Attempting to mount Google Drive...
Mounted at /content/drive


In [None]:
import os
import json
import pandas as pd
from collections import defaultdict

# Banner for this cell's report.
print("=" * 70, "CHECKING TOP 20 VIDEOS IN YOUR 11K DATASET", "=" * 70, sep="\n")

# Input locations on the mounted Drive.
VIDEO_DIR = "/content/drive/MyDrive/WLASL/data"
JSON_PATH = "/content/drive/MyDrive/FYP_word/Codes/Others/nslt_2000.json"
VOCAB_PATH = "/content/drive/MyDrive/FYP_word/Codes/Others/WLASL_v0.3.json"

# The 20 glosses this notebook works with, in reporting order.
top20_words = (
    "book drink computer go chair "
    "before clothes who candy deaf "
    "cousin yes no walk thin "
    "help fine year now like"
).split()

print("\n1. Loading metadata files...")

# nslt_data is looked up by video id and read for its 'subset' field below.
with open(JSON_PATH) as nslt_file:
    nslt_data = json.load(nslt_file)

# vocab_data is iterated as a sequence of entries carrying 'gloss' and 'instances'.
with open(VOCAB_PATH) as vocab_file:
    vocab_data = json.load(vocab_file)


print("2. Building word ‚Üí video_id mapping...")

# gloss -> list of {'video_id', 'subset', 'url'} records, restricted to the
# Top 20 glosses and to instances whose id is present in nslt_data.
word_to_videos = defaultdict(list)

for vocab_entry in vocab_data:
    gloss = vocab_entry['gloss']
    if gloss not in top20_words:
        continue

    for instance in vocab_entry.get('instances', []):
        # Ids are left-padded with zeros to five characters before the lookup.
        padded_id = str(instance.get('video_id', '')).zfill(5)
        if padded_id not in nslt_data:
            continue

        record = {
            'video_id': padded_id,
            'subset': nslt_data[padded_id]['subset'],
            'url': instance.get('url', ''),
        }
        word_to_videos[gloss].append(record)


print(f"3. Scanning your video folder: {VIDEO_DIR}")

video_extensions = ['.mp4', '.avi', '.mov', '.mkv', '.webm']

# Collect the extension-less names of every file in VIDEO_DIR whose name ends
# with one of the recognised video extensions (case-sensitive match).
existing_videos = {
    os.path.splitext(entry_name)[0]
    for entry_name in os.listdir(VIDEO_DIR)
    if entry_name.endswith(tuple(video_extensions))
}

print(f"   Found {len(existing_videos)} videos in your folder")

# Split every Top 20 instance into "already on disk" vs "still missing".
print("\n4. Matching Top 20 words with your videos...")

available_data = []
missing_data = []

for word in top20_words:
    candidates = word_to_videos[word]

    # Partition this word's instances by presence in the scanned folder.
    on_disk = [c for c in candidates if c['video_id'] in existing_videos]
    not_on_disk = [c for c in candidates if c['video_id'] not in existing_videos]

    available_data.extend(
        {
            'video_id': c['video_id'],
            'gloss': word,
            'subset': c['subset'],
            'status': 'available',
        }
        for c in on_disk
    )
    # Missing rows additionally keep the URL so they can be downloaded later.
    missing_data.extend(
        {
            'video_id': c['video_id'],
            'gloss': word,
            'subset': c['subset'],
            'url': c['url'],
            'status': 'missing',
        }
        for c in not_on_disk
    )

    print(f"   {word:<15} : {len(on_disk):>3} available, {len(not_on_disk):>3} missing, {len(candidates):>3} total")

# Create DataFrames.
# Columns are passed explicitly so that an empty result list still produces a
# frame (and later a CSV header) with the expected column names.
df_available = pd.DataFrame(
    available_data, columns=['video_id', 'gloss', 'subset', 'status']
)
df_missing = pd.DataFrame(
    missing_data, columns=['video_id', 'gloss', 'subset', 'url', 'status']
)

total_needed = len(available_data) + len(missing_data)
# Guard the percentage: with no Top 20 instances at all the division would
# raise ZeroDivisionError before the troubleshooting hints could be printed.
availability_pct = (len(available_data) / total_needed * 100) if total_needed else 0.0

print(f"\n{'='*70}")
print("SUMMARY")
print(f"{'='*70}")
print(f"Total videos needed for Top 20: {total_needed}")
print(f"Videos you HAVE: {len(available_data)} ‚úì")
print(f"Videos MISSING: {len(missing_data)} ‚úó")
print(f"Availability: {availability_pct:.1f}%")

# Split distribution for available videos
print(f"\n{'='*70}")
print("SPLIT DISTRIBUTION (Available Videos Only)")
print(f"{'='*70}")
if len(df_available) > 0:
    print(df_available['subset'].value_counts())

    # Per-word breakdown: one row per Top 20 word, in list order, with the
    # number of available videos in each split.
    print(f"\n{'='*70}")
    print("PER-WORD BREAKDOWN (Available Videos)")
    print(f"{'='*70}")
    print(f"{'Word':<15} {'Train':>8} {'Val':>8} {'Test':>8} {'Total':>8}")
    print("-"*70)

    for word in top20_words:
        word_data = df_available[df_available['gloss'] == word]
        train_count = (word_data['subset'] == 'train').sum()
        val_count = (word_data['subset'] == 'val').sum()
        test_count = (word_data['subset'] == 'test').sum()
        # Total counts every row for the word, including any split other
        # than train/val/test.
        total = len(word_data)
        print(f"{word:<15} {train_count:>8} {val_count:>8} {test_count:>8} {total:>8}")

# Persist both tables to Drive so the following cells can reload them.
AVAILABLE_CSV = '/content/drive/MyDrive/FYP_word/top20_available_videos.csv'
MISSING_CSV = '/content/drive/MyDrive/FYP_word/top20_missing_videos.csv'

for frame, csv_path in ((df_available, AVAILABLE_CSV), (df_missing, MISSING_CSV)):
    frame.to_csv(csv_path, index=False)

rule = '=' * 70
print(f"\n{rule}")
print("FILES SAVED")
print(rule)

available_total = len(df_available)
missing_total = len(df_missing)
print(f"‚úì Available videos: {AVAILABLE_CSV}")
print(f"  ({available_total} videos - ready to use)")
print(f"\n‚úó Missing videos: {MISSING_CSV}")
print(f"  ({missing_total} videos - need to download)")

# Print a sample of folder names next to a sample of matched ids so that a
# naming mismatch between the two is easy to spot by eye.
print(f"\n{'=' * 70}")
print("VIDEO FILE NAMING CHECK")
print(f"{'=' * 70}")

print("First 10 video files in your folder:")
sample_files = sorted(existing_videos)[:10]
for sample_name in sample_files:
    print(f"  {sample_name}")

if len(df_available) > 0:
    print("\nFirst 10 available Top 20 video IDs:")
    for matched_id in df_available['video_id'].iloc[:10]:
        print(f"  {matched_id}")

# Closing guidance: either point at the CSV to train from, or list the usual
# reasons why nothing matched.
print(f"\n{'=' * 70}")
print("NEXT STEPS")
print(f"{'=' * 70}")

if len(df_available) == 0:
    for hint in (
        "‚úó No matching videos found. Check:",
        "  1. Video folder path is correct",
        "  2. Video file naming matches video_id format",
        "  3. Your videos are for words in the Top 20 list",
    ):
        print(hint)
else:
    print(f"‚úì You can start training with {len(df_available)} available videos!")
    print(f"  Use: {AVAILABLE_CSV}")


CHECKING TOP 20 VIDEOS IN YOUR 11K DATASET

1. Loading metadata files...
2. Building word ‚Üí video_id mapping...
3. Scanning your video folder: /content/drive/MyDrive/WLASL/data
   Found 11980 videos in your folder

4. Matching Top 20 words with your videos...
   book            :   6 available,  34 missing,  40 total
   drink           :  15 available,  20 missing,  35 total
   computer        :  14 available,  16 missing,  30 total
   go              :  15 available,  11 missing,  26 total
   chair           :   7 available,  19 missing,  26 total
   before          :  16 available,  10 missing,  26 total
   clothes         :   5 available,  20 missing,  25 total
   who             :  14 available,  11 missing,  25 total
   candy           :  13 available,  11 missing,  24 total
   deaf            :  11 available,  12 missing,  23 total
   cousin          :  14 available,   9 missing,  23 total
   yes             :  12 available,  10 missing,  22 total
   no              :  11 avail

# Download the Remaining Videos

In [None]:
import os
import shutil
from tqdm import tqdm
import subprocess

print("="*70)
print("PREPARING TOP 20 VIDEOS DATASET")
print("="*70)

# ===== PATHS =====
AVAILABLE_CSV = '/content/drive/MyDrive/FYP_word/top20_available_videos.csv'
MISSING_CSV = '/content/drive/MyDrive/FYP_word/top20_missing_videos.csv'
SOURCE_VIDEO_DIR = "/content/drive/MyDrive/WLASL/data"  # Your 11k videos
LOCAL_VIDEO_DIR = "/content/top20_videos"  # Colab local (faster)
FINAL_DRIVE_DIR = "/content/drive/MyDrive/FYP_word/top20_complete_videos"  # Final location

os.makedirs(LOCAL_VIDEO_DIR, exist_ok=True)
os.makedirs(FINAL_DRIVE_DIR, exist_ok=True)

# Load CSVs.
# video_id MUST be read as text: the ids were written zero-padded to five
# characters (e.g. "00623"), and the default type inference would parse them
# as integers and drop the padding, so "<id>.mp4" would no longer match the
# file names in SOURCE_VIDEO_DIR.
df_available = pd.read_csv(AVAILABLE_CSV, dtype={'video_id': str})
df_missing = pd.read_csv(MISSING_CSV, dtype={'video_id': str})

print(f"\nVideos already in your 11k dataset: {len(df_available)}")
print(f"Videos to download from YouTube: {len(df_missing)}")
print(f"Total Top 20 videos: {len(df_available) + len(df_missing)}")

print(f"\n{'='*70}")
print("STEP 1: Copying existing videos from Drive to Colab")
print(f"{'='*70}")

copied = 0
copy_failed = 0
copy_failures = []  # (video_id, reason) pairs, reported after the loop

for video_id in tqdm(df_available['video_id'], total=len(df_available), desc="Copying"):
    # Find source file (check multiple extensions, first existing one wins)
    candidate_paths = (
        f"{SOURCE_VIDEO_DIR}/{video_id}{ext}"
        for ext in ['.mp4', '.avi', '.mov', '.mkv', '.webm']
    )
    source_file = next((path for path in candidate_paths if os.path.exists(path)), None)

    if source_file is None:
        copy_failed += 1
        copy_failures.append((video_id, "source file not found"))
        continue

    # The destination is always named "<id>.mp4", whatever the source
    # extension was; the file is copied as-is, not transcoded.
    dest_file = f"{LOCAL_VIDEO_DIR}/{video_id}.mp4"
    try:
        shutil.copy2(source_file, dest_file)
        copied += 1
    except Exception as copy_error:
        # Keep going on a single bad file, but remember why it failed.
        copy_failed += 1
        copy_failures.append((video_id, repr(copy_error)))

print(f"\n‚úì Copied: {copied}/{len(df_available)}")
print(f"‚úó Failed: {copy_failed}")

# Show a few reasons so failures can be diagnosed instead of only counted.
for failed_id, reason in copy_failures[:10]:
    print(f"   {failed_id}: {reason}")
if len(copy_failures) > 10:
    print(f"   ... and {len(copy_failures) - 10} more")

# ===== STEP 2: Download missing videos =====
print(f"\n{'='*70}")
print("STEP 2: Downloading missing videos from YouTube")
print(f"{'='*70}")

# Install yt-dlp
print("Installing yt-dlp...")
subprocess.run(['pip', 'install', '-q', 'yt-dlp'], check=False)
if shutil.which('yt-dlp') is None:
    # Without the executable every subprocess call below raises and would
    # otherwise just be counted as a failed download with no explanation.
    print("WARNING: yt-dlp was not found on PATH after installation; downloads will fail.")

downloaded = 0
download_failed = 0
download_failures = []  # (video_id, reason) pairs, reported after the loop

for idx, row in tqdm(df_missing.iterrows(), total=len(df_missing), desc="Downloading"):
    video_id = row['video_id']
    url = row['url']

    if pd.isna(url) or url == '':
        download_failed += 1
        download_failures.append((video_id, "no URL in metadata"))
        continue

    output_path = f"{LOCAL_VIDEO_DIR}/{video_id}.mp4"

    # Skip if a usable file already exists (from the copy step or an earlier
    # run). A zero-byte leftover is NOT usable: drop it and download again.
    if os.path.exists(output_path):
        if os.path.getsize(output_path) > 0:
            downloaded += 1
            continue
        os.remove(output_path)

    failure_reason = None
    try:
        # Download using yt-dlp
        # NOTE(review): --no-check-certificate disables TLS certificate
        # verification for every download; kept to preserve behaviour, but
        # confirm it is really required.
        cmd = [
            'yt-dlp',
            '-f', 'best[ext=mp4]/best',
            '-o', output_path,
            '--quiet',
            '--no-warnings',
            '--no-check-certificate',
            url
        ]
        result = subprocess.run(
            cmd, timeout=60, capture_output=True, text=True, errors='replace'
        )

        # Success is judged by the output file, as before; the exit code and
        # stderr are only used to explain a failure.
        if not (os.path.exists(output_path) and os.path.getsize(output_path) > 0):
            stderr_lines = result.stderr.strip().splitlines()
            failure_reason = (
                stderr_lines[-1] if stderr_lines
                else f"yt-dlp exited with code {result.returncode}"
            )
    except subprocess.TimeoutExpired:
        failure_reason = "timed out after 60 seconds"
    except Exception as download_error:
        failure_reason = repr(download_error)

    if failure_reason is None:
        downloaded += 1
    else:
        download_failed += 1
        download_failures.append((video_id, failure_reason))

print(f"\n‚úì Downloaded: {downloaded}/{len(df_missing)}")
print(f"‚úó Failed: {download_failed}")

# Show a few reasons so failures can be diagnosed instead of only counted.
for failed_id, reason in download_failures[:10]:
    print(f"   {failed_id}: {reason}")
if len(download_failures) > 10:
    print(f"   ... and {len(download_failures) - 10} more")

## Step 3 and Step 4: Verify the Collected Videos and Save Them to Drive

In [None]:
import pandas as pd

# ===== STEP 3: Verify and count total (FIXED) =====
print(f"\n{'='*70}")
print("STEP 3: Verifying collected videos")
print(f"{'='*70}")

# List the folder once and reuse the result for both the count and the match.
local_mp4_files = [name for name in os.listdir(LOCAL_VIDEO_DIR) if name.endswith('.mp4')]
total_collected = len(local_mp4_files)
print(f"Total videos in local folder: {total_collected}")

# Create final CSV with only available videos
# Combine both CSVs for lookup
df_all_metadata = pd.concat([df_available, df_missing], ignore_index=True)

# Build one id -> (gloss, subset) table instead of scanning the whole frame
# for every file. Ids are compared as zero-padded text so a match still works
# when the ids were loaded as integers and lost their leading zeros.
# NOTE(review): assumes ids are at most five characters, as produced by the
# zfill(5) in the metadata cell - confirm.
metadata_by_id = {}
for meta_id, meta_gloss, meta_subset in zip(
    df_all_metadata['video_id'].astype(str),
    df_all_metadata['gloss'],
    df_all_metadata['subset'],
):
    # setdefault keeps the first row for an id, like the previous iloc[0].
    metadata_by_id.setdefault(meta_id.zfill(5), (meta_gloss, meta_subset))

all_data = []
unmatched = []

for video_file in local_mp4_files:
    # Strip only the trailing extension (replace() would also remove a
    # ".mp4" occurring in the middle of a name).
    video_id = os.path.splitext(video_file)[0]

    metadata = metadata_by_id.get(video_id.zfill(5))
    if metadata is not None:
        # video_id stays exactly as in the file name so the CSV row can be
        # mapped back to "<video_id>.mp4".
        all_data.append({
            'video_id': video_id,
            'gloss': metadata[0],
            'subset': metadata[1]
        })
    else:
        unmatched.append(video_id)

# Explicit columns keep the CSV header even when nothing matched.
df_final = pd.DataFrame(all_data, columns=['video_id', 'gloss', 'subset'])

print(f"\n{'='*70}")
print("FINAL DATASET SUMMARY")
print(f"{'='*70}")
print(f"Total videos collected: {total_collected}")
print(f"Videos matched with metadata: {len(df_final)}")
print(f"Videos without metadata: {len(unmatched)}")

# Always show a sample (up to 10) when anything is unmatched; previously the
# sample was suppressed exactly when there were many unmatched files.
if unmatched:
    print(f"\nUnmatched video IDs (sample): {unmatched[:10]}")

if len(df_final) > 0:
    print(f"\nPer-word breakdown:")
    word_counts = df_final['gloss'].value_counts().sort_index()
    for word, count in word_counts.items():
        print(f"  {word:<15} : {count:>3} videos")

    print(f"\nSplit distribution:")
    print(df_final['subset'].value_counts())
else:
    print("\n‚ö†Ô∏è  WARNING: No videos matched with metadata!")
    print("This means the video IDs in your folder don't match the Top 20 list")
    print("You may need to use your original 332 videos from earlier instead")


# ===== STEP 4: Save to Drive =====
print(f"\n{'='*70}")
print("STEP 4: Saving videos to Drive")
print(f"{'='*70}")

saved = 0
save_failures = []  # (file name, reason) pairs, reported after the loop
for video_file in tqdm(os.listdir(LOCAL_VIDEO_DIR), desc="Saving to Drive"):
    if video_file.endswith('.mp4'):
        source = f"{LOCAL_VIDEO_DIR}/{video_file}"
        dest = f"{FINAL_DRIVE_DIR}/{video_file}"

        try:
            shutil.copy2(source, dest)
            saved += 1
        except Exception as save_error:
            # One failed copy must not abort the rest, but it must be
            # visible: otherwise the CSV below can list videos that never
            # reached Drive.
            save_failures.append((video_file, repr(save_error)))

print(f"‚úì Saved {saved} videos to Drive")
if save_failures:
    print(f"WARNING: {len(save_failures)} videos could not be saved to Drive:")
    for failed_file, reason in save_failures[:10]:
        print(f"   {failed_file}: {reason}")
    if len(save_failures) > 10:
        print(f"   ... and {len(save_failures) - 10} more")

# Save final CSV
FINAL_CSV = '/content/drive/MyDrive/FYP_word/top20_final_dataset.csv'
df_final.to_csv(FINAL_CSV, index=False)

print(f"\n{'='*70}")
print("COMPLETE")
print(f"{'='*70}")
print(f"Videos saved to: {FINAL_DRIVE_DIR}")
print(f"CSV saved to: {FINAL_CSV}")
print(f"\nTotal videos: {len(df_final)}")
print("Ready for keypoint extraction!")

# Clean up local folder (optional - saves Colab disk space)
# Uncomment if you want to delete local copies after saving to Drive
# shutil.rmtree(LOCAL_VIDEO_DIR)
# print("\n‚úì Cleaned up local Colab folder")



STEP 3: Verifying collected videos
Total videos in local folder: 300

FINAL DATASET SUMMARY
Total videos collected: 300
Videos matched with metadata: 300
Videos without metadata: 0

Per-word breakdown:
  before          :   7 videos
  book            :   8 videos
  candy           :   8 videos
  chair           :   6 videos
  clothes         :  10 videos
  computer        :  21 videos
  cousin          :  16 videos
  deaf            :  17 videos
  drink           :  23 videos
  fine            :  16 videos
  go              :  18 videos
  help            :  15 videos
  like            :  18 videos
  no              :  17 videos
  now             :  16 videos
  thin            :  18 videos
  walk            :  15 videos
  who             :  18 videos
  year            :  15 videos
  yes             :  18 videos

Split distribution:
subset
train    209
val       50
test      41
Name: count, dtype: int64

STEP 4: Saving videos to Drive


Saving to Drive: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 300/300 [00:08<00:00, 36.24it/s]

‚úì Saved 300 videos to Drive

COMPLETE! üéâ
Videos saved to: /content/drive/MyDrive/FYP_word/top20_complete_videos
CSV saved to: /content/drive/MyDrive/FYP_word/top20_final_dataset.csv

Total videos: 300
Ready for keypoint extraction!



