In [3]:
import pandas as pd
import os

# --- Configuration ---
# The CSV file you want to clean.
INPUT_CSV = 'Gramitically_correct_dataset.csv'

# The new CSV file that will be created with the cleaned data.
OUTPUT_CSV = 'Gramitically_correct_videos_exist.csv'

# The folder where all the videos are located.
VIDEO_FOLDER = r"D:\Des646\iSign-videos_v1.1"

# The file extension of your video files.
VIDEO_EXTENSION = ".mp4"
# ---------------------

print(f"--- Cleaning CSV based on videos in '{VIDEO_FOLDER}' ---")

# 1. Load the CSV file
try:
    df = pd.read_csv(INPUT_CSV)
except FileNotFoundError:
    print(f"❌ Error: The file '{INPUT_CSV}' was not found.")
    # `exit()` is the interactive-shell helper; raising SystemExit stops this
    # cell with a non-zero status, like the `sys.exit(1)` calls used by the
    # later analysis cell.
    raise SystemExit(1)

# Every later step indexes df['uid'], so fail early with a readable message
# instead of a bare KeyError further down.
if 'uid' not in df.columns:
    print(f"❌ Error: The file '{INPUT_CSV}' has no 'uid' column.")
    raise SystemExit(1)

initial_count = len(df)
print(f"Loaded {initial_count} rows from '{INPUT_CSV}'.")

# 2. Define a function to check if a video file exists for a given UID
def video_exists(uid, folder=None, extension=None):
    """Return True when a video file named ``<uid><extension>`` exists in ``folder``.

    Args:
        uid: Identifier of the row; it is converted with ``str()`` to build
            the file name.
        folder: Directory to look in. Defaults to ``VIDEO_FOLDER``.
        extension: File extension including the dot. Defaults to
            ``VIDEO_EXTENSION``.

    Returns:
        bool: True only if the path exists and is a regular file.
    """
    if folder is None:
        folder = VIDEO_FOLDER
    if extension is None:
        extension = VIDEO_EXTENSION
    full_path = os.path.join(folder, str(uid) + extension)
    # isfile (not exists): a directory that happens to be called
    # "<uid>.mp4" is not a usable video.
    return os.path.isfile(full_path)

# 3. Apply the check to each row and filter the DataFrame
print("Checking for missing videos and filtering the data...")
# .apply runs 'video_exists' on each UID. The result is cast to bool so that an
# input with zero rows still yields a boolean mask; without the cast an empty
# non-boolean Series is treated as a list of column labels and the output
# would lose its header.
rows_to_keep = df['uid'].apply(video_exists).astype(bool)
filtered_df = df.loc[rows_to_keep]

final_count = len(filtered_df)
deleted_count = initial_count - final_count

# 4. Save the cleaned data to a new file
filtered_df.to_csv(OUTPUT_CSV, index=False)

# 5. Print the final summary report
print("\n--- ✅ Process Complete ---")
print(f"Original rows: {initial_count}")
print(f"Rows with existing videos: {final_count}")
print(f"Rows deleted due to missing videos: {deleted_count}")
print(f"\nCleaned data has been saved to '{OUTPUT_CSV}'")

--- Cleaning CSV based on videos in 'D:\Des646\iSign-videos_v1.1' ---
Loaded 27305 rows from 'Gramitically_correct_dataset.csv'.
Checking for missing videos and filtering the data...

--- ✅ Process Complete ---
Original rows: 27305
Rows with existing videos: 27066
Rows deleted due to missing videos: 239

Cleaned data has been saved to 'Gramitically_correct_videos_exist.csv'


In [4]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# --- NLTK setup (run once) ---
# Each entry pairs a probe that raises LookupError when the resource is
# missing with the NLTK package name to download in that case.
for _probe, _package in (
    (lambda: stopwords.words('english'), 'stopwords'),
    (lambda: nltk.data.find('tokenizers/punkt'), 'punkt'),
):
    try:
        _probe()
    except LookupError:
        nltk.download(_package)
# --------------------------------

# --- Parameters You Can Tune ---
# Sole input: the CSV of sentences read at the start of Part 1.
INPUT_FILENAME = 'Gramitically_correct_videos_exist.csv'

# --- Stage 1 Parameters ---
# How many of the most frequent words make up the core vocabulary.
CORE_VOCAB_SIZE = 250

# --- Stage 2 Parameters ---
# Words seen fewer times than this inside the "perfect" set count as rare.
MIN_CORE_FREQUENCY = 2

# --- Stage 3 Parameters ---
# How many recombined (two-sentence) examples to generate.
NUM_AUGMENTED_EXAMPLES = 3000

# Output files: the clean seed sentences and the recombined sentences.
OUTPUT_REAL_SENTENCES = 'real_sentences.csv'
OUTPUT_RECOMBINED_SENTENCES = 'recombined_sentences.csv'
# -----------------------------------

_STOP_WORDS = None  # lazily-built cache of the English stop-word set


def _get_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use only."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(sentence):
    """Clean and tokenize a sentence.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, the result is split with ``word_tokenize`` and
    English stop words are dropped.

    Args:
        sentence: The raw sentence. Anything that is not a ``str`` (for
            example a missing value) yields an empty list.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    if not isinstance(sentence, str):
        return []
    cleaned = re.sub(r'[^\w\s]', '', sentence.lower())
    # The stop-word set is loop-invariant: fetch the cached copy instead of
    # re-reading the corpus for every sentence.
    stop_words = _get_stop_words()
    return [word for word in word_tokenize(cleaned) if word not in stop_words]

# --- Part 1: Finding "Perfect" Sentences from the Main File ---
print(f"--- Part 1: Finding 'Perfect' Sentences from '{INPUT_FILENAME}' ---")
try:
    df = pd.read_csv(INPUT_FILENAME)
except FileNotFoundError:
    print(f"❌ Error: Input file '{INPUT_FILENAME}' not found. Please make sure it's in the same directory.")
    # `exit()` is the interactive-shell helper; raising SystemExit stops this
    # cell with a non-zero status, like the `sys.exit(1)` calls used by the
    # later analysis cell.
    raise SystemExit(1)

# Tokenize every sentence and drop rows that end up with no tokens at all.
df['tokens'] = df['sentences'].apply(preprocess_text)
df = df[df['tokens'].str.len() > 0].copy()

# Define the Core Vocabulary from the most common words in the entire dataset
all_words = [word for tokens in df['tokens'] for word in tokens]
word_counts = Counter(all_words)
core_vocab = {word for word, _ in word_counts.most_common(CORE_VOCAB_SIZE)}
print(f"Defined a Core Vocabulary of the top {len(core_vocab)} words.")

# Find all sentences composed ONLY of these core words
def is_in_core_vocab(tokens):
    """Return True when every token belongs to ``core_vocab`` (True for no tokens)."""
    for token in tokens:
        if token not in core_vocab:
            return False
    return True
# The mask is cast to bool so that an empty frame still produces a boolean
# indexer (an empty non-boolean Series would be read as a list of column labels).
perfect_sentences_df = df[df['tokens'].apply(is_in_core_vocab).astype(bool)].copy()
print(f"Found {len(perfect_sentences_df)} 'perfect' sentences that only use the core vocabulary.")


# --- Part 2: Cleaning the "Perfect" Sentences to Create the "Seed" ---
print(f"\n--- Part 2: Cleaning the 'Perfect' Sentences ---")
# Count words within this "perfect" set to find the very rare ones
perfect_word_counts = Counter(word for tokens in perfect_sentences_df['tokens'] for word in tokens)
rare_words = {word for word, count in perfect_word_counts.items() if count < MIN_CORE_FREQUENCY}
print(f"Found {len(rare_words)} words with frequency < {MIN_CORE_FREQUENCY} within the perfect set. Removing them...")

# Create the final, clean "seed" DataFrame: drop every sentence that contains
# at least one rare word, then drop rows missing a uid or a sentence.
contains_rare_word = perfect_sentences_df['tokens'].apply(
    lambda tokens: any(word in rare_words for word in tokens)
).astype(bool)
pure_core_df = perfect_sentences_df[~contains_rare_word].copy()
pure_core_df = pure_core_df.dropna(subset=['uid', 'sentences'])
print(f"✅ Created a final clean 'seed' dataset of {len(pure_core_df)} sentences.")


# --- Part 3: Augmenting the "Seed" via Weighted Recombination ---
print(f"\n--- Part 3: Augmenting the 'Seed' Dataset ---")
if len(pure_core_df) < 2:
    print("❌ Error: The clean 'seed' dataset has fewer than 2 sentences, cannot perform recombination.")
    # `exit()` is the interactive-shell helper; raising SystemExit stops this
    # cell with a non-zero status, like the `sys.exit(1)` calls used by the
    # later analysis cell.
    raise SystemExit(1)

# Calculate word frequencies and rarity scores within the PURE dataset
word_counts_core = Counter(word for tokens in pure_core_df['tokens'] for word in tokens)
def calculate_rarity_score(tokens):
    """Sum the inverse frequencies of the tokens, using ``word_counts_core``.

    Tokens missing from ``word_counts_core`` are treated as having a count of
    1. An empty token list scores 0.
    """
    if not tokens:
        return 0
    score = 0
    for token in tokens:
        score += 1 / word_counts_core.get(token, 1)
    return score

pure_core_df['rarity_score'] = pure_core_df['tokens'].apply(calculate_rarity_score)
print("Calculated rarity scores to prioritize sentences with less common words.")

# Fixed seed so that re-running the notebook regenerates the same recombined set.
RANDOM_SEED = 42

# Generate new augmented examples
print(f"Generating {NUM_AUGMENTED_EXAMPLES} new recombined sentences...")
# Draw all 2 * N rows in one weighted call instead of N separate two-row calls.
# With replace=True the draws are independent, so pairing the first N rows
# with the last N gives the same distribution of (A, B) pairs.
# NOTE(review): sampling with replacement can pair a sentence with itself,
# as the original per-pair sampling also could; confirm that is acceptable.
sampled_rows = pure_core_df.sample(
    n=2 * NUM_AUGMENTED_EXAMPLES,
    weights='rarity_score',
    replace=True,
    random_state=RANDOM_SEED,
)
first_blocks = sampled_rows.iloc[:NUM_AUGMENTED_EXAMPLES]
second_blocks = sampled_rows.iloc[NUM_AUGMENTED_EXAMPLES:]

# Each example is the two sentences joined by a single space, together with
# the UIDs of both source sentences.
augmented_data = [
    {'uid1': uid_a, 'uid2': uid_b, 'text': text_a + " " + text_b}
    for uid_a, text_a, uid_b, text_b in zip(
        first_blocks['uid'], first_blocks['sentences'],
        second_blocks['uid'], second_blocks['sentences'],
    )
]
print(f"✅ Generated {len(augmented_data)} new training examples.")


# --- Part 4: Saving the Final Outputs ---
print("\n--- Part 4: Saving the final files ---")
pure_core_df[['uid', 'sentences']].to_csv(OUTPUT_REAL_SENTENCES, index=False)
print(f"✅ Original clean sentences saved to '{OUTPUT_REAL_SENTENCES}'")

# Explicit columns keep the header stable even if no examples were generated.
recombined_df = pd.DataFrame(augmented_data, columns=['uid1', 'uid2', 'text'])
recombined_df.to_csv(OUTPUT_RECOMBINED_SENTENCES, index=False)
print(f"✅ Recombined sentences saved to '{OUTPUT_RECOMBINED_SENTENCES}'")

--- Part 1: Finding 'Perfect' Sentences from 'Gramitically_correct_videos_exist.csv' ---
Defined a Core Vocabulary of the top 250 words.
Found 443 'perfect' sentences that only use the core vocabulary.

--- Part 2: Cleaning the 'Perfect' Sentences ---
Found 32 words with frequency < 2 within the perfect set. Removing them...
✅ Created a final clean 'seed' dataset of 417 sentences.

--- Part 3: Augmenting the 'Seed' Dataset ---
Calculated rarity scores to prioritize sentences with less common words.
Generating 3000 new recombined sentences...
✅ Generated 3000 new training examples.

--- Part 4: Saving the final files ---
✅ Original clean sentences saved to 'real_sentences.csv'
✅ Recombined sentences saved to 'recombined_sentences.csv'


In [1]:
import pandas as pd
from pathlib import Path
import sys

# --- Configuration ---
# PLEASE UPDATE THESE PATHS IF THEY ARE INCORRECT
# Folder scanned for '*.mp4' files; each file name without its extension is
# treated as the UID of an existing video.
VIDEOS_DIR = r"D:\Des646\Dataset_Creation\final_videos"
# CSV with a 'uid' column; it is filtered and then overwritten in place.
REAL_CSV_PATH = r"D:\Des646\Dataset_Creation\real_sentences.csv"
# CSV with 'uid1' and 'uid2' columns; it is filtered and then overwritten in place.
RECOMBINED_CSV_PATH = r"D:\Des646\Dataset_Creation\recombined_sentences.csv"
# --- End of Configuration ---

def _filter_csv_by_video_uids(csv_path, uid_columns, existing_video_uids):
    """Rewrite ``csv_path`` in place, keeping rows whose UID columns all name an existing video.

    Args:
        csv_path: ``pathlib.Path`` of the CSV to read and overwrite.
        uid_columns: Column names that must ALL hold a UID found in
            ``existing_video_uids`` for a row to be kept.
        existing_video_uids: Set of UID strings (video file stems).

    Any exception raised while processing this file is reported with a
    printed message and swallowed, so that the other CSV is still processed.
    """
    print(f"\nProcessing {csv_path.name}...")
    try:
        # Read the UID columns as strings so they compare equal to file stems.
        frame = pd.read_csv(csv_path, dtype={column: str for column in uid_columns})
        initial_rows = len(frame)

        # A row survives only if every UID column refers to an existing video.
        keep = pd.Series(True, index=frame.index, dtype=bool)
        for column in uid_columns:
            keep = keep & frame[column].isin(existing_video_uids)
        filtered = frame[keep]
        final_rows = len(filtered)

        # Save the changes back to the same file
        filtered.to_csv(csv_path, index=False)

        print(f"Finished processing {csv_path.name}.")
        print(f"  Initial rows: {initial_rows}")
        print(f"  Final rows:   {final_rows}")
        print(f"  Rows removed: {initial_rows - final_rows}")

    except Exception as e:
        print(f"Error processing {csv_path.name}: {e}")


def clean_csv_files(videos_dir=None, real_csv_path=None, recombined_csv_path=None):
    """
    Scans the video directory and filters two CSV files based on
    the existence of corresponding video files.

    Both CSV files are overwritten in place. The real-sentences CSV keeps
    rows whose 'uid' matches a video; the recombined CSV keeps rows where
    both 'uid1' and 'uid2' match a video.

    Args:
        videos_dir: Folder containing the '.mp4' files. Defaults to ``VIDEOS_DIR``.
        real_csv_path: CSV with a 'uid' column. Defaults to ``REAL_CSV_PATH``.
        recombined_csv_path: CSV with 'uid1' and 'uid2' columns. Defaults to
            ``RECOMBINED_CSV_PATH``.

    Returns:
        None. If a path is missing, an error is printed and the function
        returns before any file is modified.

    Raises:
        SystemExit: With status 1 if an unexpected error occurs outside the
            per-file processing.
    """
    try:
        # 1. Define paths (explicit arguments win over the module constants)
        videos_path = Path(VIDEOS_DIR if videos_dir is None else videos_dir)
        real_csv = Path(REAL_CSV_PATH if real_csv_path is None else real_csv_path)
        recombined_csv = Path(
            RECOMBINED_CSV_PATH if recombined_csv_path is None else recombined_csv_path
        )

        # 2. Check if paths exist
        if not videos_path.is_dir():
            print(f"Error: Video directory not found at: {videos_path}")
            return
        if not real_csv.is_file():
            print(f"Error: Real sentences CSV not found at: {real_csv}")
            return
        if not recombined_csv.is_file():
            print(f"Error: Recombined sentences CSV not found at: {recombined_csv}")
            return

        # 3. Get all existing video UIDs from the folder
        print(f"Scanning for videos in {videos_path}...")

        # Use .stem to get the filename without the .mp4 extension
        existing_video_uids = {file.stem for file in videos_path.glob('*.mp4')}

        if not existing_video_uids:
            print(f"Warning: No .mp4 files found in {videos_path}. Both CSVs will be emptied.")
        else:
            print(f"Found {len(existing_video_uids)} existing video files.")

        # 4. Process real_sentences.csv: keep rows whose 'uid' has a video
        _filter_csv_by_video_uids(real_csv, ['uid'], existing_video_uids)

        # 5. Process recombined_sentences.csv: keep rows where *both* uid1 AND
        #    uid2 have a video
        _filter_csv_by_video_uids(recombined_csv, ['uid1', 'uid2'], existing_video_uids)

        print("\nScript finished.")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        sys.exit(1)

# Entry point: run the cleanup when this code is executed as the main module.
if __name__ == "__main__":
    # To run this script:
    # 1. Make sure you have pandas installed: pip install pandas
    # 2. Save this file as clean_csv_by_videos.py
    # 3. Run it from your terminal: python clean_csv_by_videos.py
    # Note: clean_csv_files() overwrites both configured CSV files in place.
    clean_csv_files()

Scanning for videos in D:\Des646\Dataset_Creation\final_videos...
Found 330 existing video files.

Processing real_sentences.csv...
Finished processing real_sentences.csv.
  Initial rows: 417
  Final rows:   330
  Rows removed: 87

Processing recombined_sentences.csv...
Finished processing recombined_sentences.csv.
  Initial rows: 3000
  Final rows:   2022
  Rows removed: 978

Script finished.


In [None]:
import pandas as pd
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import sys

# --- NLTK setup (run once if needed) ---
# Each entry: a probe that raises LookupError when the resource is missing,
# the notice to print before downloading, and the NLTK package to download.
for _probe, _notice, _package in (
    (lambda: stopwords.words('english'), "Downloading 'stopwords' from NLTK...", 'stopwords'),
    (lambda: nltk.data.find('tokenizers/punkt'), "Downloading 'punkt' tokenizer from NLTK...", 'punkt'),
):
    try:
        _probe()
    except LookupError:
        print(_notice)
        nltk.download(_package)
# --------------------------------

# --- Configuration ---
# Input files: the real sentences and the recombined sentences.
# NOTE(review): the dataset-creation cell writes 'recombined_sentences.csv',
# while this reads 'recombined_sentences2.csv' - confirm which file is intended.
REAL_SENTENCES_FILE = 'real_sentences.csv'
RECOMBINED_SENTENCES_FILE = 'recombined_sentences2.csv'
# Output file that receives the word/frequency table.
VOCABULARY_FILE = 'vocabulary2.csv'
# ---------------------

_STOP_WORDS = None  # lazily-built cache of the English stop-word set


def _get_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use only."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(sentence):
    """
    Cleans and tokenizes a sentence.

    The text is lower-cased, punctuation is removed, the result is split with
    ``word_tokenize`` and English stop words are dropped.

    NOTE: unlike the tokenizer in the dataset-creation cell, this version also
    discards tokens that are not purely alphabetic (``str.isalpha``), so
    numbers and mixed tokens are not counted here.

    Args:
        sentence: The raw sentence. Anything that is not a ``str`` yields an
            empty list.

    Returns:
        list[str]: The remaining alphabetic, non-stop-word tokens in order.
    """
    if not isinstance(sentence, str):
        return []
    cleaned = re.sub(r'[^\w\s]', '', sentence.lower())  # Remove punctuation
    # The stop-word set is loop-invariant: fetch the cached copy instead of
    # re-reading the corpus for every sentence.
    stop_words = _get_stop_words()
    return [
        word
        for word in word_tokenize(cleaned)
        if word not in stop_words and word.isalpha()
    ]

# --- Main Analysis Logic ---
print("--- Starting Word Frequency Analysis ---")

# 1. Load both datasets
try:
    real_df = pd.read_csv(REAL_SENTENCES_FILE)
    recombined_df = pd.read_csv(RECOMBINED_SENTENCES_FILE)
    print(f"✅ Loaded {len(real_df)} real sentences and {len(recombined_df)} recombined sentences.")
except FileNotFoundError as e:
    print(f"❌ Error: Could not find a required file. Make sure both '{REAL_SENTENCES_FILE}' and '{RECOMBINED_SENTENCES_FILE}' are present.")
    print(f"Details: {e}")
    sys.exit(1)
except pd.errors.EmptyDataError as e:
    print(f"❌ Error: One of the files is empty. {e}")
    sys.exit(1)


# 2. Combine the text from both files into a single Series
real_sentences = real_df['sentences']
recombined_sentences = recombined_df['text']
all_sentences = pd.concat([real_sentences, recombined_sentences], ignore_index=True)

print(f"Analyzing a total of {len(all_sentences)} sentences.")

# 3. Preprocess all sentences and create a flat list of all words
print("Processing text and tokenizing all sentences...")
# One token list per sentence
tokenized_sentences = all_sentences.apply(preprocess_text)

# Flatten the per-sentence token lists into a single list of all words
all_words = [word for sentence_tokens in tokenized_sentences for word in sentence_tokens]

if not all_words:
    print("❌ No words were found after processing. Cannot perform analysis.")
    # Non-zero status: this is a failure path, like the load errors above.
    sys.exit(1)

# 4. Count the frequency of each word
word_counts = Counter(all_words)

# --- 5. NEW: Save vocabulary to CSV file ---
print(f"\nSaving vocabulary to {VOCABULARY_FILE}...")
try:
    # Convert the Counter object to a DataFrame
    vocab_df = pd.DataFrame(word_counts.items(), columns=['word', 'freq'])

    # Sort by frequency in descending order
    vocab_df = vocab_df.sort_values(by='freq', ascending=False)

    # Save to CSV
    vocab_df.to_csv(VOCABULARY_FILE, index=False)

    print(f"✅ Vocabulary file with {len(vocab_df)} words saved successfully.")
except Exception as e:
    # Best effort: a failed save is reported but does not stop the report below.
    print(f"❌ Error saving vocabulary file: {e}")
# ---------------------------------------------

# 6. Generate and print the final report
print("\n--- 📊 Final Dataset Analysis Report ---")
print(f"Total Unique Words (Vocabulary Size): {len(word_counts)}")

# Rank the vocabulary once (most frequent first) and reuse it below.
ranked_words = word_counts.most_common()

# Find the lowest and highest frequencies
most_common_word, highest_freq = ranked_words[0]
least_common_word, lowest_freq = ranked_words[-1]

print(f"Highest Frequency: '{most_common_word}' appeared {highest_freq} times.")
print(f"Lowest Frequency:  '{least_common_word}' appeared {lowest_freq} times.")

# Find all words that share the lowest frequency
words_with_lowest_freq = [word for word, count in word_counts.items() if count == lowest_freq]

print(f"\nThere are {len(words_with_lowest_freq)} word(s) with the lowest frequency of {lowest_freq}:")
# Print a sample of them if there are too many
print(words_with_lowest_freq[:20]) # Shows up to the first 20

print("\n--- Top 15 Most Common Words ---")
for word, count in ranked_words[:15]:
    print(f"- {word}: {count} times")

print("\n--- Analysis Complete ---")

--- Starting Word Frequency Analysis ---
✅ Loaded 330 real sentences and 2022 recombined sentences.
Analyzing a total of 2352 sentences.
Processing text and tokenizing all sentences...

Saving vocabulary to vocabulary.csv...
✅ Vocabulary file with 174 words saved successfully.

--- 📊 Final Dataset Analysis Report ---
Total Unique Words (Vocabulary Size): 174
Highest Frequency: 'know' appeared 331 times.
Lowest Frequency:  'sign' appeared 17 times.

There are 1 word(s) with the lowest frequency of 17:
['sign']

--- Top 15 Most Common Words ---
- know: 331 times
- let: 281 times
- people: 260 times
- picture: 240 times
- time: 235 times
- shocking: 205 times
- see: 200 times
- case: 194 times
- tell: 185 times
- video: 184 times
- deaf: 176 times
- went: 169 times
- first: 156 times
- together: 144 times
- however: 135 times

--- Analysis Complete ---
