In [130]:
import pandas as pd
import string
import re
from collections import Counter

In [131]:
def replace_ampersand(df_in):
    """
    Swap every ampersand (&) in the 'text' column for the word 'and'.

    Intended to run on the raw text, ahead of the other cleaning steps.
    The work is done on a copy of `df_in`, which is what gets returned.
    If there is no 'text' column, a warning is printed and the copy is
    returned unchanged.
    """
    print("\n--- Applying Rule: Replace Ampersands (& -> 'and') ---")
    result = df_in.copy()

    # Guard clause: nothing to rewrite without a 'text' column.
    if 'text' not in result.columns:
        print("⚠️ 'text' column not found.")
        return result

    raw_text = result['text']

    # Tally the rows that contain '&' for the report (missing values count as no match).
    hit_count = raw_text.str.contains('&', na=False).sum()

    # Literal (non-regex) replacement; the padding spaces keep neighbouring words apart.
    result['text'] = raw_text.str.replace('&', ' and ', regex=False)
    print(f"➡️ Found and replaced ampersands in {hit_count} sentences.")

    return result

In [132]:
def preprocess_text_column(df_in):
    """
    Build a normalised 'cleaned_text' column from the 'text' column.

    Steps, applied in order:
    1. Cast every value to str and lowercase it.
    2. Turn each character of string.punctuation into a space.
    3. Squash runs of whitespace into one space and trim both ends.

    `df_in` is left untouched; a copy carrying the new column is returned.
    """
    print("\n--- Applying Universal Preprocessing (v2) ---")
    result = df_in.copy()

    # Translation table: every ASCII punctuation mark becomes a single blank.
    punct_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))

    result['cleaned_text'] = (
        result['text']
        .astype(str)                           # non-string values are stringified too
        .str.lower()
        .str.translate(punct_to_space)
        .str.replace(r'\s+', ' ', regex=True)  # collapse whitespace runs
        .str.strip()                           # drop leading/trailing blanks
    )

    print("✅ Text converted to lowercase, punctuation replaced, and spaces normalized.")
    return result

In [133]:
def remove_exact_duplicates(df_in):
    """
    Keep only the first row for each distinct 'cleaned_text' value.

    Later rows whose 'cleaned_text' repeats an earlier one are dropped.
    Returns a filtered copy; the original index labels are preserved.
    """
    print("\n--- Applying Rule: Remove Exact Duplicates ---")
    rows_before = len(df_in)

    # True for every repeat of an already-seen 'cleaned_text'; first occurrences stay False.
    is_repeat = df_in.duplicated(subset=['cleaned_text'], keep='first')
    deduped = df_in.loc[~is_repeat].copy()

    rows_after = len(deduped)
    print(f"➡️ Sentences removed: {rows_before - rows_after}")
    print(f"➡️ Sentences remaining: {rows_after}")
    return deduped

In [134]:
def remove_non_alpha_sentences(df_in):
    """
    Removes any sentence whose 'cleaned_text' contains characters other than
    lowercase ASCII letters (a-z) and whitespace.

    Rows whose 'cleaned_text' is missing (NaN/None) are treated as invalid and
    removed as well, instead of raising an error.

    Args:
        df_in: DataFrame with a 'cleaned_text' column of strings.

    Returns:
        A filtered copy of `df_in` with the same columns; `df_in` itself is
        not modified.
    """
    print("\n--- Applying Rule: Remove Sentences with Non-Alphabetic Characters ---")
    initial_rows = len(df_in)

    # The pattern '[^a-z\s]' matches any character that is NOT a-z or whitespace.
    # The vectorised .str.contains replaces a per-row Python callback;
    # na=True flags missing values as invalid, and astype(bool) guarantees a
    # proper boolean mask even for an empty or object-dtype column.
    has_invalid_char = (
        df_in['cleaned_text']
        .str.contains(r'[^a-z\s]', regex=True, na=True)
        .astype(bool)
    )
    df_out = df_in[~has_invalid_char].copy()

    final_rows = len(df_out)
    print(f"➡️ Sentences removed: {initial_rows - final_rows}")
    print(f"➡️ Sentences remaining: {final_rows}")
    return df_out

In [135]:
def filter_by_character_length(df_in, min_length=4, max_length=120):
    """
    Keep only the rows whose 'cleaned_text' has between `min_length` and
    `max_length` characters (both bounds inclusive).

    Returns a filtered copy of `df_in`.
    """
    print(f"\n--- Applying Rule: Filter by Character Length (between {min_length} and {max_length}) ---")
    rows_before = len(df_in)

    # Measure each sentence once, then test both bounds in a single inclusive check.
    char_counts = df_in['cleaned_text'].str.len()
    within_bounds = char_counts.between(min_length, max_length)
    kept = df_in.loc[within_bounds].copy()

    rows_after = len(kept)
    print(f"➡️ Sentences removed: {rows_before - rows_after}")
    print(f"➡️ Sentences remaining: {rows_after}")
    return kept

In [136]:
def filter_by_word_count(df_in, min_words=3):
    """
    Drop sentences whose 'cleaned_text' has fewer than `min_words` words.

    Words are the whitespace-separated tokens of 'cleaned_text'.
    Returns a filtered copy of `df_in`.
    """
    print(f"\n--- Applying Rule: Filter by Word Count (>= {min_words} words) ---")
    rows_before = len(df_in)

    # Number of whitespace-separated tokens in each sentence.
    tokens_per_row = df_in['cleaned_text'].str.split().str.len()

    # A row survives when it reaches the minimum token count.
    has_enough_words = tokens_per_row.ge(min_words)
    kept = df_in.loc[has_enough_words].copy()

    rows_after = len(kept)
    print(f"➡️ Sentences removed: {rows_before - rows_after}")
    print(f"➡️ Sentences remaining: {rows_after}")
    return kept

In [137]:
def remove_high_repetition_sentences(df_in, threshold=0.7):
    """
    Removes sentences that are overly repetitive based on a calculated ratio.
    The ratio is the frequency of the most common word divided by the total number of words.

    Args:
        df_in: DataFrame with a 'cleaned_text' column of strings.
        threshold: Highest ratio a sentence may have and still be kept;
            rows with a ratio strictly greater than this are removed.

    Returns:
        A filtered copy of `df_in` with the same columns. `df_in` is not
        modified: the ratio lives in a standalone Series rather than in a
        temporary 'repeat_ratio' column on the caller's frame, so an existing
        column of that name is neither overwritten nor dropped.
    """
    print(f"\n--- Applying Rule: Remove Sentences with Repetition Ratio > {threshold} ---")
    initial_rows = len(df_in)

    def calculate_repeat_ratio(text):
        """Calculates the repetition ratio for a single sentence."""
        words = text.split()
        total_words = len(words)

        # Empty or single-word sentences cannot be "repetitive"; this also
        # avoids a division by zero.
        if total_words <= 1:
            return 0.0

        # Count of the most frequent word relative to the sentence length.
        most_common_count = Counter(words).most_common(1)[0][1]
        return most_common_count / total_words

    # Keep the ratios outside the frame so the input is never mutated.
    # astype(float) keeps the comparison well-defined for an empty frame.
    repeat_ratio = df_in['cleaned_text'].apply(calculate_repeat_ratio).astype(float)

    # Keep sentences whose ratio is below or equal to the threshold.
    df_out = df_in[repeat_ratio <= threshold].copy()

    final_rows = len(df_out)
    print(f"➡️ Sentences removed: {initial_rows - final_rows}")
    print(f"➡️ Sentences remaining: {final_rows}")
    return df_out

In [138]:
# --- Load the Dataset ---
# Read the source CSV; only a missing file is handled here, any other read error propagates.
try:
    df = pd.read_csv('iSign_v1.1.csv')
except FileNotFoundError:
    print("❌ Error: 'iSign_v1.1.csv' not found. Please make sure the file is in the correct directory.")
    # Sentinel for "nothing loaded". Later cells call DataFrame methods on df
    # and will fail if it is still None.
    df = None
else:
    # Remember the starting size so it can be reported.
    original_rows = len(df)
    print("✅ Successfully loaded the dataset.")
    print(f"Initial number of sentences: {original_rows}")

✅ Successfully loaded the dataset.
Initial number of sentences: 127237


In [139]:
# Rewrite '&' as ' and ' in the raw 'text' column. The function works on a copy, so `df` keeps the loaded data.
df_processed=replace_ampersand(df)


--- Applying Rule: Replace Ampersands (& -> 'and') ---
➡️ Found and replaced ampersands in 2120 sentences.


In [140]:
# Add the lowercase, punctuation-free 'cleaned_text' column that every later filter reads.
df_processed = preprocess_text_column(df_processed)


--- Applying Universal Preprocessing (v2) ---
✅ Text converted to lowercase, punctuation replaced, and spaces normalized.


In [141]:
# Keep rows whose 'cleaned_text' length falls within the function defaults (4 to 120 characters, inclusive).
df_processed = filter_by_character_length(df_processed)


--- Applying Rule: Filter by Character Length (between 4 and 120) ---
➡️ Sentences removed: 4241
➡️ Sentences remaining: 122996


In [142]:
# Keep rows with at least 3 whitespace-separated words (the default min_words).
df_processed = filter_by_word_count(df_processed)


--- Applying Rule: Filter by Word Count (>= 3 words) ---
➡️ Sentences removed: 5998
➡️ Sentences remaining: 116998


In [143]:
# Drop rows whose 'cleaned_text' repeats an earlier row, keeping the first occurrence.
df_processed = remove_exact_duplicates(df_processed)


--- Applying Rule: Remove Exact Duplicates ---
➡️ Sentences removed: 2348
➡️ Sentences remaining: 114650


In [144]:
# Drop rows whose 'cleaned_text' holds anything besides a-z and whitespace (digits, for example, survive preprocessing).
df_processed = remove_non_alpha_sentences(df_processed)


--- Applying Rule: Remove Sentences with Non-Alphabetic Characters ---
➡️ Sentences removed: 25484
➡️ Sentences remaining: 89166


In [145]:
# Drop rows where the most frequent word makes up more than 70% of the words (default threshold=0.7).
df_processed=remove_high_repetition_sentences(df_processed)


--- Applying Rule: Remove Sentences with Repetition Ratio > 0.7 ---
➡️ Sentences removed: 32
➡️ Sentences remaining: 89134


In [148]:
# Final report: compare the loaded frame with the fully filtered one.
print("\n\n--- ✅ Filtering Complete ---")
sentences_kept = len(df_processed)
sentences_dropped = len(df) - sentences_kept
print(f"Total sentences removed in this step: {sentences_dropped}")
print(f"Final number of sentences after this step: {sentences_kept}")



--- ✅ Filtering Complete ---
Total sentences removed in this step: 38103
Final number of sentences after this step: 89134


In [147]:
# Persist the filtered frame; index=False leaves the DataFrame index out of the CSV.
df_processed.to_csv('cleaned_data.csv', index=False)