In [17]:
import pandas as pd
import os

df = pd.read_csv('../data/processed/toxic_data_cleaned.csv', encoding='utf-8')

In [18]:
df.shape

(49172, 3)

In [19]:
df.head()

Unnamed: 0,Text,Toxic_flag,Toxic_type
0,champu,True,Threatening
1,champali,True,Threatening
2,champutha,True,Threatening
3,champesta,True,Threatening
4,champestam,True,Threatening


In [6]:
# check no of categories in Toxic flag
df['Toxic_flag'].value_counts()

Toxic_flag
FALSE            44520
True              1686
False              650
TRUE               359
Common_Insult        2
Name: count, dtype: int64

In [20]:
#check toxic flag = 'Common Insult'
df[df['Toxic_flag'] == 'Common_Insult']

Unnamed: 0,Text,Toxic_flag,Toxic_type


In [9]:
# change the rows if Toxic_flag is 'Common_Insult' to 'TRUE' and update that particular records 'Toxic_type' to 'Common_Insult'
df.loc[df['Toxic_flag'] == 'Common_Insult', 'Toxic_flag'] = 'TRUE'
df.loc[df['Toxic_flag'] == 'TRUE', 'Toxic_type'] = 'Common_Insult'  


In [10]:
df['Toxic_flag'].value_counts()

Toxic_flag
FALSE    44520
True      1686
False      650
TRUE       361
Name: count, dtype: int64

In [11]:
# Also 'Toxic_flag' has 'FALSE' and 'TRUE' values, in various cases, Make all 'Toxic_flag' values to upper case
df['Toxic_flag'] = df['Toxic_flag'].str.upper()

In [12]:
df['Toxic_flag'].value_counts()

Toxic_flag
FALSE    45170
TRUE      2047
Name: count, dtype: int64

In [13]:
df['Toxic_type'].value_counts()

Toxic_type
Sexual_Abuse            899
none                    650
Common_Insult           495
Mixed_Toxicity          190
Threatening             164
Harassment_Bullying     105
Profanity_Generic        99
Religious_Caste_Slur     96
Name: count, dtype: int64

In [16]:
#total Toxic comments
df[df['Toxic_flag'] == 'TRUE'].shape[0]
#total Non Toxic comments
df[df['Toxic_flag'] == 'FALSE'].shape[0]
#total toxic_type comments
df[df['Toxic_flag'] == 'TRUE']['Toxic_type'].value_counts()
#print above values in a table


Toxic_type
Sexual_Abuse            899
Common_Insult           495
Mixed_Toxicity          190
Threatening             164
Harassment_Bullying     104
Profanity_Generic        99
Religious_Caste_Slur     96
Name: count, dtype: int64

In [None]:
# save the updated dataframe
df.to_csv('../data/processed/toxic_data_cleaned.csv', index=False, encoding='utf-8')

In [77]:
import pandas as pd
import re

# Load the dataset
dataset_path = "../data/raw/toxicity_data.csv"
df = pd.read_csv(dataset_path)

# Function to convert English text to lowercase while keeping other characters as they are
def process_text(text):
    if isinstance(text, str):
        # Convert only English letters to lowercase
        text = re.sub(r'[a-zA-Z]+', lambda x: x.group(0).lower(), text)
        return text
    return text

# Apply the function to all columns in the dataframe
df = df.applymap(process_text)

# Save the updated dataset back to the same location
df.to_csv(dataset_path, index=False)

print("‚úÖ Dataset saved with English letters in lowercase.")

  df = df.applymap(process_text)


‚úÖ Dataset saved with English letters in lowercase.


In [78]:
import pandas as pd

# Load dataset
df = pd.read_csv("../data/raw/toxicity_data.csv")

# Identify duplicate texts (including the first occurrence)
duplicate_texts = df[df.duplicated(subset='Text', keep=False)]

# Sort for better readability
duplicate_texts_sorted = duplicate_texts.sort_values(by='Text')

# Show the head (e.g., first 20 rows)
print("üßæ Duplicate groups (first 20 rows):")
print(duplicate_texts_sorted.head(20)[['Text', 'Toxic_flag', 'Toxic_type']].to_string(index=False))


üßæ Duplicate groups (first 20 rows):
   Text Toxic_flag Toxic_type
   abba      false       none
   abba      false       none
   akka      false       none
   akka      false       none
   amma      false       none
   amma      false       none
  atanu      false       none
  atanu      false       none
  avunu      false       none
  avunu      false       none
   babu      false       none
   babu      false       none
   guru      false       none
   guru      false       none
     he      false       none
     he      false       none
      i      false       none
      i      false       none
ivvandi      false       none
ivvandi      false       none


In [79]:
import pandas as pd

# Load dataset
path = "../data/raw/toxicity_data.csv"
df = pd.read_csv(path)

# Get count before cleaning
before_count = len(df)

# Find all duplicate texts (including all instances)
duplicates_all = df[df.duplicated(subset='Text', keep=False)]

# Step 1: Group by 'Text'
def dedup_logic(group):
    if group['Toxic_flag'].any():
        # If any are toxic, keep only one toxic
        return group[group['Toxic_flag'] == True].head(1)
    else:
        # Else keep only one non-toxic
        return group.head(1)

deduped_df = duplicates_all.groupby('Text', group_keys=False).apply(dedup_logic)

# Step 2: Get all unique texts from deduped version
texts_to_keep = deduped_df['Text'].unique()

# Step 3: Remove all records with duplicate Texts, and add back only selected rows
df_cleaned = pd.concat([
    df[~df['Text'].isin(duplicates_all['Text'])],
    deduped_df
], ignore_index=True)

# Count after cleaning
after_count = len(df_cleaned)

# Show summary
print(f"\n‚úÖ Records before cleaning: {before_count}")
print(f"‚úÖ Records after cleaning: {after_count}")
print(f"üóëÔ∏è Records removed: {before_count - after_count}")

# Optionally save cleaned data back to the same file
df_cleaned.to_csv(path, index=False)



‚úÖ Records before cleaning: 57321
‚úÖ Records after cleaning: 57265
üóëÔ∏è Records removed: 56


  deduped_df = duplicates_all.groupby('Text', group_keys=False).apply(dedup_logic)


In [80]:
import pandas as pd
import re

# Load dataset while skipping bad lines
path = "../data/raw/toxicity_data.csv"
df = pd.read_csv(path, on_bad_lines='skip')

# Convert all string columns to lowercase
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

# Remove special characters in 'Text' column
df['Text'] = df['Text'].str.replace(r"[.,'\"!;()\[\]{}<>/%^&_+=\\|-]", "", regex=True)

# Count before cleaning
before_count = len(df)

# Find all duplicate texts (including all instances)
duplicates_all = df[df.duplicated(subset='Text', keep=False)]

# Step 1: Group by 'Text'
def dedup_logic(group):
    if group['Toxic_flag'].any():
        # If any are toxic, keep only one toxic
        return group[group['Toxic_flag'] == True].head(1)
    else:
        # Else keep only one non-toxic
        return group.head(1)

deduped_df = duplicates_all.groupby('Text', group_keys=False).apply(dedup_logic)

# Step 2: Remove original duplicate texts and add cleaned unique rows
df_cleaned = pd.concat([
    df[~df['Text'].isin(duplicates_all['Text'])],
    deduped_df
], ignore_index=True)

# Count after cleaning
after_count = len(df_cleaned)

# Summary
print(f"\n‚úÖ Records before cleaning: {before_count}")
print(f"‚úÖ Records after cleaning: {after_count}")
print(f"üóëÔ∏è Records removed: {before_count - after_count}")

# Save the cleaned file to the same location
df_cleaned.to_csv(path, index=False)


  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)
  deduped_df = duplicates_all.groupby('Text', group_keys=False).apply(dedup_logic)



‚úÖ Records before cleaning: 57265
‚úÖ Records after cleaning: 57265
üóëÔ∏è Records removed: 0


# combine all csv's 
toxicity_data --> Text,Toxic_flag,Toxic_type,lang
toxicity_data 2 --> Text,User,Red_Flag,Label,Toxic_flag,Toxic_type,lang
toxicity_data 3 --> Text,Label,Toxic_flag,Toxic_type,lang

In [81]:
import pandas as pd
import unicodedata
import re

# File paths
paths = [
    "../data/raw/toxicity_data.csv",
    "../data/raw/toxicity_data2.csv",
    "../data/raw/toxicity_data3.csv"
]

# Required columns
final_columns = ['Text', 'Toxic_flag', 'Toxic_type', 'lang']

# Load and harmonize files
dfs = []
for path in paths:
    df = pd.read_csv(path, on_bad_lines='skip')
    
    # Normalize missing required columns
    for col in final_columns:
        if col not in df.columns:
            df[col] = None
    df = df[final_columns]
    dfs.append(df)

# Combine datasets
combined_df = pd.concat(dfs, ignore_index=True)

# Normalize and clean text
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize("NFKC", text).lower().strip()
    text = re.sub(r"[.,'\"!;()\[\]{}<>/%^&_+=\\|-]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

combined_df['Text'] = combined_df['Text'].apply(clean_text)

# Ensure Toxic_flag is boolean
combined_df['Toxic_flag'] = combined_df['Toxic_flag'].astype(str).str.lower().map({'true': True, 'false': False, '1': True, '0': False})
combined_df['Toxic_flag'] = combined_df['Toxic_flag'].fillna(False)

# Store original counts
original_toxic = combined_df['Toxic_flag'].sum()
original_non_toxic = len(combined_df) - original_toxic

# Deduplication
duplicates_all = combined_df[combined_df.duplicated(subset='Text', keep=False)]

def dedup_logic(group):
    if group['Toxic_flag'].any():
        return group[group['Toxic_flag'] == True].head(1)
    else:
        return group.head(1)

deduped_df = duplicates_all.groupby('Text', group_keys=False).apply(dedup_logic)

# Merge cleaned duplicates with unique entries
final_df = pd.concat([
    combined_df[~combined_df['Text'].isin(duplicates_all['Text'])],
    deduped_df
], ignore_index=True)

# New counts
new_toxic = final_df['Toxic_flag'].sum()
new_non_toxic = len(final_df) - new_toxic

# Calculate losses
loss_toxic = original_toxic - new_toxic
loss_non_toxic = original_non_toxic - new_non_toxic

pct_loss_toxic = (loss_toxic / original_toxic * 100) if original_toxic else 0
pct_loss_non_toxic = (loss_non_toxic / original_non_toxic * 100) if original_non_toxic else 0

# Save to file
final_df.to_csv("../data/raw/toxicity_data.csv", index=False)

# Report
print("üìä Summary of Toxic/Non-Toxic Records")
print(f"Before cleaning:  Toxic: {original_toxic}, Non-Toxic: {original_non_toxic}")
print(f"After cleaning:   Toxic: {new_toxic}, Non-Toxic: {new_non_toxic}")
print(f"Records removed:  Toxic: {loss_toxic} ({pct_loss_toxic:.2f}%), Non-Toxic: {loss_non_toxic} ({pct_loss_non_toxic:.2f}%)")
print(f"‚úÖ Final cleaned dataset saved as 'toxicity_data.csv'")



FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/toxicity_data2.csv'

In [82]:
df = pd.read_csv('../data/raw/toxicity_data.csv', encoding='utf-8')
df.head()
df.shape
df['Toxic_flag'].value_counts()

Toxic_flag
false         45419
true          11845
toxic_flag        1
Name: count, dtype: int64

In [83]:
import pandas as pd
import random

# Load the dataset
df = pd.read_csv("../data/raw/toxicity_data.csv")

# Filter: Text length < 100
df = df[df['Text'].str.len() < 100]

# Drop missing required values for fair sampling
df = df.dropna(subset=['Text', 'Toxic_flag', 'Toxic_type', 'lang'])

# Ensure string types
df['Toxic_type'] = df['Toxic_type'].astype(str)
df['lang'] = df['lang'].astype(str)

# Required categories
toxic_flags = df['Toxic_flag'].dropna().unique().tolist()
toxic_types = df['Toxic_type'].dropna().unique().tolist()
langs = df['lang'].dropna().unique().tolist()

# Initialize final set
final_records = pd.DataFrame(columns=df.columns)

# At least one of each toxic_flag
for tf in toxic_flags:
    sample = df[df['Toxic_flag'] == tf].sample(1, random_state=42)
    final_records = pd.concat([final_records, sample])

# At least one of each toxic_type
for tt in toxic_types:
    if tt not in final_records['Toxic_type'].values:
        sample = df[df['Toxic_type'] == tt].sample(1, random_state=42)
        final_records = pd.concat([final_records, sample])

# At least one of each lang
for lg in langs:
    if lg not in final_records['lang'].values:
        sample = df[df['lang'] == lg].sample(1, random_state=42)
        final_records = pd.concat([final_records, sample])

# Fill remaining to 10
remaining = 10 - len(final_records)
if remaining > 0:
    remaining_samples = df[~df.index.isin(final_records.index)].sample(remaining, random_state=42)
    final_records = pd.concat([final_records, remaining_samples])

# Drop duplicates, shuffle
final_records = final_records.drop_duplicates().sample(10, random_state=42)

# Display as CSV-style output
print("Text,Toxic_flag,Toxic_type,lang")
for _, row in final_records.iterrows():
    text = row['Text'].replace("\n", " ").replace(",", " ")
    print(f"{text},{row['Toxic_flag']},{row['Toxic_type']},{row['lang']}")


Text,Toxic_flag,Toxic_type,lang
telugu industry naa modda kudavandi raa,true,sexual_abuse,tenglish
‡∞â‡∞Ç‡∞°‡∞ø,false,none,telugu
supporting a rauist politician shame,true,mixed_toxicity,english
‡∞®‡±Ä ‡∞é‡∞¶‡±ç‡∞¶‡±Å ‡∞≤‡∞æ‡∞Ç‡∞ü‡∞ø ‡∞Æ‡±ä‡∞ñ‡∞Ç ‡∞ö‡±Ç‡∞∏‡±ç‡∞§‡±á ‡∞®‡∞µ‡±ç‡∞µ‡±ä‡∞∏‡±ç‡∞§‡±Å‡∞Ç‡∞¶‡∞ø idiot,true,political_toxicity,telugu
‡∞Æ‡±Ä ‡∞ï‡±Å‡∞≤‡∞Ç ‡∞µ‡∞æ‡∞≥‡±ç‡∞≥‡∞Ç‡∞§‡∞æ ‡∞ö‡∞ö‡±ç‡∞ö‡∞ø‡∞® ‡∞ï‡±Å‡∞ï‡±ç‡∞ï‡∞≤‡±Å,true,caste_slur,telugu
‡∞Ö‡∞¶‡∞ø ‡∞ö‡±Ç‡∞°‡∞ó‡∞æ‡∞®‡±á ‡∞®‡±Ä ‡∞¨‡±Å‡∞¶‡±ç‡∞ß‡∞ø ‡∞™‡±ãi‡±Å‡∞Ç‡∞¶‡∞ø,true,gender_targeted,telugu
maciga batch ni samajam lo pedite champeyali,true,religious_caste_slur,tenglish
‡∞®‡±Ä ‡∞Ø‡∞Æ‡±ç‡∞Æ‡∞®‡∞ø ‡∞¶‡±Ü‡∞Ç** ‡∞ó‡±Å‡∞Ç‡∞ü‡∞æ,true,profanity_sexual,telugu
text,toxic_flag,toxic_type,lang
meeru andharu kalisi desanni nasanam chesthunnaru,true,political_criticism_abusive,telugu


In [59]:
import pandas as pd
import unicodedata
import re
import os
from glob import glob

# Directory path
data_dir = "../data/ref/Synthetic/"
csv_files = glob(os.path.join(data_dir, "*.csv"))

# Required columns
final_columns = ['Text', 'Toxic_flag', 'Toxic_type', 'lang']

# Load and harmonize all CSV files
dfs = []
for path in csv_files:
    df = pd.read_csv(path, on_bad_lines='skip')
    print(f"Loaded {path} with {len(df)} records")

    # Add missing required columns
    for col in final_columns:
        if col not in df.columns:
            df[col] = None

    # Subset to required columns
    df = df[final_columns]
    dfs.append(df)

# Combine all CSVs
combined_df = pd.concat(dfs, ignore_index=True)

# Normalize and clean the 'Text' column
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize("NFKC", text).lower().strip()
    text = re.sub(r"[.,'\"!;:()\[\]{}<>/%^&_+=\\|@#*$~`?]", "", text)  # remove special characters
    text = re.sub(r"\s+", " ", text)  # normalize whitespace
    return text

combined_df['Text'] = combined_df['Text'].apply(clean_text)

# Normalize Toxic_flag values
combined_df['Toxic_flag'] = combined_df['Toxic_flag'].astype(str).str.lower().map({
    'true': True, 'false': False, '1': True, '0': False
})
combined_df['Toxic_flag'] = combined_df['Toxic_flag'].fillna(False)

# Store counts before cleaning
original_toxic = combined_df['Toxic_flag'].sum()
original_non_toxic = len(combined_df) - original_toxic

# Deduplication logic
duplicates_all = combined_df[combined_df.duplicated(subset='Text', keep=False)]

def dedup_logic(group):
    if group['Toxic_flag'].any():
        return group[group['Toxic_flag'] == True].head(1)
    else:
        return group.head(1)

deduped_df = duplicates_all.groupby('Text', group_keys=False).apply(dedup_logic)

# Merge with non-duplicate entries
final_df = pd.concat([
    combined_df[~combined_df['Text'].isin(duplicates_all['Text'])],
    deduped_df
], ignore_index=True)

# Final counts
new_toxic = final_df['Toxic_flag'].sum()
new_non_toxic = len(final_df) - new_toxic

# Loss calculations
loss_toxic = original_toxic - new_toxic
loss_non_toxic = original_non_toxic - new_non_toxic

pct_loss_toxic = (loss_toxic / original_toxic * 100) if original_toxic else 0
pct_loss_non_toxic = (loss_non_toxic / original_non_toxic * 100) if original_non_toxic else 0

# Save cleaned file
output_path = os.path.join(data_dir, "toxicity_data.csv")
final_df.to_csv(output_path, index=False)

# Report
print("\nüìä Summary of Toxic/Non-Toxic Records")
print(f"Before cleaning:  Toxic: {original_toxic}, Non-Toxic: {original_non_toxic}")
print(f"After cleaning:   Toxic: {new_toxic}, Non-Toxic: {new_non_toxic}")
print(f"Records removed:  Toxic: {loss_toxic} ({pct_loss_toxic:.2f}%), Non-Toxic: {loss_non_toxic} ({pct_loss_non_toxic:.2f}%)")
print(f"‚úÖ Final cleaned dataset saved as '{output_path}'")


Loaded ../data/ref/Synthetic\final_hate_speech_dataset_10000.csv with 10000 records
Loaded ../data/ref/Synthetic\hate_speech_data.csv with 7500 records
Loaded ../data/ref/Synthetic\hate_speech_dataset_10000.csv with 9999 records
Loaded ../data/ref/Synthetic\hate_speech_telugu_fanwars.csv with 1004 records
Loaded ../data/ref/Synthetic\hate_speech_telugu_fanwars_100.csv with 99 records
Loaded ../data/ref/Synthetic\hate_speech_telugu_fanwars_50.csv with 50 records
Loaded ../data/ref/Synthetic\rich_hate_speech_dataset_10000.csv with 11199 records

üìä Summary of Toxic/Non-Toxic Records
Before cleaning:  Toxic: 32515, Non-Toxic: 7336
After cleaning:   Toxic: 9635, Non-Toxic: 1002
Records removed:  Toxic: 22880 (70.37%), Non-Toxic: 6334 (86.34%)
‚úÖ Final cleaned dataset saved as '../data/ref/Synthetic/toxicity_data.csv'


  deduped_df = duplicates_all.groupby('Text', group_keys=False).apply(dedup_logic)


In [75]:
import pandas as pd
import unicodedata
import re
import os
from glob import glob

# === CONFIGURATION ===
input_dir = "../data/raw/"  # Change this path if needed
output_file = "../data/ref/Synthetic/toxicity_data.csv"
final_columns = ['Text', 'Toxic_flag', 'Toxic_type', 'lang']

# List of unwanted phrases (add more as needed)
unwanted_phrases = ['‡∞™‡∞ø‡∞ö‡±ç‡∞ö‡∞ø‡∞ó‡∞æ ‡∞â‡∞Ç‡∞¶‡∞ø', '‡∞®‡∞æ‡∞ü‡∞ï‡±Ä‡∞Ø‡∞Ç‡∞ó‡∞æ ‡∞â‡∞®‡±ç‡∞®‡∞æ‡∞Ø‡∞ø ', '‡∞®‡±Ä ‡∞Æ‡∞æ‡∞ü‡∞≤‡±Å ‡∞®‡∞æ‡∞ü‡∞ï‡±Ä‡∞Ø‡∞Ç‡∞ó‡∞æ']

# === STEP 1: Read and combine all CSVs ===
csv_files = glob(os.path.join(input_dir, "*.csv"))
dfs = []

for path in csv_files:
    df = pd.read_csv(path, on_bad_lines='skip')

    # Add missing required columns if any
    for col in final_columns:
        if col not in df.columns:
            df[col] = None
    df = df[final_columns]
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)

# === STEP 2: Normalize and clean Text ===
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize("NFKC", text).lower().strip()
    text = re.sub(r"[.,'\"!;()\[\]{}<>/%^&_+=\\|-]", "", text)  # remove special characters
    text = re.sub(r"\s+", " ", text)  # normalize whitespace
    return text

combined_df['Text'] = combined_df['Text'].apply(clean_text)

# === STEP 3: Convert Toxic_flag to boolean ===
combined_df['Toxic_flag'] = combined_df['Toxic_flag'].astype(str).str.lower().map({
    'true': True, 'false': False, '1': True, '0': False
})
combined_df['Toxic_flag'] = combined_df['Toxic_flag'].fillna(False)

# === STEP 4: Store original counts ===
original_toxic = combined_df['Toxic_flag'].sum()
original_non_toxic = len(combined_df) - original_toxic

# === STEP 5: Remove duplicates with priority for toxic ===
duplicates_all = combined_df[combined_df.duplicated(subset='Text', keep=False)]

def dedup_logic(group):
    if group['Toxic_flag'].any():
        return group[group['Toxic_flag'] == True].head(1)
    else:
        return group.head(1)

deduped_df = duplicates_all.groupby('Text', group_keys=False).apply(dedup_logic)

# Merge cleaned duplicates with unique entries
final_df = pd.concat([
    combined_df[~combined_df['Text'].isin(duplicates_all['Text'])],
    deduped_df
], ignore_index=True)

# === STEP 6: Remove rows containing unwanted phrases ===
# Clean and normalize the unwanted phrases just like the Text column
cleaned_unwanted = [clean_text(p) for p in unwanted_phrases]

# Build regex pattern from cleaned phrases
regex_pattern = '|'.join(map(re.escape, cleaned_unwanted))

# Match cleaned pattern on cleaned Text
to_remove = final_df['Text'].str.contains(regex_pattern, na=False)
removed_count = to_remove.sum()

final_df = final_df[~to_remove]



# === STEP 7: New counts and loss percentages ===
new_toxic = final_df['Toxic_flag'].sum()
new_non_toxic = len(final_df) - new_toxic

loss_toxic = original_toxic - new_toxic
loss_non_toxic = original_non_toxic - new_non_toxic

pct_loss_toxic = (loss_toxic / original_toxic * 100) if original_toxic else 0
pct_loss_non_toxic = (loss_non_toxic / original_non_toxic * 100) if original_non_toxic else 0

# === STEP 8: Save the cleaned dataset ===
final_df.to_csv(output_file, index=False)

# === STEP 9: Summary ===
print("üìä Summary of Toxic/Non-Toxic Records")
print(f"Before cleaning:  Toxic: {original_toxic}, Non-Toxic: {original_non_toxic}")
print(f"After cleaning:   Toxic: {new_toxic}, Non-Toxic: {new_non_toxic}")
print(f"Records removed:  Toxic: {loss_toxic} ({pct_loss_toxic:.2f}%), Non-Toxic: {loss_non_toxic} ({pct_loss_non_toxic:.2f}%)")
print(f"üßπ Removed {removed_count} records containing any of: {unwanted_phrases}")
print(f"‚úÖ Final cleaned dataset saved as '{output_file}'")



  deduped_df = duplicates_all.groupby('Text', group_keys=False).apply(dedup_logic)


üìä Summary of Toxic/Non-Toxic Records
Before cleaning:  Toxic: 11845, Non-Toxic: 44641
After cleaning:   Toxic: 11724, Non-Toxic: 44638
Records removed:  Toxic: 121 (1.02%), Non-Toxic: 3 (0.01%)
üßπ Removed 121 records containing any of: ['‡∞™‡∞ø‡∞ö‡±ç‡∞ö‡∞ø‡∞ó‡∞æ ‡∞â‡∞Ç‡∞¶‡∞ø', '‡∞®‡∞æ‡∞ü‡∞ï‡±Ä‡∞Ø‡∞Ç‡∞ó‡∞æ ‡∞â‡∞®‡±ç‡∞®‡∞æ‡∞Ø‡∞ø ', '‡∞®‡±Ä ‡∞Æ‡∞æ‡∞ü‡∞≤‡±Å ‡∞®‡∞æ‡∞ü‡∞ï‡±Ä‡∞Ø‡∞Ç‡∞ó‡∞æ']
‚úÖ Final cleaned dataset saved as '../data/ref/Synthetic/toxicity_data.csv'


In [76]:
removed_count = final_df['Text'].str.contains('‡∞®‡∞æ‡∞ü‡∞ï‡±Ä‡∞Ø‡∞Ç‡∞ó‡∞æ ‡∞â‡∞®‡±ç‡∞®‡∞æ‡∞Ø‡∞ø ', na=False).sum()
print(f"üßπ Removed {removed_count} records containing the phrase '‡∞®‡∞æ‡∞ü‡∞ï‡±Ä‡∞Ø‡∞Ç‡∞ó‡∞æ ‡∞â‡∞®‡±ç‡∞®‡∞æ‡∞Ø‡∞ø '")

üßπ Removed 0 records containing the phrase '‡∞®‡∞æ‡∞ü‡∞ï‡±Ä‡∞Ø‡∞Ç‡∞ó‡∞æ ‡∞â‡∞®‡±ç‡∞®‡∞æ‡∞Ø‡∞ø '


In [2]:
import sys
import torch

# Patch to avoid Streamlit poking into torch.classes
sys.modules['torch.classes'].__path__ = []
# print ("Torch classes path patched to avoid Streamlit interference.")
# print ("Torch version:", torch.__version__)
# print the paths to verify
print("Torch class paths:", sys.modules['torch.classes'].__path__)
# Ensure the patch works by checking if torch.classes is empty
if not sys.modules['torch.classes'].__path__:
    print("‚úÖ Torch classes path successfully patched.")
else:
    print("‚ùå Torch classes path patch failed. Current paths:", sys.modules['torch.classes'].__path__)


Torch class paths: []
‚úÖ Torch classes path successfully patched.


In [4]:
import unicodedata

def print_unicode_info(label, string):
    print(f"\n{label}: '{string}'")
    print("Codepoints:", [f"{ord(c)} ({hex(ord(c))})" for c in string])
    print("UTF-8 Bytes:", string.encode('utf-8'))
    print("Unicode Form (NFC):", unicodedata.normalize('NFC', string))
    print("Unicode Form (NFD):", unicodedata.normalize('NFD', string))

def compare_telugu_lists(list1, list2):
    for i, s1 in enumerate(list1):
        for j, s2 in enumerate(list2):
            print(f"\nüîç Comparing List1[{i}]: '{s1}' vs List2[{j}]: '{s2}'")

            # Normalize both strings
            norm_s1 = unicodedata.normalize('NFC', s1)
            norm_s2 = unicodedata.normalize('NFC', s2)

            # Comparisons
            print(f"Unicode Equality: {s1 == s2}")
            print(f"After Normalization Equality: {norm_s1 == norm_s2}")
            print(f"UTF-8 Bytes Equality: {s1.encode('utf-8') == s2.encode('utf-8')}")

            # Print deeper info
            print_unicode_info("Original S1", s1)
            print_unicode_info("Original S2", s2)

# Example inputs from two transliteration methods
list1 = ["‡∞∏‡±Ç‡∞¶‡∞ø"]  # From suudhi
list2 = ["‡∞∏‡±Ç‡∞¶‡∞ø"]  # From soodi or another transliterator

compare_telugu_lists(list1, list2)



üîç Comparing List1[0]: '‡∞∏‡±Ç‡∞¶‡∞ø' vs List2[0]: '‡∞∏‡±Ç‡∞¶‡∞ø'
Unicode Equality: True
After Normalization Equality: True
UTF-8 Bytes Equality: True

Original S1: '‡∞∏‡±Ç‡∞¶‡∞ø'
Codepoints: ['3128 (0xc38)', '3138 (0xc42)', '3110 (0xc26)', '3135 (0xc3f)']
UTF-8 Bytes: b'\xe0\xb0\xb8\xe0\xb1\x82\xe0\xb0\xa6\xe0\xb0\xbf'
Unicode Form (NFC): ‡∞∏‡±Ç‡∞¶‡∞ø
Unicode Form (NFD): ‡∞∏‡±Ç‡∞¶‡∞ø

Original S2: '‡∞∏‡±Ç‡∞¶‡∞ø'
Codepoints: ['3128 (0xc38)', '3138 (0xc42)', '3110 (0xc26)', '3135 (0xc3f)']
UTF-8 Bytes: b'\xe0\xb0\xb8\xe0\xb1\x82\xe0\xb0\xa6\xe0\xb0\xbf'
Unicode Form (NFC): ‡∞∏‡±Ç‡∞¶‡∞ø
Unicode Form (NFD): ‡∞∏‡±Ç‡∞¶‡∞ø


In [5]:
#Check if these two strings are equivalent in Unicode normalization and UTF-8 encoding
import unicodedata
def check_equivalence(s1, s2):
    norm_s1 = unicodedata.normalize('NFC', s1)
    norm_s2 = unicodedata.normalize('NFC', s2)
    
    print(f"Comparing '{s1}' and '{s2}':")
    print("Unicode Equality:", s1 == s2)
    print("After Normalization Equality:", norm_s1 == norm_s2)
    print("UTF-8 Bytes Equality:", s1.encode('utf-8') == s2.encode('utf-8'))
    print("NFC Normalized S1:", norm_s1)
    print("NFC Normalized S2:", norm_s2)
# Example strings to compare
s1 = "‡∞∏‡±Ç‡∞¶‡∞ø"  # From suudhi
s2 = "‡∞∏‡±Ç‡∞¶‡∞ø"  # From soodi or another transliterator
check_equivalence(s1, s2)

Comparing '‡∞∏‡±Ç‡∞¶‡∞ø' and '‡∞∏‡±Ç‡∞¶‡∞ø':
Unicode Equality: True
After Normalization Equality: True
UTF-8 Bytes Equality: True
NFC Normalized S1: ‡∞∏‡±Ç‡∞¶‡∞ø
NFC Normalized S2: ‡∞∏‡±Ç‡∞¶‡∞ø


In [9]:
import pandas as pd
df = pd.read_csv("C:/Users/prudh/Desktop/Toxicity_Platform/data/training/multi/dataset_multiclass.csv")
print(df['Toxic_type'].value_counts())


Toxic_type
none                    1999
profanity_generic       1000
common_insult           1000
mixed_toxicity          1000
religious_caste_slur    1000
sexual_abuse            1000
threatening              801
harassment_bullying      760
gender_targeted          670
films_fan_war            633
political_toxicity       611
Name: count, dtype: int64
