In [None]:
 !pip install pandas
 !pip install openpyxl  # For Excel files
 !pip install beautifulsoup4
 !pip install nltk
 !pip install scikit-learn
 !pip install bnlp-toolkit  # Bengali NLP tools (if available)
 !pip install emoji  # For emoji handling

import pandas as pd
import string
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import emoji
import os

Collecting bnlp-toolkit
  Downloading bnlp_toolkit-4.0.3-py3-none-any.whl.metadata (3.3 kB)
Collecting gensim==4.3.2 (from bnlp-toolkit)
  Downloading gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Collecting scipy==1.10.1 (from bnlp-toolkit)
  Downloading scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sklearn-crfsuite==0.3.6 (from bnlp-toolkit)
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting tqdm==4.66.3 (from bnlp-toolkit)
  Downloading tqdm-4.66.3-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ftfy==6.2.0 (from bnlp-toolkit)
  Downloading ftfy-6.2.0-py3-none-any.whl.metadata (7.3 kB)
Collecting emoji==1.7

In [None]:
# Before running this cell, run the install/import cell above so the required packages are available.

# Try to import Bengali NLP tools (if available)
#
# The rest of the script relies on three names defined here:
#   BENGALI_NLP_AVAILABLE - True when a Bengali stopword list could be loaded
#   bengali_stopwords     - set of stopwords (only defined when available)
#   bengali_stemmer       - object exposing .stem(word) (only defined when available)
def load_bengali_stopwords():
    """Return the Bengali stopword list shipped with bnlp-toolkit as a set.

    Two package layouts are tried, newest first, so the lookup does not
    silently fail when the installed release exposes the list under a
    different name. The value may be a plain list or a zero-argument
    callable returning one; both are accepted.

    Raises:
        ImportError: if neither layout can be imported.
    """
    try:
        # Layout documented for bnlp-toolkit 4.x.
        from bnlp import BengaliCorpus
        words = BengaliCorpus.stopwords
    except (ImportError, AttributeError):
        # Layout the original cell expected (module-level object in bnlp.corpus).
        from bnlp.corpus import stopwords as words
    return set(words() if callable(words) else words)


class _PassThroughStemmer:
    """Stand-in used when no Bengali stemmer can be imported; returns words unchanged."""

    def stem(self, word):
        return word


try:
    bengali_stopwords = load_bengali_stopwords()
    BENGALI_NLP_AVAILABLE = True
except (ImportError, AttributeError, TypeError) as exc:
    BENGALI_NLP_AVAILABLE = False
    print("Bengali NLP tools not available. Using basic preprocessing only.")
    print(f"  (reason: {exc})")

if BENGALI_NLP_AVAILABLE:
    # A missing stemmer must not disable stopword removal as well, so it is
    # probed separately and replaced by a no-op when it cannot be imported.
    # NOTE(review): later steps still label their output "Stemmed" in that case.
    try:
        from bnlp.stemmer import BanglaStemmer
        bengali_stemmer = BanglaStemmer()
    except (ImportError, AttributeError):
        bengali_stemmer = _PassThroughStemmer()
        print("Bengali stemmer not available; tokens will be left unstemmed.")
    print("Bengali NLP tools loaded successfully.")

# Download NLTK resources
# NOTE(review): the visible pipeline tokenizes with str.split() and never uses
# punkt; the download is kept in case other cells rely on it.
nltk.download('punkt')

# Load Bangla Cyberbullying dataset (XLSX format)
print("Loading XLSX dataset...")
excel_file = "/content/Bangla Cyberbullying Dataset.xlsx"  # Make sure this matches your file name
if not os.path.exists(excel_file):
    # Fall back to a workbook in the current directory. The candidates are
    # sorted so the choice is reproducible (os.listdir order is arbitrary),
    # the extension check ignores case, and two kinds of file are skipped:
    #   - "~$..." lock files that Excel leaves next to an open workbook
    #   - the cleaned workbook this script itself writes, which would otherwise
    #     be re-read as input on a second run
    generated_outputs = {"bangla_cyberbullying_cleaned.xlsx"}
    excel_files = sorted(
        f for f in os.listdir('.')
        if f.lower().endswith('.xlsx')
        and not f.startswith('~$')
        and f not in generated_outputs
    )
    if excel_files:
        excel_file = excel_files[0]
        print(f"Using found Excel file: {excel_file}")
    else:
        raise FileNotFoundError("No Excel file found. Please ensure your dataset is in the current directory.")

df = pd.read_excel(excel_file)

# Initialize log_steps for storing steps
log_steps = []

# Check the column names in your dataset
print("Columns in the dataset:", df.columns.tolist())

# Fail early with a clear message; otherwise the sample lookup below would
# die with an IndexError on an empty sheet.
if df.empty:
    raise ValueError("The dataset has no rows or no columns; nothing to preprocess.")


def _normalized_header(col):
    """Return a header as trimmed lower-case text; tolerates non-string labels."""
    return str(col).strip().lower()


# For this cyberbullying dataset, determine which column contains the text
# Common names might be 'text', 'comment', 'content', etc.
text_header_names = ('text', 'comment', 'content', 'message', 'post')
# Same names the category analysis further down treats as label columns, so a
# column called e.g. 'bullying' is never mistaken for the text column.
label_header_names = ('label', 'class', 'target', 'category', 'cyberbullying', 'bullying', 'bully')

content_columns = [col for col in df.columns if _normalized_header(col) in text_header_names]
if content_columns:
    content_column = content_columns[0]
    print(f"Using '{content_column}' as the text content column")
else:
    # If no standard column name is found, use the first non-label column as a guess
    non_label_cols = [col for col in df.columns if _normalized_header(col) not in label_header_names]
    if non_label_cols:
        content_column = non_label_cols[0]
        print(f"No standard text column found. Using '{content_column}' as the text content column")
    else:
        content_column = df.columns[0]
        print(f"Using first column '{content_column}' as the text content column")

# Sample validation - check if we have actual text content
sample_text = df[content_column].astype(str).iloc[0]
if len(sample_text.strip()) < 5:  # Very short text may indicate wrong column
    print(f"WARNING: Selected content column '{content_column}' may not contain text (first sample is very short)")

# Example text for logging
example_text = sample_text

def log_step(title, content, max_chars=1000):
    """Append one formatted entry for a preprocessing step to ``log_steps``.

    Args:
        title: Step name shown in the entry header.
        content: Output of the step. Converted with ``str()`` so lists and
            other objects are logged as text.
        max_chars: Maximum number of characters of ``content`` to keep.

    The trailing "..." is added only when the content was actually cut, so a
    short entry is no longer presented as if it had been truncated.
    """
    text = str(content)
    ellipsis = "..." if len(text) > max_chars else ""
    log_steps.append(f"\n--- {title} ---\n{text[:max_chars]}{ellipsis}\n")

# Pattern shared by the walkthrough below and by the full-dataset cleaner.
url_pattern = r'https?://\S+|www\.\S+'

# Record the untouched example first so each later entry can be compared with it.
log_step("Original Text", example_text)

# Step 1: HTML Parsing - keep only the text content of any markup
example_soup = BeautifulSoup(example_text, "html.parser")
html_parsed = example_soup.get_text()
log_step("HTML Parsed", html_parsed)

# Step 2: Remove URLs
no_urls = re.sub(url_pattern, '', html_parsed)
log_step("URLs Removed", no_urls)

# Step 3: Remove emojis
def remove_emoji(text):
    """Return ``text`` after passing it through ``emoji.replace_emoji`` with an empty replacement."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

# Apply the emoji filter to the URL-free example text and record the result
# as the next entry of the processing log.
no_emoji = remove_emoji(no_urls)
log_step("Emojis Removed", no_emoji)

# Step 4: Remove special characters (keeping Bangla Unicode range)
# Bangla Unicode range: \u0980-\u09FF
def remove_special_chars(text):
    """Strip everything except Bangla characters, digits and whitespace.

    Args:
        text: Input string.

    Returns:
        The string with each run of disallowed characters replaced by a single
        space. Zero-width joiner / non-joiner characters are deleted instead.

    Replacing rather than deleting keeps neighbouring words apart: text such
    as "ভাই,তুই" (no space after the comma) now yields two tokens instead of
    one fused token.
    """
    # ZWJ / ZWNJ occur inside Bangla words, so removing them must not
    # introduce a break in the middle of a word.
    text = re.sub(r'[\u200c\u200d]', '', text)
    # Keep Bangla characters, digits, and spaces; anything else becomes a separator.
    return re.sub(r'[^\u0980-\u09FF\s\d]+', ' ', text)

no_special = remove_special_chars(no_emoji)
log_step("Special Characters Removed", no_special)

# Step 5: Tokenization (simple space-based for Bangla)
tokens = no_special.split()
log_step("Tokenized", f"{tokens[:100]}...")  # Only show first 100 tokens

# Steps 6 and 7: stopword removal followed by stemming. Both depend on the
# Bengali NLP tools, so one availability check covers the pair.
if not BENGALI_NLP_AVAILABLE:
    # Neither step can run: pass the tokens through and note the skips in the log.
    filtered = tokens
    log_step("Stopwords Removal (Skipped)", "Bengali stopwords list not available")
    final_text = ' '.join(filtered)
    log_step("Stemming (Skipped)", "Bengali stemmer not available")
else:
    # Step 6: drop every token found in the stopword set
    filtered = [token for token in tokens if token not in bengali_stopwords]
    log_step("Stopwords Removed", f"{filtered[:100]}...")  # Only show first 100 tokens

    # Step 7: stem the surviving tokens and join them back into one string
    stemmed = list(map(bengali_stemmer.stem, filtered))
    final_text = ' '.join(stemmed)
    log_step("Stemmed", final_text[:1000] + "...")  # Only show first 1000 chars

# Clean function for full dataset
def clean_bangla_text(text):
    """Clean one dataset entry and return its tokens joined by single spaces.

    The value is converted with ``str()``, then passed through HTML text
    extraction, URL removal, emoji removal and special-character removal, and
    finally split on whitespace. When ``BENGALI_NLP_AVAILABLE`` is true the
    tokens are additionally filtered against ``bengali_stopwords`` and stemmed
    with ``bengali_stemmer``.
    """
    markup_free = BeautifulSoup(str(text), "html.parser").get_text()
    url_free = re.sub(url_pattern, '', markup_free)
    bangla_only = remove_special_chars(remove_emoji(url_free))

    words = bangla_only.split()

    if BENGALI_NLP_AVAILABLE:
        # Stopword filtering runs over the whole list before any stemming.
        kept = [w for w in words if w not in bengali_stopwords]
        words = [bengali_stemmer.stem(w) for w in kept]

    return ' '.join(words)

# Apply cleaning: every entry of the text column is cast to str and cleaned,
# and the result is stored next to the original in a new column.
print("Cleaning text data...")
raw_entries = df[content_column].astype(str)
df["cleaned_text"] = raw_entries.apply(clean_bangla_text)

# Save the cleaned dataset twice: CSV for broader compatibility, then Excel.
print("Saving cleaned dataset...")
cleaned_outputs = (
    (df.to_csv, "bangla_cyberbullying_cleaned.csv"),
    (df.to_excel, "bangla_cyberbullying_cleaned.xlsx"),
)
for save, target in cleaned_outputs:
    save(target, index=False)

# Limit dataset size to avoid memory issues
max_samples = min(10000, len(df))  # Adjust based on your system's memory
print(f"Using {max_samples} samples for TF-IDF analysis...")
df_sample = df["cleaned_text"].dropna().iloc[:max_samples]

# TF-IDF vectorization (optimized)
print("Performing TF-IDF vectorization...")
# The cleaned text is already a space-separated token string, so tokens are
# taken as runs of non-whitespace. The vectorizer's default pattern is built on
# \w, which does not match Bangla dependent vowel signs or the hasanta
# (combining marks); with it, words are cut into fragments at those characters
# and the resulting "features" are not real words.
vectorizer = TfidfVectorizer(
    max_features=1000,  # Limit features to reduce memory
    token_pattern=r"(?u)\S+",
)
tfidf_matrix = vectorizer.fit_transform(df_sample)
feature_names = vectorizer.get_feature_names_out()

# Calculate average TF-IDF weights
avg_weights = tfidf_matrix.mean(axis=0).A1  # Convert to 1D array
feature_scores = pd.Series(avg_weights, index=feature_names).sort_values(ascending=False)

# Get top 30 words and bottom 30 words
top_features = feature_scores.head(30)
bottom_features = feature_scores.tail(30)

# Write processing log: the list of cleaning steps, the logged example
# walkthrough, and a short TF-IDF summary.
print("Writing processing log...")
with open("processing_log.txt", "w", encoding="utf-8") as f:
    log_parts = [
        "==== TEXT CLEANING LOG ====\n",
        "Steps Applied to Each Text Entry:\n",
        "1. HTML Parsing\n",
        "2. URL Removal\n",
        "3. Emoji Removal\n",
        "4. Special Character Removal (preserving Bangla Unicode)\n",
        "5. Tokenization\n",
    ]
    # Steps 6 and 7 are reported as skipped when the Bengali tools were not loaded.
    if BENGALI_NLP_AVAILABLE:
        log_parts += [
            "6. Stopword Removal\n",
            "7. Stemming\n",
        ]
    else:
        log_parts += [
            "6. Stopword Removal (SKIPPED - tools not available)\n",
            "7. Stemming (SKIPPED - tools not available)\n",
        ]
    log_parts += [
        "\n==== Example Text Processing ====\n",
        '\n'.join(log_steps),
        "\n\n==== TF-IDF SUMMARY ====\n",
        f"TF-IDF Matrix Shape: {tfidf_matrix.shape}\n",
        "Top 30 TF-IDF features:\n",
        top_features.to_string(),
    ]
    f.writelines(log_parts)

# Write TF-IDF summary: matrix shape plus the highest- and lowest-weighted features.
print("Writing TF-IDF summary...")
with open("tfidf_summary.txt", "w", encoding="utf-8") as f:
    summary_parts = [
        f"TF-IDF Matrix Shape: {tfidf_matrix.shape}\n\n",
        "Top 30 TF-IDF features by average weight:\n",
        top_features.to_string(),
        "\n\n",
        "Bottom 30 TF-IDF features by average weight:\n",
        bottom_features.to_string(),
    ]
    f.writelines(summary_parts)

# Additional analysis: Separate bullying and non-bullying (if labels exist)
# Look for typical label column names
# (headers are compared as trimmed lower-case text so that non-string or
# padded column labels do not break the lookup)
label_columns = [
    col for col in df.columns
    if str(col).strip().lower() in ['label', 'class', 'target', 'category', 'cyberbullying', 'bullying', 'bully']
]

if label_columns:
    label_column = label_columns[0]
    print(f"Performing analysis by {label_column} category...")

    # Get unique categories
    categories = df[label_column].unique()

    with open("category_analysis.txt", "w", encoding="utf-8") as f:
        f.write("==== CATEGORY-BASED ANALYSIS ====\n\n")

        for category in categories:
            f.write(f"Category: {category}\n")

            # Get samples for this category
            category_samples = df[df[label_column] == category]["cleaned_text"].dropna()
            if len(category_samples) == 0:
                f.write(f"No samples found for category '{category}'\n\n")
                continue

            # Limit samples to avoid memory issues
            category_samples = category_samples.iloc[:5000]

            # TF-IDF for this category. Tokens are runs of non-whitespace: the
            # default \w-based pattern does not match Bangla vowel signs or the
            # hasanta and would cut words into fragments.
            cat_vectorizer = TfidfVectorizer(max_features=500, token_pattern=r"(?u)\S+")
            try:
                cat_tfidf = cat_vectorizer.fit_transform(category_samples)
            except ValueError:
                # Raised by scikit-learn for an empty vocabulary, i.e. every
                # cleaned text in this category is empty. Report it and move
                # on instead of aborting the whole analysis.
                f.write(f"No usable tokens found for category '{category}'\n\n")
                continue
            cat_features = cat_vectorizer.get_feature_names_out()

            # Calculate average weights
            cat_weights = cat_tfidf.mean(axis=0).A1
            cat_scores = pd.Series(cat_weights, index=cat_features).sort_values(ascending=False)

            # Write top 20 features for this category
            f.write(f"Top 20 TF-IDF features for category '{category}':\n")
            f.write(cat_scores.head(20).to_string())
            f.write("\n\n")

    print("- category_analysis.txt")
else:
    print("No label column identified for category analysis")

# Final summary: list the output files written by the steps above.
print("✅ All files generated:")
for generated_file in (
    "bangla_cyberbullying_cleaned.csv",
    "bangla_cyberbullying_cleaned.xlsx",
    "processing_log.txt",
    "tfidf_summary.txt",
):
    print(f"- {generated_file}")

punkt not found. downloading...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Bengali NLP tools not available. Using basic preprocessing only.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Loading XLSX dataset...
Columns in the dataset: ['No.', 'Text', 'Label']
Using 'Text' as the text content column
Cleaning text data...
Saving cleaned dataset...
Using 10000 samples for TF-IDF analysis...
Performing TF-IDF vectorization...
Writing processing log...
Writing TF-IDF summary...
Performing analysis by Label category...
- category_analysis.txt
✅ All files generated:
- bangla_cyberbullying_cleaned.csv
- bangla_cyberbullying_cleaned.xlsx
- processing_log.txt
- tfidf_summary.txt
