In [None]:
!pip install deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [None]:
import re
from itertools import groupby

import matplotlib.pyplot as plt
import pandas as pd
from deep_translator import GoogleTranslator
from sklearn.model_selection import train_test_split

In [None]:
# Compiled once at definition time instead of on every call.
# U+0900-U+097F is the Devanagari Unicode block.
_DEVANAGARI_PATTERN = re.compile("[\u0900-\u097F]")


def is_hindi(text):
    """Check if the text contains at least one Devanagari character.

    The check is script-based: any character in the Devanagari block counts,
    regardless of which language it is actually written in.

    Args:
        text: Value to inspect. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as containing no Hindi.

    Returns:
        bool: True if ``text`` is a string with a Devanagari character in it,
        False otherwise.
    """
    # Non-string values cannot be searched with a regex; report "no Hindi"
    # instead of raising TypeError.
    if not isinstance(text, str):
        return False
    return _DEVANAGARI_PATTERN.search(text) is not None

In [None]:
def translate_mixed_text(text, source_lang='hi', target_lang='en'):
    """
    Translate only the Hindi parts of a mixed Hindi-English text to English.

    The text is split on whitespace, and consecutive Hindi-containing words
    are grouped into one phrase that is translated in a single request.
    Translating a phrase rather than isolated words gives the translator the
    surrounding context and avoids one request per word. Words without Hindi
    characters are kept as they are.

    Args:
        text (str): Input text, possibly mixing Hindi and English words.
        source_lang (str): Language code passed to the translator as source.
        target_lang (str): Language code passed to the translator as target.

    Returns:
        str: The words/phrases joined by single spaces (original whitespace
        is not preserved), with the Hindi runs replaced by their translation.
    """
    translator = None  # created lazily: text without Hindi never needs one
    pieces = []
    for contains_hindi, run in groupby(text.split(), key=is_hindi):
        phrase = ' '.join(run)
        if contains_hindi:
            if translator is None:
                translator = GoogleTranslator(source=source_lang, target=target_lang)
            # NOTE(review): the translator limits the length of a single
            # request; very long Hindi runs may need chunking - confirm.
            # Keep the source phrase if nothing usable comes back, so the
            # final join never sees None.
            phrase = translator.translate(phrase) or phrase
        pieces.append(phrase)
    return ' '.join(pieces)

In [None]:
from sklearn.model_selection import train_test_split

def stratified_split(data, train_size=0.7, valid_size=0.15, test_size=0.15, random_state=42):
    """
    Splits the dataset into train, validation, and test sets with stratified class distribution,
    maintaining 'Category' along with 'text' and 'label'.

    The split is done in two stages, both stratified on 'label': first train
    versus a holdout (validation + test), then the holdout into validation
    and test.

    Args:
        data: Iterable of dicts with 'text', 'label' and 'Category' keys.
        train_size (float): Fraction of the data for the train set.
        valid_size (float): Fraction of the data for the validation set.
        test_size (float): Fraction of the data for the test set.
        random_state (int): Seed passed to both ``train_test_split`` calls.

    Returns:
        tuple: ``(train_data, valid_data, test_data)``, each a list of dicts
        with 'text', 'label' and 'Category' keys.

    Raises:
        ValueError: If the three sizes do not sum to 1.0, or if any of them
            is not strictly positive.
    """
    if abs(train_size + valid_size + test_size - 1.0) > 1e-6:
        raise ValueError("Train, valid, and test sizes must sum to 1.0.")
    # A zero or negative fraction would otherwise surface later as a
    # ZeroDivisionError or an opaque error from train_test_split.
    if min(train_size, valid_size, test_size) <= 0:
        raise ValueError("Train, valid, and test sizes must all be greater than 0.")

    # Work on (text, label, category) tuples so the three fields travel together.
    records = [(entry['text'], entry['label'], entry['Category']) for entry in data]
    holdout_size = valid_size + test_size

    # First split into train and holdout (valid + test)
    train_records, holdout_records = train_test_split(
        records,
        test_size=holdout_size,
        stratify=[label for _, label, _ in records],
        random_state=random_state
    )

    # Then split the holdout; test_size is rescaled to be relative to the holdout
    valid_records, test_records = train_test_split(
        holdout_records,
        test_size=test_size / holdout_size,
        stratify=[label for _, label, _ in holdout_records],
        random_state=random_state
    )

    def _as_dicts(rows):
        # Format tuples back into dictionaries
        return [{'text': text, 'label': label, 'Category': category} for text, label, category in rows]

    return _as_dicts(train_records), _as_dicts(valid_records), _as_dicts(test_records)


In [None]:
def plot_class_distributions(train_data, valid_data, test_data):
    """Plot class distributions in train, validation, and test sets.

    Draws one histogram of the 'label' values per split, side by side, and
    shows the figure. All panels share the same bins, which always cover
    labels 0 and 1 and are widened when other labels are present.

    Args:
        train_data: Iterable of dicts with a 'label' key (train split).
        valid_data: Iterable of dicts with a 'label' key (validation split).
        test_data: Iterable of dicts with a 'label' key (test split).

    Note:
        Labels are assumed to be integers (the bins are built with ``range``).
    """
    splits = [("Train", train_data), ("Validation", valid_data), ("Test", test_data)]
    labels_by_split = [(name, [entry['label'] for entry in entries]) for name, entries in splits]

    # Derive the bin range from the labels actually present so that classes
    # beyond 0/1 get their own bar instead of being merged into the last bin.
    # Seeding with 0 and 1 keeps the binary layout and tolerates empty splits.
    all_labels = [label for _, labels in labels_by_split for label in labels]
    lowest = min(all_labels + [0])
    highest = max(all_labels + [1])
    bin_edges = range(lowest, highest + 2)  # one unit-wide bin per label value
    ticks = list(range(lowest, highest + 1))

    fig, axes = plt.subplots(1, len(labels_by_split), figsize=(12, 4))
    for ax, (name, labels) in zip(axes, labels_by_split):
        # align='left' centres each bar on its integer label
        ax.hist(labels, bins=bin_edges, align='left', rwidth=0.8, color='skyblue', alpha=0.7)
        ax.set_title(f"{name} Set")
        ax.set_xlabel("Label")
        ax.set_ylabel("Count")
        ax.set_xticks(ticks)
    fig.tight_layout()
    plt.show()

In [None]:
def save_to_csv(data, filename):
    """Write ``data`` to ``filename`` as CSV.

    ``data`` is turned into a DataFrame (for a list of dicts, the dict keys
    become the columns) and written without the index column.
    """
    frame = pd.DataFrame(data)
    frame.to_csv(filename, index=False)

In [None]:
def process_excel(input_excel):
    """
    Process the input Excel file to extract the 'Text', 'label' and
    'Category' columns and format the data as a list of dictionaries.

    Args:
        input_excel (str): Path to the input Excel file.

    Returns:
        list: One dictionary per row with 'label' (converted to int),
        'text' (taken from the 'Text' column) and 'Category' keys.

    Raises:
        KeyError: If any of the 'label', 'Text' or 'Category' columns is
            missing from the sheet.
    """
    # Read the Excel file
    df = pd.read_excel(input_excel)

    # Fail early with a clear message if the sheet lacks a required column,
    # including the case of an empty sheet.
    missing = [column for column in ('label', 'Text', 'Category') if column not in df.columns]
    if missing:
        raise KeyError(f"{input_excel!r} is missing required column(s): {missing}")

    # Iterate the three columns in lockstep; this avoids building a Series
    # per row as iterrows() does.
    return [
        {'label': int(label), 'text': text, 'Category': category}
        for label, text, category in zip(df['label'], df['Text'], df['Category'])
    ]

In [None]:
# Input workbooks: one with the safe examples, one with the unsafe examples.
safe_file = '/content/Safe Text.xlsx'
unsafe_file = '/content/Unsafe Text.xlsx'

# Despite the `_df` suffix, process_excel returns plain lists of record dicts.
safe_df, unsafe_df = (process_excel(path) for path in (safe_file, unsafe_file))

# Safe records first, then unsafe records.
data = [*safe_df, *unsafe_df]

print(f"Total records: {len(data)}")
print("Sample combined data:", data[:5])

Total records: 300
Sample combined data: [{'label': 0, 'text': 'Wow, this is really inspiring! Your journey is a testament to hard work and perseverance. Thank you for sharing such personal insights with us!', 'Category': 'Optimism'}, {'label': 0, 'text': 'बहुत बढ़िया अंतर्दृष्टि! मैंने आज भौतिकी के बारे में कुछ नया सीखा। यथास्थिति को चुनौती देने वाले नए परिप्रेक्ष्य को देखना हमेशा ताज़ा होता है।', 'Category': 'Learning'}, {'label': 0, 'text': 'This post truly resonated with me. The way you articulate your thoughts makes complex ideas so much more accessible. Keep sharing your wisdom!', 'Category': 'Learning'}, {'label': 0, 'text': 'आपका दृष्टिकोण सचमुच आंखें खोल देने वाला है। ये वार्तालाप होना बहुत महत्वपूर्ण है, और आप इन्हें सुविधाजनक बनाने का उत्कृष्ट कार्य कर रहे हैं।', 'Category': 'Optimism'}, {'label': 0, 'text': 'Such a well-written piece! I appreciate the depth of research you put into this. It shows how passionate you are about your work, and it inspires others to dig deeper t

In [None]:
# Step 3: Translate every record whose text contains Hindi; other records
# keep their text exactly as it was read.
translated_data = []
for entry in data:
    text = entry['text']
    translated_text = translate_mixed_text(text) if is_hindi(text) else text
    translated_data.append(
        dict(text=translated_text, label=entry['label'], Category=entry['Category'])
    )

print("Sample translated data:", translated_data[:5])

Sample translated data: [{'text': 'Wow, this is really inspiring! Your journey is a testament to hard work and perseverance. Thank you for sharing such personal insights with us!', 'label': 0, 'Category': 'Optimism'}, {'text': 'Very Excellent insight! I Today Physics Of About In Some? New Learned it. status quo To challenge to give ones New Perspective To Look Always fresh Would Is.', 'label': 0, 'Category': 'Learning'}, {'text': 'This post truly resonated with me. The way you articulate your thoughts makes complex ideas so much more accessible. Keep sharing your wisdom!', 'label': 0, 'Category': 'Learning'}, {'text': 'Yours Approach Really Eyes Shell to give gonna Is. These Conversation Happen Very Important Is, And You These convenient create Of Excellent Work Tax are Are.', 'label': 0, 'Category': 'Optimism'}, {'text': 'Such a well-written piece! I appreciate the depth of research you put into this. It shows how passionate you are about your work, and it inspires others to dig deepe

In [None]:
# Step 4: Split the dataset into train, validation, and test sets
train_data, valid_data, test_data = stratified_split(translated_data)

# Sanity check: every record must land in exactly one of the three splits.
assert len(train_data) + len(valid_data) + len(test_data) == len(translated_data), \
    "Split sizes do not add up to the size of the full dataset."

print(f"Size of Train Set: {len(train_data)}")
print(f"Size of Validation Set: {len(valid_data)}")
print(f"Size of Test Set: {len(test_data)}")

# Step 5: Check visually that the label distribution is similar across splits
plot_class_distributions(train_data, valid_data, test_data)

Size of Train Set: 210
Size of Validation Set: 45
Size of Test Set: 45


In [None]:
# Step 6: Save datasets to CSV files (one file per split, written to the
# current working directory)
for split_rows, csv_name in (
    (train_data, "text_train_set.csv"),
    (valid_data, "text_valid_set.csv"),
    (test_data, "text_test_set.csv"),
):
    save_to_csv(split_rows, csv_name)