In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import re

# Load datasets
data = pd.read_csv("/Users/soubantiksengupta/Downloads/Bangla-Text-Dataset-main/dataset.csv")
factwatch_data = pd.read_csv("/Users/soubantiksengupta/Downloads/FactWatch-Data.csv")
toxlex_data = pd.read_csv("/Users/soubantiksengupta/Downloads/ToxLex_bn-output-v1-29-January-2022.csv")

# Define text cleaning and preprocessing function
def preprocess_text(text):
    text = re.sub(r'[^\u0980-\u09FF]+', ' ', str(text))  # Keep Bengali chars
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Function to map labels for all datasets
def map_labels(label, dataset='original'):
    label = str(label)  # Ensure label is a string
    if dataset == 'original':
        return 'sincere' if label in ['sexual', 'not bully'] else 'ironical'
    elif dataset == 'factwatch':
        return 'sincere' if 'serious_condition' in label else 'ironical'
    elif dataset == 'toxlex':
        return 'sincere' if label == 'High' else 'ironical'

# Preprocess and map labels
data['processed_comment'] = data['comment'].apply(preprocess_text)
data['mapped_label'] = data['label'].apply(map_labels)

# Preprocessing new datasets
factwatch_data['processed_content'] = factwatch_data['Post_Content_Summary'].apply(preprocess_text)
factwatch_data['mapped_label'] = factwatch_data['Category'].apply(lambda x: map_labels(x, 'factwatch'))

toxlex_data['processed_content'] = toxlex_data['Base_bigram'].apply(preprocess_text)
toxlex_data['mapped_label'] = toxlex_data['Degree_of_toxicity'].apply(lambda x: map_labels(x, 'toxlex'))

# Combine all datasets
combined_data = pd.concat([
    data[['processed_comment', 'mapped_label']],
    factwatch_data[['processed_content', 'mapped_label']].rename(columns={'processed_content': 'processed_comment'}),
    toxlex_data[['processed_content', 'mapped_label']].rename(columns={'processed_content': 'processed_comment'})
])

# Check for and remove any NaN values in 'mapped_label'
combined_data = combined_data.dropna(subset=['mapped_label'])

# Feature extraction and model training
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(combined_data['processed_comment'])
y = combined_data['mapped_label']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

# Function to classify new user input
def classify_bengali_text(text):
    processed_text = preprocess_text(text)
    vectorized_text = tfidf_vectorizer.transform([processed_text])
    prediction = model.predict(vectorized_text)
    return prediction[0]

# Evaluate the model
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

# Example of classifying a new input
user_input = input("Enter a Bengali comment to classify: ")
classification_result = classify_bengali_text(user_input)
print(f"The comment is classified as: {classification_result}")

              precision    recall  f1-score   support

    ironical       0.70      0.63      0.66      4503
     sincere       0.69      0.75      0.72      4893

    accuracy                           0.69      9396
   macro avg       0.69      0.69      0.69      9396
weighted avg       0.69      0.69      0.69      9396

Enter a Bengali comment to classify: আমাদের সম্প্রদায়ের জরুরী সমস্যাগুলি চিন্তামূলক বিবেচনা এবং নিশ্চিত পদক্ষেপের মাধ্যমে কঠোরভাবে অবগত করা উচিত|
The comment is classified as: sincere


In [19]:
import pandas as pd

# Load datasets
data = pd.read_csv("/Users/soubantiksengupta/Downloads/Bangla-Text-Dataset-main/dataset.csv")
factwatch_data = pd.read_csv("/Users/soubantiksengupta/Downloads/FactWatch-Data.csv")
toxlex_data = pd.read_csv("/Users/soubantiksengupta/Downloads/ToxLex_bn-output-v1-29-January-2022.csv")

# Print the first few rows of each dataset to understand their structure
print("Dataset 1 - General Social Media Comments:")
print(data.head())
print("\nColumns:", data.columns)

print("\nDataset 2 - FactWatch Data:")
print(factwatch_data.head())
print("\nColumns:", factwatch_data.columns)

print("\nDataset 3 - ToxLex Data:")
print(toxlex_data.head())
print("\nColumns:", toxlex_data.columns)

Dataset 1 - General Social Media Comments:
                                             comment    Category  Gender  \
0  ওই হালার পুত এখন কি মদ খাওয়ার সময় রাতের বেলা...       Actor  Female   
1  ঘরে বসে শুট করতে কেমন লেগেছে? ক্যামেরাতে কে ছি...      Singer    Male   
2                       অরে বাবা, এই টা কোন পাগল????       Actor  Female   
3                              ক্যাপ্টেন অফ বাংলাদেশ      Sports    Male   
4                                           পটকা মাছ  Politician    Male   

   comment react number      label  
0                   1.0     sexual  
1                   2.0  not bully  
2                   2.0  not bully  
3                   0.0  not bully  
4                   0.0      troll  

Columns: Index(['comment', 'Category', 'Gender', 'comment react number', 'label'], dtype='object')

Dataset 2 - FactWatch Data:
   Post_ID                                         Post_Title  \
0      1.0  পঙ্গু হাসপাতালে কি কৃত্রিম পা সংযোজনের ক্যাম্প...   
1      2.0  কুলাউড়া