In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.combine import SMOTETomek
from collections import Counter
import pandas as pd

# Build one SMOTE-Tomek-balanced TF-IDF dataset per toxicity label.
#
# Resampling changes the number of rows (SMOTE appends synthetic TF-IDF
# vectors, Tomek-link cleaning removes some samples) and every label is
# resampled independently. The resampled labels therefore cannot be stored
# next to the original `comment_text` rows: pandas would align them on the
# index and silently truncate/shift them. Each label instead gets its own
# output file holding the resampled feature matrix together with its labels.

SEED = 42            # fixed seed so the resampling is reproducible
CHUNK_ROWS = 20_000  # rows densified at a time when writing the CSV files

# Load your dataset
train_data = pd.read_csv(r'D:\major_project\comment_classification\train.csv')

# Extract the features (comment_text) and the target labels (toxic, etc.)
# Missing comments are replaced by empty strings so the vectorizer does not fail on NaN.
X = train_data['comment_text'].fillna('')
y_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Apply TF-IDF vectorization to the comment_text (optimize max_features to improve speed and memory)
tfidf = TfidfVectorizer(max_features=500)  # Adjust max_features based on your data size and memory
X_tfidf = tfidf.fit_transform(X)

# Feature column names ordered by matrix column index. The prefix keeps a
# vocabulary term such as "toxic" from colliding with a label column name.
feature_names = [
    f'tfidf_{term}'
    for term in sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)
]

# Apply SMOTE-Tomek for each label individually
smote_tomek = SMOTETomek(random_state=SEED, n_jobs=-1)  # Utilize multiple processors for faster resampling
balanced_distributions = {}
for label in y_columns:
    # Resample the data using SMOTE-Tomek
    X_res, y_res = smote_tomek.fit_resample(X_tfidf, train_data[label])
    y_values = pd.Series(y_res).to_numpy()

    # Save the balanced dataset for this label. The sparse matrix is
    # densified chunk by chunk to keep peak memory bounded.
    output_path = f'balanced_dataset_ST_{label}.csv'
    for start in range(0, X_res.shape[0], CHUNK_ROWS):
        stop = start + CHUNK_ROWS
        chunk = pd.DataFrame(X_res[start:stop].toarray(), columns=feature_names)
        chunk[label] = y_values[start:stop]
        chunk.to_csv(
            output_path,
            mode='w' if start == 0 else 'a',
            header=(start == 0),
            index=False,
        )

    # Record the class counts of the resampled labels themselves
    balanced_distributions[label] = Counter(y_values.tolist())

# Check the distribution after balancing
for label in y_columns:
    print(f'Distribution for {label}: {balanced_distributions[label]}')




In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd

# Build one SMOTE-Tomek-balanced TF-IDF dataset per toxicity label.
#
# Resampling changes the number of rows (SMOTE appends synthetic TF-IDF
# vectors, Tomek-link cleaning removes some samples) and every label is
# resampled independently. The resampled labels therefore cannot be stored
# next to the original `comment_text` rows: pandas would align them on the
# index and silently truncate/shift them. Each label instead gets its own
# output file holding the resampled feature matrix together with its labels.
from sklearn.neighbors import NearestNeighbors

SEED = 42            # fixed seed so the resampling is reproducible
CHUNK_ROWS = 20_000  # rows densified at a time when writing the CSV files

# Load your dataset (adjust the file path accordingly)
train_data = pd.read_csv(r'D:\major_project\comment_classification\train.csv')

# Extract the features (comment_text) and the target labels (toxic, etc.)
# Missing comments are replaced by empty strings so the vectorizer does not fail on NaN.
X = train_data['comment_text'].fillna('')
y_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Apply TF-IDF vectorization to the comment_text
# Adjust max_features based on available memory (fewer features = faster, less memory)
tfidf = TfidfVectorizer(max_features=500)  # You can lower this if memory is an issue
X_tfidf = tfidf.fit_transform(X)

# Feature column names ordered by matrix column index. The prefix keeps a
# vocabulary term such as "toxic" from colliding with a label column name.
feature_names = [
    f'tfidf_{term}'
    for term in sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)
]

# Initialize SMOTE-Tomek
# SMOTE's own `n_jobs` argument is deprecated in imbalanced-learn; parallelism
# is requested through the neighbours estimator instead. n_neighbors=6 matches
# the default k_neighbors=5, because SMOTE discards the sample itself from the
# neighbour list.
smote = SMOTE(
    k_neighbors=NearestNeighbors(n_neighbors=6, n_jobs=-1),
    random_state=SEED,
)
smote_tomek = SMOTETomek(sampling_strategy='auto', smote=smote, random_state=SEED)

# Apply SMOTE-Tomek for each label individually
balanced_distributions = {}
for label in y_columns:
    print(f"Balancing label: {label}")

    # Resample the data using SMOTE-Tomek for each label
    X_res, y_res = smote_tomek.fit_resample(X_tfidf, train_data[label])
    y_values = pd.Series(y_res).to_numpy()

    # Save the balanced dataset for this label (output file). The sparse
    # matrix is densified chunk by chunk to keep peak memory bounded.
    output_path = f'balanced_dataset_11_{label}.csv'
    for start in range(0, X_res.shape[0], CHUNK_ROWS):
        stop = start + CHUNK_ROWS
        chunk = pd.DataFrame(X_res[start:stop].toarray(), columns=feature_names)
        chunk[label] = y_values[start:stop]
        chunk.to_csv(
            output_path,
            mode='w' if start == 0 else 'a',
            header=(start == 0),
            index=False,
        )

    # Record the class counts of the resampled labels themselves
    balanced_distributions[label] = Counter(y_values.tolist())

# Check the distribution after balancing for each label
for label in y_columns:
    print(f'Distribution for {label}: {balanced_distributions[label]}')


Balancing label: toxic


