In [7]:
import pandas as pd

# Load the dataset in chunks to handle errors
file_path = "/content/all_data.csv"  # Path to your uploaded file
output_file_path = "/content/processed_data.csv"  # Output path for cleaned data

# Open a new file to save the processed data
with open(output_file_path, "w", encoding="utf-8") as output_file:
    # Read the file in chunks
    for chunk in pd.read_csv(
        file_path,
        chunksize=10000,  # Process 10,000 rows at a time
        on_bad_lines="skip",  # Skip bad lines
        encoding="utf-8",
        low_memory=False,  # Avoid memory issues
    ):
        # Preprocessing: clean text column
        chunk["comment_text"] = chunk["comment_text"].str.lower()  # Convert text to lowercase
        chunk["comment_text"] = chunk["comment_text"].str.replace(r"[^a-zA-Z\s]", "", regex=True)  # Remove non-alphabetic characters

        # Select important columns
        processed_chunk = chunk[["id", "comment_text", "toxicity", "severe_toxicity"]]

        # Append the cleaned data to the output file
        processed_chunk.to_csv(output_file, index=False, mode="a", header=output_file.tell() == 0)

print(f"Processed data saved to {output_file_path}")


Processed data saved to /content/processed_data.csv


In [8]:
import pandas as pd

# Load the processed dataset
processed_file_path = "/content/processed_data.csv"
data = pd.read_csv(processed_file_path)

# Inspect the first few rows
print(data.head())

# Check for missing values
print(data.isnull().sum())

# Get dataset statistics
print(data.describe())


  data = pd.read_csv(processed_file_path)


        id                                       comment_text  toxicity  \
0  1083994  he got his money now he lies in wait till afte...  0.373134   
1   650904  mad dog will surely put the liberals in mental...  0.605263   
2  5902188  and trump continues his lifelong cowardice by ...  0.666667   
3  7084460  while arresting a man for resisting arrest\n\n...  0.815789   
4  5410943       tucker and paul are both total bad ass mofos  0.550000   

   severe_toxicity  
0         0.044776  
1         0.013158  
2         0.015873  
3         0.065789  
4         0.037500  
id                   0
comment_text       381
toxicity             1
severe_toxicity      1
dtype: int64
           toxicity  severe_toxicity
count  2.001235e+06     2.001235e+06
mean   1.030730e-01     4.586408e-03
std    1.970402e-01     2.287432e-02
min    0.000000e+00     0.000000e+00
25%    0.000000e+00     0.000000e+00
50%    0.000000e+00     0.000000e+00
75%    1.666667e-01     0.000000e+00
max    1.000000e+00   

In [9]:
# Drop rows with missing comment_text
data = data.dropna(subset=['comment_text'])

# Drop rows with missing toxicity or severe_toxicity values
data = data.dropna(subset=['toxicity', 'severe_toxicity'])

print("Remaining rows after handling missing values:", len(data))


Remaining rows after handling missing values: 2000854


In [11]:
from sklearn.model_selection import train_test_split

# Features and labels
X = data['comment_text']
y = data[['toxicity', 'severe_toxicity']]

# Split into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Train size: {len(X_train)}, Validation size: {len(X_val)}, Test size: {len(X_test)}")


Train size: 1400597, Validation size: 300128, Test size: 300129


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

# Fit and transform training data, and transform validation and test data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

print("TF-IDF vectorization completed!")


TF-IDF vectorization completed!


In [14]:
# Binarize the toxicity values (threshold: 0.5)
y_train['toxicity'] = (y_train['toxicity'] >= 0.5).astype(int)
y_val['toxicity'] = (y_val['toxicity'] >= 0.5).astype(int)
y_test['toxicity'] = (y_test['toxicity'] >= 0.5).astype(int)

print("Toxicity labels binarized for classification!")


Toxicity labels binarized for classification!


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train Logistic Regression for binary toxicity classification
model_toxicity = LogisticRegression(max_iter=1000)
model_toxicity.fit(X_train_tfidf, y_train['toxicity'])

# Validate the model
y_val_pred_toxicity = model_toxicity.predict(X_val_tfidf)

# Evaluate performance
print("Validation Performance (Toxicity):")
print(classification_report(y_val['toxicity'], y_val_pred_toxicity))


Validation Performance (Toxicity):
              precision    recall  f1-score   support

           0       0.95      0.99      0.97    276055
           1       0.78      0.40      0.53     24073

    accuracy                           0.94    300128
   macro avg       0.86      0.69      0.75    300128
weighted avg       0.94      0.94      0.93    300128



In [17]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Binarize the toxicity variable (if not already done)
y_train['toxicity'] = (y_train['toxicity'] >= 0.5).astype(int)
y_val['toxicity'] = (y_val['toxicity'] >= 0.5).astype(int)
y_test['toxicity'] = (y_test['toxicity'] >= 0.5).astype(int)

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(
    max_depth=6,          # Depth of each tree
    n_estimators=100,     # Number of trees
    learning_rate=0.1,    # Learning rate
    use_label_encoder=False,  # Suppress label encoding warnings
    eval_metric="logloss"     # Use log-loss for binary classification
)

# Train the model on the training data
xgb_model.fit(X_train_tfidf, y_train['toxicity'])

# Validate the model on the validation set
y_val_pred = xgb_model.predict(X_val_tfidf)

# Evaluate performance
print("Validation Performance (Toxicity - XGBoost):")
print(classification_report(y_val['toxicity'], y_val_pred))

# Predict on the test set
y_test_pred = xgb_model.predict(X_test_tfidf)

# Evaluate on the test set
print("Test Performance (Toxicity - XGBoost):")
print(classification_report(y_test['toxicity'], y_test_pred))


Parameters: { "use_label_encoder" } are not used.



Validation Performance (Toxicity - XGBoost):
              precision    recall  f1-score   support

           0       0.94      1.00      0.97    276055
           1       0.86      0.26      0.40     24073

    accuracy                           0.94    300128
   macro avg       0.90      0.63      0.68    300128
weighted avg       0.93      0.94      0.92    300128

Test Performance (Toxicity - XGBoost):
              precision    recall  f1-score   support

           0       0.94      1.00      0.97    276036
           1       0.86      0.26      0.40     24093

    accuracy                           0.94    300129
   macro avg       0.90      0.63      0.68    300129
weighted avg       0.93      0.94      0.92    300129



In [None]:
# Install required versions
!pip install tensorflow==2.12.0 transformers==4.33.2 --upgrade

# Import necessary libraries
import pandas as pd
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import EarlyStopping

# Ensure eager execution
tf.config.run_functions_eagerly(True)

# Load dataset
file_path = "/content/all_data.csv"  # Replace with your dataset path
data = pd.read_csv(file_path, dtype={'id': str, 'article_id': str}, low_memory=False)

# Define PCL categories
categories = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'threat', 'obscene', 'sexual_explicit']
data = data.sample(frac=0.1, random_state=42)  # Use 10% of the data for faster training
data = data.dropna(subset=['comment_text'])

for category in categories:
    data.loc[:, category] = data[category].fillna(0).astype(int)

# Split dataset
from sklearn.model_selection import train_test_split
X = data['comment_text']
y = data[categories]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(categories))

# Tokenize data
def tokenize_data(texts, tokenizer, max_len=64):
    return tokenizer(
        texts.tolist(),
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="tf"
    )

train_encodings = tokenize_data(X_train, tokenizer)
val_encodings = tokenize_data(X_val, tokenizer)
test_encodings = tokenize_data(X_test, tokenizer)

# Prepare TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask']},
    y_train.values
)).shuffle(10000).batch(16).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': val_encodings['input_ids'], 'attention_mask': val_encodings['attention_mask']},
    y_val.values
)).batch(16).prefetch(tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask']},
    y_test.values
)).batch(16).prefetch(tf.data.AUTOTUNE)

# Compile the model
optimizer = Adam(learning_rate=3e-5)
loss = BinaryCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.AUC(name="auc")]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Add EarlyStopping to prevent overfitting
early_stopping = EarlyStopping(monitor="val_auc", patience=1, restore_best_weights=True)

# Train the model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2,
    callbacks=[early_stopping]
)

# Evaluate the model
test_results = model.evaluate(test_dataset)
print("Test Results:", test_results)

# Predict on the test set
predictions = model.predict(test_dataset)

# Post-process predictions (convert logits to probabilities)
probabilities = tf.nn.sigmoid(predictions.logits).numpy()
print("Sample Probabilities:", probabilities[:5])  # Display the first 5 predictions


Collecting tensorflow==2.12.0
  Downloading tensorflow-2.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting transformers==4.33.2
  Downloading transformers-4.33.2-py3-none-any.whl.metadata (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.12.0)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting keras<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Downloading keras-2.12.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting numpy<1.24,>=1.22 (from tensorflow==2.12.0)
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting tensorboard<2.13,>=2.12 (from tensorflow==2.12.0)
  Downloading tensorboard-2.12.3-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-estimator<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Downloading tensorflow_estimator-2.1