In [None]:
#BERT model


#!pip install datasets
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset
import json

# Load and preprocess the dataset
def preprocess_data(json_file):
    """Read a JSON-lines file and build a Hugging Face ``Dataset`` for training.

    Each line is parsed independently; lines that are not valid JSON are
    reported on stdout and skipped.

    Args:
        json_file: Path of the JSON-lines file. Every parsed record must
            provide a 'text' entry and a 'composite_toxic' entry.

    Returns:
        A ``Dataset`` with a 'text' column and a binary 'label' column. The
        label is 1 when the sum of the first element of every
        'composite_toxic' entry exceeds 2, otherwise 0.
        NOTE(review): this looks like a majority vote over annotators --
        confirm against the dataset description.

    Raises:
        KeyError: If a parsed record lacks 'text' or 'composite_toxic'.
    """
    records = []
    with open(json_file, 'r') as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line: {raw_line.strip()} due to error: {e}")
            else:
                records.append(parsed)

    texts = [record['text'] for record in records]
    labels = [
        int(sum(entry[0] for entry in record['composite_toxic']) > 2)
        for record in records
    ]
    return Dataset.from_dict({'text': texts, 'label': labels})

dataset = preprocess_data('/content/z639_assignment1_training.json')

# Tokenize the text
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating every text to 128 tokens."""
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)  # Reduced max_length


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Hold out 20% of the examples for evaluation. Evaluating on the very same
# examples the model was trained on reports an over-optimistic score.
# The fixed seed keeps the split reproducible between runs.
split_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=42)

# Load the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  # Further reduced epochs
    per_device_train_batch_size=16,  # Try increasing if memory allows
    evaluation_strategy="epoch",
    save_steps=10_000,
    save_total_limit=2,
    # Mixed precision needs a CUDA device; only enable it when one is present
    # so the script still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets['train'],
    eval_dataset=split_datasets['test'],
)

# Train the model
trainer.train()

# Write the fine-tuned model and its tokenizer into one directory so the
# pair can later be reloaded together from that path.
saved_model_dir = '/content/saved_model'
model.save_pretrained(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)

from google.colab import files
import shutil

# Archive the directory; make_archive appends the '.zip' suffix to the base name.
shutil.make_archive(saved_model_dir, 'zip', saved_model_dir)

# Hand the archive to the browser for download.
files.download('/content/saved_model.zip')

#Test data
# Reload the tokenizer and the fine-tuned model from the directory they were saved to.
reloaded_model_dir = '/content/saved_model'
tokenizer = DistilBertTokenizer.from_pretrained(reloaded_model_dir)
model = DistilBertForSequenceClassification.from_pretrained(reloaded_model_dir)

# Function to load and preprocess test data
def load_test_data(json_file):
    """Return the 'text' field of every valid record in a JSON-lines file.

    Blank lines are ignored and lines that are not valid JSON are reported
    and skipped. This mirrors how ``load_test_data_with_id`` treats bad
    lines, so the texts returned here stay aligned one-to-one with the
    platform ids loaded from the same file.

    Args:
        json_file: Path of the JSON-lines file to read.

    Returns:
        A list with the 'text' value of each successfully parsed record, in
        file order.

    Raises:
        KeyError: If a parsed record has no 'text' entry.
    """
    texts = []
    with open(json_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # A blank (e.g. trailing) line is not a record
            try:
                item = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line: {line.strip()} due to error: {e}")
                continue
            texts.append(item['text'])  # Extract the text field
    return texts

# Load test data
test_texts = load_test_data('/content/z639_assignment1_test.json')

# Put the model in evaluation mode and run it on the GPU when one is available
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Predict in fixed-size batches. Tokenizing the whole test set and pushing it
# through a single forward pass can exhaust memory on larger files.
inference_batch_size = 32
batch_predictions = []
with torch.no_grad():
    for start in range(0, len(test_texts), inference_batch_size):
        batch_texts = test_texts[start:start + inference_batch_size]
        # Tokenize the batch (padded to the longest text within the batch)
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
        inputs = inputs.to(device)
        outputs = model(**inputs)
        # Keep only the predicted class ids, moved back to the CPU
        batch_predictions.append(torch.argmax(outputs.logits, dim=-1).cpu())

# Get predicted labels as one 1-D CPU tensor, in the same order as test_texts
if batch_predictions:
    predictions = torch.cat(batch_predictions)
else:
    predictions = torch.empty(0, dtype=torch.long)

import pandas as pd
import json

# Load the test data with platform_id
def load_test_data_with_id(json_file):
    """Read a JSON-lines file and return its texts and platform ids.

    Every line is parsed on its own; a line that is not valid JSON is
    reported on stdout and skipped.

    Args:
        json_file: Path of the JSON-lines file to read.

    Returns:
        A ``(texts, platform_ids)`` tuple of two lists holding the 'text'
        and 'platform_id' values of each parsed record, in file order.

    Raises:
        KeyError: If a parsed record lacks 'text' or 'platform_id'.
    """
    records = []
    with open(json_file, 'r') as handle:
        for raw_line in handle:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line: {raw_line.strip()} due to error: {e}")
            else:
                records.append(record)

    texts = [record['text'] for record in records]
    platform_ids = [record['platform_id'] for record in records]
    return texts, platform_ids

# Re-read the test file, this time also collecting each record's platform_id.
# 'predictions' must already hold one predicted class per test record.
test_texts, platform_ids = load_test_data_with_id('/content/z639_assignment1_test.json')

# One boolean per record: True when the predicted class is 1
prediction_flags = [bool(pred == 1) for pred in predictions]

# Pair every platform_id with its prediction
df = pd.DataFrame({'platform_id': platform_ids, 'prediction': prediction_flags})

# Write the table without the DataFrame index column
df.to_csv('/content/predictions_with_id.csv', index=False)

# Print confirmation
print("Predictions saved to /content/predictions_with_id.csv")

#Evaluate the BERT results
from sklearn.metrics import classification_report, accuracy_score
import json

# Load the true labels from your test dataset (assuming the labels are present)
def load_true_labels(json_file):
    """Derive binary ground-truth labels from a JSON-lines test file.

    Blank lines are ignored and lines that are not valid JSON are reported
    and skipped, matching the other loaders in this notebook so the labels
    stay aligned with the predictions made from the same file.

    NOTE(review): the 'toxicity_level' key is a placeholder. Adjust it to
    the real structure of the test data (for example a direct 'label' key)
    before trusting the resulting metrics.

    Args:
        json_file: Path of the JSON-lines file to read.

    Returns:
        A list with one label per parsed record: 1 when the first element of
        the record's 'toxicity_level' list is greater than 0.5, otherwise 0.
        Records whose 'toxicity_level' is missing or empty are labelled 0.
    """
    labels = []
    with open(json_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # A blank (e.g. trailing) line is not a record
            try:
                item = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line: {line.strip()} due to error: {e}")
                continue
            # Fall back to [0] for a missing *or empty* list so that indexing
            # the first element cannot raise IndexError.
            levels = item.get('toxicity_level') or [0]
            labels.append(1 if levels[0] > 0.5 else 0)
    return labels

# Assuming 'predictions' and 'test_texts' are defined as before
true_labels = load_true_labels('/content/z639_assignment1_test.json')

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Detailed classification report.
# Passing labels=[0, 1] pins both classes explicitly. Without it the report
# raises ValueError whenever only one class occurs in the data, because the
# two target_names no longer match the number of classes found.
report = classification_report(
    true_labels,
    predictions,
    labels=[0, 1],
    target_names=['Not Toxic', 'Toxic'],
)
print(report)

In [None]:
#SVM model

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load and preprocess the dataset
def preprocess_data_svm(json_file):
    """Read a JSON-lines training file and return its texts and binary labels.

    Empty lines are ignored. Lines that are not valid JSON are reported on
    stdout (the line, then the parser error) and skipped.

    Args:
        json_file: Path of the JSON-lines file. Every parsed record must
            provide a 'text' entry and a 'composite_toxic' entry.

    Returns:
        A ``(texts, labels)`` tuple of two lists. A label is 1 when the sum
        of the first element of every 'composite_toxic' entry exceeds 2,
        otherwise 0.

    Raises:
        KeyError: If a parsed record lacks 'text' or 'composite_toxic'.
    """
    import json

    records = []
    with open(json_file, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue  # Nothing to parse on an empty line
            try:
                records.append(json.loads(raw_line))
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON line: {stripped}")
                print(f"Error: {e}")

    texts = [record['text'] for record in records]
    labels = [
        int(sum(entry[0] for entry in record['composite_toxic']) > 2)
        for record in records
    ]
    return texts, labels

texts, labels = preprocess_data_svm('z639_assignment1_training.json')

# Split the raw texts *before* vectorizing. Fitting TF-IDF on the full corpus
# would let the validation texts influence the vocabulary and IDF weights,
# which leaks information and inflates the validation score.
train_texts, val_texts, y_train, y_val = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Vectorization: learn the vocabulary from the training texts only, then
# apply the same fitted transformation to the validation texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)

# Train SVM
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Evaluate on the held-out validation split
y_pred = svm_model.predict(X_val)
print(classification_report(y_val, y_pred))

import joblib

# Persist the trained classifier together with the fitted TF-IDF vectorizer;
# both files are loaded again further down to score the test data.
artifacts_by_path = {
    '/content/svm_model.pkl': svm_model,
    '/content/tfidf_vectorizer.pkl': vectorizer,
}
for artifact_path, artifact in artifacts_by_path.items():
    joblib.dump(artifact, artifact_path)

import json
import pandas as pd
import joblib

# Load the test data with platform_id
def load_test_data_with_id(json_file):
    """Read a JSON-lines file and return its texts and platform ids.

    Empty lines are ignored. Lines that are not valid JSON are reported on
    stdout (the line, then the parser error) and skipped.

    Args:
        json_file: Path of the JSON-lines file to read.

    Returns:
        A ``(texts, platform_ids)`` tuple of two lists holding the 'text'
        and 'platform_id' values of each parsed record, in file order.

    Raises:
        KeyError: If a parsed record lacks 'text' or 'platform_id'.
    """
    records = []
    with open(json_file, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue  # Nothing to parse on an empty line
            try:
                records.append(json.loads(raw_line))
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON line: {stripped}")
                print(f"Error: {e}")

    texts = [record['text'] for record in records]
    platform_ids = [record['platform_id'] for record in records]
    return texts, platform_ids

# Restore the persisted classifier and the TF-IDF vectorizer it was trained with
svm_model_loaded = joblib.load('/content/svm_model.pkl')
vectorizer_loaded = joblib.load('/content/tfidf_vectorizer.pkl')

# Read the test records and project their texts into the fitted TF-IDF space
test_texts, platform_ids = load_test_data_with_id('/content/z639_assignment1_test.json')
X_test = vectorizer_loaded.transform(test_texts)

# Predict one class per test record
predictions = svm_model_loaded.predict(X_test)

# One boolean per record: True when the predicted class is 1
prediction_flags = [bool(pred == 1) for pred in predictions]

# Pair every platform_id with its prediction
df = pd.DataFrame({'platform_id': platform_ids, 'prediction': prediction_flags})

# NOTE(review): this is the same path the BERT section writes to, so running
# both sections leaves only the later result on disk -- confirm that is intended.
df.to_csv('/content/predictions_with_id.csv', index=False)

# Print confirmation
print("Predictions saved to /content/predictions_with_id.csv")


#Evaluate the SVM results
import json
from sklearn.metrics import classification_report, accuracy_score

# Load the true labels from your test dataset
def load_true_labels(json_file):
    """Derive binary ground-truth labels from a JSON-lines file.

    Empty lines are ignored. Lines that are not valid JSON are reported on
    stdout (the line, then the parser error) and skipped.

    Args:
        json_file: Path of the JSON-lines file to read.

    Returns:
        A list with one label per parsed record: 1 when the sum of the first
        element of every 'composite_toxic' entry exceeds 2, otherwise 0.
        A record without a 'composite_toxic' key is labelled 0.
        NOTE(review): if the test file carries no 'composite_toxic' data,
        every label is 0 and the metrics computed from them are not
        meaningful -- confirm the test file actually contains labels.
    """
    records = []
    with open(json_file, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue  # Nothing to parse on an empty line
            try:
                records.append(json.loads(raw_line))
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON line: {stripped}")
                print(f"Error: {e}")

    labels = []
    for record in records:
        # A missing key yields an empty list, whose sum is 0
        votes = sum(entry[0] for entry in record.get('composite_toxic', []))
        labels.append(1 if votes > 2 else 0)
    return labels

# Load true labels
true_labels = load_true_labels('/content/z639_assignment1_test.json')

# Assuming 'predictions' are already defined as before from the SVM model
# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Detailed classification report.
# Passing labels=[0, 1] pins both classes explicitly. Without it the report
# raises ValueError whenever only one class occurs in the data, because the
# two target_names no longer match the number of classes found.
report = classification_report(
    true_labels,
    predictions,
    labels=[0, 1],
    target_names=['Not Toxic', 'Toxic'],
)
print(report)