In [None]:
pip install transformers torch


In [None]:
!pip install datasets

In [None]:
import pandas as pd

# Read the labelled reviews CSV and preview its first rows
LABELLED_CSV = "/kaggle/input/labelled-dataset/Label.csv"
dataset = pd.read_csv(LABELLED_CSV)
dataset.head()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the labelled reviews into a DataFrame
dataset = pd.read_csv("/kaggle/input/labelled-dataset/Label.csv")

# Textual label -> integer class id, numbered in the order listed here
label_mapping = {
    name: class_id
    for class_id, name in enumerate(("Computer generated Review", "Original Review"))
}

# Hold out 20% of the rows as the test split; the fixed seed makes the split repeatable
train_df, test_df = train_test_split(
    dataset,
    random_state=42,
    test_size=0.2,
)


In [None]:
from transformers import AutoTokenizer
from datasets import Dataset

def _encode_label(example):
    """Return the example's 'label' replaced by its integer id from label_mapping."""
    return {'label': label_mapping[example['label']]}


# Wrap each pandas split in a Hugging Face Dataset and encode its labels
hf_train_dataset = Dataset.from_pandas(train_df).map(_encode_label)
hf_test_dataset = Dataset.from_pandas(test_df).map(_encode_label)

# Tokenizer for the 'xlnet-base-cased' checkpoint
tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')

def tokenize_function(examples):
    """Tokenize the ``text_`` field of a batch of examples.

    Inputs are truncated to at most 128 tokens and padded up to that same
    length. Returns the tokenizer's output for the batch.
    """
    review_texts = examples["text_"]
    return tokenizer(
        review_texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

# Run tokenize_function over both splits, handing it a batch of examples per call
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (hf_train_dataset, hf_test_dataset)
)




In [None]:
import pandas as pd

# Shuffle each tokenized split in full with a fixed seed (no subsetting)
train_dataset = tokenized_train_dataset.shuffle(seed=42)
eval_dataset = tokenized_test_dataset.shuffle(seed=42)

# Materialise the shuffled splits as pandas DataFrames
train_df = pd.DataFrame(train_dataset)
eval_df = pd.DataFrame(eval_dataset)

# Write both frames to CSV in the current directory, leaving out the index column
for frame, csv_name in ((train_df, 'train_dataset.csv'), (eval_df, 'eval_dataset.csv')):
    frame.to_csv(csv_name, index=False)


In [None]:
from transformers import XLNetForSequenceClassification

# Load the 'xlnet-base-cased' checkpoint as a sequence classifier.
# The number of output classes is taken from label_mapping so the model head
# and the label encoding cannot drift apart (two classes with the current mapping).
model = XLNetForSequenceClassification.from_pretrained(
    'xlnet-base-cased',
    num_labels=len(label_mapping),
)


In [None]:
!pip install evaluate

In [None]:
from datasets import Dataset
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import numpy as np
import evaluate
# Accuracy scorer from the `evaluate` library, reused by compute_metrics
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` unpacks into (logits, reference labels). The predicted class
    is the arg-max over the last axis of the logits. Returns whatever
    ``metric.compute`` produces for those predictions and references.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)
import inspect

# Transformers renamed `evaluation_strategy` to `eval_strategy` and later
# dropped the old spelling. Use whichever keyword the installed version
# accepts so this cell runs on both older and newer releases.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',               # Where checkpoints are written
    save_strategy="epoch",                # Checkpoint once per epoch (same cadence as evaluation)
    load_best_model_at_end=True,          # This must be True for EarlyStoppingCallback
    metric_for_best_model='eval_loss',    # Metric to monitor
    greater_is_better=False,              # Lower eval_loss is better
    num_train_epochs=3,                   # Total number of training epochs
    per_device_train_batch_size=16,       # Training batch size
    per_device_eval_batch_size=16,        # Evaluation batch size
    logging_dir='./logs',                 # Directory for storing logs
    logging_steps=10,                     # Log every 10 steps
    **{_eval_strategy_arg: "epoch"},      # Evaluate once per epoch
)

# Early stopping with a patience of one evaluation
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

# Wire the model, arguments, tokenized splits and metric function into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    callbacks=[early_stopping],
)

# Fine-tune the model
trainer.train()

In [None]:
# Evaluate on the eval_dataset the Trainer was constructed with
# (tokenized_test_dataset); the returned metrics are shown as the cell output.
trainer.evaluate()

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Write the fine-tuned model and its tokenizer into the same directory so the
# directory can later be passed to from_pretrained() on its own.
MODEL_OUTPUT_DIR = '/kaggle/working/XLnet_model'
model.save_pretrained(MODEL_OUTPUT_DIR)

# Save the tokenizer already in memory (the one used to prepare the training
# data) instead of downloading a second copy from the Hub.
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)


In [None]:
# Recursively zip the saved model directory into /kaggle/working/XLnet_model.zip.
# NOTE(review): the directory is passed as an absolute path, so archive entries
# presumably carry the kaggle/working/ prefix — confirm that layout is intended.
!zip -r /kaggle/working/XLnet_model.zip /kaggle/working/XLnet_model

In [None]:
!pip install tensorboard


In [None]:
from transformers import AutoTokenizer

# Reload the tokenizer from the directory the model artefacts were saved to
SAVED_MODEL_DIR = '/kaggle/working/XLnet_model'
tokenizer = AutoTokenizer.from_pretrained(SAVED_MODEL_DIR)

In [None]:
import pandas as pd

# Read the evaluation CSV from the working directory
EVAL_CSV = '/kaggle/working/eval_dataset.csv'
eval_data = pd.read_csv(EVAL_CSV)

# Review text comes from the 'text_' column, ground-truth labels from 'label'
texts, true_labels = (eval_data[column].tolist() for column in ('text_', 'label'))

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Run the trained model over the shuffled evaluation split
test_results = trainer.predict(eval_dataset)
preds = np.argmax(test_results.predictions, axis=1)  # Predicted class id per example
labels = test_results.label_ids  # True class ids returned alongside the predictions

# Order class ids and display names together so that row/column i of the
# matrix is always annotated with the name mapped to class id i.
class_ids = sorted(label_mapping.values())
class_names = sorted(label_mapping, key=label_mapping.get)

# Passing labels= pins the matrix to one row/column per known class, even if a
# class never occurs in this split's labels or predictions; otherwise the
# matrix would shrink and no longer line up with the tick labels.
conf_matrix = confusion_matrix(labels, preds, labels=class_ids)

# Plot the confusion matrix as an annotated heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()



In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the predictions made above.
# labels= keeps the report aligned with the two target names; without it the
# call fails when one class is missing from both the labels and the predictions.
report = classification_report(
    labels,
    preds,
    labels=[0, 1],
    target_names=['Computer Generated', 'Original'],
)
print(report)