In [None]:
!pip install transformers pandas torch
!pip install datasets


In [None]:
import pandas as pd

# Read the labelled spreadsheet from the Kaggle input mount and preview the
# first rows as the cell output.
# NOTE(review): `dataset` is not referenced by any later cell visible in this
# notebook -- confirm how train_df / test_df are derived from it.
labelled_xlsx_path = "/kaggle/input/labelled-dataset/Label.xlsx"
dataset = pd.read_excel(labelled_xlsx_path)
dataset.head()

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset

# NOTE(review): train_df, test_df and label_mapping are not defined in any cell
# visible above this one; on a fresh kernel this cell raises NameError unless
# they come from a cell that is missing from the notebook.
def _encode_label(example):
    """Return the example's 'label' replaced by its entry in label_mapping."""
    return {'label': label_mapping[example['label']]}


# Wrap each pandas frame as a Hugging Face Dataset, then rewrite the 'label'
# column row by row through label_mapping.
hf_train_dataset = Dataset.from_pandas(train_df).map(_encode_label)
hf_test_dataset = Dataset.from_pandas(test_df).map(_encode_label)


# Tokenizer matching the checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the 'text_' field of a batch of examples.

    Sequences are truncated and padded to the tokenizer's maximum length, so
    every encoded row has the same size.
    """
    review_texts = examples["text_"]
    return tokenizer(review_texts, padding="max_length", truncation=True)

# Run tokenize_function over both splits in batched mode (train first, then
# test), keeping the results under the same two names as before.
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (hf_train_dataset, hf_test_dataset)
)


In [None]:
# Shuffle both tokenized splits with one fixed seed so the row order is
# reproducible between runs.
shuffle_seed = 42
train_dataset = tokenized_train_dataset.shuffle(seed=shuffle_seed)
eval_dataset = tokenized_test_dataset.shuffle(seed=shuffle_seed)

import pandas as pd

# Materialise the shuffled, tokenized splits as pandas DataFrames and write
# them to CSV in the working directory.
#
# The frames get their own names instead of reusing `train_df`: the earlier
# conversion cell reads `train_df` as the raw training frame, so overwriting it
# here would make that cell fail (or silently use tokenized data) when re-run.
train_export_df = pd.DataFrame(train_dataset)
eval_export_df = pd.DataFrame(eval_dataset)

# index=False keeps the pandas row index out of the files.
train_export_df.to_csv('train_dataset.csv', index=False)
eval_export_df.to_csv('eval_dataset.csv', index=False)


In [None]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model built from the bert-base-cased checkpoint and
# configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
)

In [None]:
!pip install evaluate

In [None]:
import numpy as np
import evaluate

# Accuracy metric object shared by compute_metrics below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: a pair that unpacks into (logits, labels).

    Returns:
        Whatever `metric.compute` returns for the arg-max predictions
        compared against the reference labels.
    """
    raw_scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    return metric.compute(
        predictions=np.argmax(raw_scores, axis=-1),
        references=gold_labels,
    )
    
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the Trainer, collected in one mapping and expanded into
# TrainingArguments.
training_config = {
    "output_dir": "test_trainer",        # checkpoints and outputs go here
    "eval_strategy": "epoch",            # evaluate once per epoch
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 2,
    "save_steps": 10_000,                # checkpoint interval, in steps
    "save_total_limit": 2,               # keep at most two checkpoints
    "logging_dir": "logs",
    "logging_steps": 500,
}
training_args = TrainingArguments(**training_config)

    
# Wire the model, the training arguments, both dataset splits and the accuracy
# callback into a Trainer.
trainer_setup = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_setup)

# Fine-tune; the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Evaluate the fine-tuned model on eval_dataset and show the resulting metrics
# as the cell output.
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Persist the fine-tuned `model` and its tokenizer side by side so the
# directory can be reloaded later with `from_pretrained`.
save_dir = '/kaggle/working/Prediction_model'
model.save_pretrained(save_dir)

# Save the tokenizer instance that was actually used to encode the training
# data instead of fetching a fresh copy of 'bert-base-cased' from the hub:
# this avoids a redundant download and guarantees the saved tokenizer matches
# the one the model was trained with.
tokenizer.save_pretrained(save_dir)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Reload the saved tokenizer and classifier, and place the model on `device`.
saved_model_dir = "/kaggle/working/Prediction_model"
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)
model = model.to(device)

# Sample review used as a smoke test for the saved model.
input_text = """Anyone expecting some miracle tool... this is it.  Not too big or too small, but just what I wanted.  The plastic parts are solid and the plastic is solid.  The metal part is well made and will last a long time.  This is an excellent product.  I highly recommend.Works great. I installed it in the kitchen and it works great. I installed it in my dining room and it works great.Great.  I like that the spring is"""

# Encode the text as PyTorch tensors (truncated to 512 tokens) on `device`.
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)
inputs = inputs.to(device)

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# The predicted class is the index of the largest logit.
logits = outputs.logits
predicted_class = logits.argmax(-1).item()

# Class 0 is reported as computer generated; anything else as original.
verdict = "Computer generated Review" if predicted_class == 0 else "Original Review"
print(verdict)


In [None]:
# Recursively (-r) zip the saved model directory into
# /kaggle/working/Prediction_model.zip so it can be downloaded as one file.
!zip -r /kaggle/working/Prediction_model.zip /kaggle/working/Prediction_model


In [None]:
from transformers import BertTokenizer

# Reload the tokenizer from the saved model directory via BertTokenizer.
# NOTE(review): `tokenizer` is reassigned from the same directory in the batch
# inference cell further down before it is used again, so this cell looks
# redundant -- confirm before removing.
saved_tokenizer_dir = '/kaggle/working/Prediction_model'
tokenizer = BertTokenizer.from_pretrained(saved_tokenizer_dir)


In [None]:
import pandas as pd

# Reload the evaluation split from the CSV in the Kaggle working directory
# (adjust the path if the file lives elsewhere).
eval_data = pd.read_csv('/kaggle/working/eval_dataset.csv')

# The 'text_' column supplies the review text and 'label' the true labels;
# both are pulled out as plain Python lists.
texts, true_labels = (
    eval_data[column].tolist() for column in ('text_', 'label')
)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the saved tokenizer and classifier, and run on the GPU when one is
# available -- consistent with the single-text inference cell. Previously the
# whole evaluation set was scored on the CPU.
model_path = '/kaggle/working/Prediction_model'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

# Score the evaluation texts in fixed-size batches to bound memory use.
batch_size = 100  # Adjust this size based on your memory limits
predicted_labels = []

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i + batch_size]
    # Pad to the longest text in the batch, truncate at 512 tokens, and move
    # the encoded tensors to the same device as the model.
    inputs = tokenizer(
        batch_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Arg-max over the class dimension; bring the result back to the CPU and
    # store plain Python ints so downstream code does not depend on tensors.
    batch_predictions = outputs.logits.argmax(dim=-1).cpu().tolist()
    predicted_labels.extend(batch_predictions)


In [None]:
from sklearn.metrics import confusion_matrix
# Pin the label order to [0, 1] so the matrix is always 2x2, even if one class
# is absent from both the true and the predicted labels; without this the
# four-way unpacking below fails on a degenerate evaluation set.
cm = confusion_matrix(true_labels, predicted_labels, labels=[0, 1])

# Rows are true labels and columns are predictions, so flattening the matrix
# yields TN, FP, FN, TP with class 1 treated as the positive ("True") class.
TN, FP, FN, TP = cm.ravel()

# Re-arrange the counts into a table with the positive class listed first.
cm_table = pd.DataFrame(
    {
        'Predicted True': [TP, FP],
        'Predicted False': [FN, TN]
    },
    index=['Actual True', 'Actual False']
)

print(cm_table)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the confusion matrix as an annotated heatmap with integer cell labels.
heatmap_style = {'annot': True, 'fmt': 'd', 'cmap': 'Blues'}

plt.figure(figsize=(8, 6))
sns.heatmap(cm, **heatmap_style)

# Title and axis labels so the figure stands on its own.
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()


In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 summary; the display names are given in
# label order.
class_names = ['Computer Generated', 'Original']
report = classification_report(
    true_labels,
    predicted_labels,
    target_names=class_names,
)
print(report)
