In [5]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1


In [4]:
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed
import os
from sklearn.model_selection import train_test_split
from scipy.special import softmax
import argparse
import logging

def preprocess_function(examples, **fn_kwargs):
    return fn_kwargs['tokenizer'](examples["text"], truncation=True)


def get_data(train_path, test_path, random_seed):
    """
    function to read dataframe with columns
    """

    train_df = pd.read_json(train_path, lines=True)
    test_df = pd.read_json(test_path, lines=True)
    
    train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=random_seed)

    return train_df, val_df, test_df

def compute_metrics(eval_pred):

    f1_metric = evaluate.load("f1")

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    results = {}
    results.update(f1_metric.compute(predictions=predictions, references = labels, average="micro"))

    return results


def fine_tune(train_df, valid_df, checkpoints_path, id2label, label2id, model):

    # pandas dataframe to huggingface Dataset
    train_dataset = Dataset.from_pandas(train_df)
    valid_dataset = Dataset.from_pandas(valid_df)
    
    # get tokenizer and model from huggingface
    tokenizer = AutoTokenizer.from_pretrained(model)     # put your model here
    model = AutoModelForSequenceClassification.from_pretrained(
       model, num_labels=len(label2id), id2label=id2label, label2id=label2id    # put your model here
    )
    
    # tokenize data for train/valid
    tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
    tokenized_valid_dataset = valid_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})
    

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


    # create Trainer 
    training_args = TrainingArguments(
        output_dir=checkpoints_path,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_valid_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # save best model
    best_model_path = checkpoints_path+'/best/'
    
    if not os.path.exists(best_model_path):
        os.makedirs(best_model_path)
    

    trainer.save_model(best_model_path)


def test(test_df, model_path, id2label, label2id):
    
    # load tokenizer from saved model 
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # load best model
    model = AutoModelForSequenceClassification.from_pretrained(
       model_path, num_labels=len(label2id), id2label=id2label, label2id=label2id
    )
            
    test_dataset = Dataset.from_pandas(test_df)

    tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # create Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # get logits from predictions and evaluate results using classification report
    predictions = trainer.predict(tokenized_test_dataset)
    prob_pred = softmax(predictions.predictions, axis=-1)
    preds = np.argmax(predictions.predictions, axis=-1)
    metric = evaluate.load("bstrai/classification_report")
    results = metric.compute(predictions=preds, references=predictions.label_ids)
    
    # return dictionary of classification report
    return results, preds


if __name__ == '__main__':

    random_seed = 0
    train_path =  "/kaggle/input/text-dataset/subtaskA_train_monolingual.jsonl" # For example 'subtaskA_train_multilingual.jsonl'
    test_path =  "/kaggle/input/text-dataset/subtaskA_monolingual.jsonl" # For example 'subtaskA_test_multilingual.jsonl'
    model =  "xlm-roberta-base" # For example 'xlm-roberta-base'
    subtask =  'A' # For example 'A'
    prediction_path = 'subtaskA_predictions.jsonl' # For example subtaskB_predictions.jsonl

    if not os.path.exists(train_path):
        logging.error("File doesnt exists: {}".format(train_path))
        raise ValueError("File doesnt exists: {}".format(train_path))
    
    if not os.path.exists(test_path):
        logging.error("File doesnt exists: {}".format(train_path))
        raise ValueError("File doesnt exists: {}".format(train_path))
    

    if subtask == 'A':
        id2label = {0: "human", 1: "machine"}
        label2id = {"human": 0, "machine": 1}
    elif subtask == 'B':
        id2label = {0: 'human', 1: 'chatGPT', 2: 'cohere', 3: 'davinci', 4: 'bloomz', 5: 'dolly'}
        label2id = {'human': 0, 'chatGPT': 1,'cohere': 2, 'davinci': 3, 'bloomz': 4, 'dolly': 5}
    else:
        logging.error("Wrong subtask: {}. It should be A or B".format(train_path))
        raise ValueError("Wrong subtask: {}. It should be A or B".format(train_path))

    set_seed(random_seed)

    #get data for train/dev/test sets
    train_df, valid_df, test_df = get_data(train_path, test_path, random_seed)
    
    # train detector model
    fine_tune(train_df, valid_df, f"{model}/subtask{subtask}/{random_seed}", id2label, label2id, model)

    # test detector model
    results, predictions = test(test_df, f"{model}/subtask{subtask}/{random_seed}/best/", id2label, label2id)
    
    logging.info(results)
    predictions_df = pd.DataFrame({'id': test_df['id'], 'label': predictions})
    predictions_df.to_json(prediction_path, lines=True, orient='records')

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/96 [00:00<?, ?ba/s]

  0%|          | 0/24 [00:00<?, ?ba/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,F1
1,0.0954,0.30335,0.928732
2,0.0586,0.2124,0.956204
3,0.0241,0.36591,0.940882


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

  0%|          | 0/35 [00:00<?, ?ba/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Downloading builder script:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

TypeError: object of type 'NoneType' has no len()

In [1]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [27]:
import jsonlines
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Load your dataset
def load_data(file_path):
    texts, labels = [], []
    with jsonlines.open(file_path, mode='r') as reader:
        for obj in reader:
            texts.append(obj['text'])
            # Assuming the label for human-written is 'human' and for ChatGPT-written is 'chatgpt'
            labels.append(obj['label'])
    return texts, labels

# Preprocess and Vectorize the text data
def preprocess_and_vectorize(texts):
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=10000)
    features = vectorizer.fit_transform(texts)
    return features, vectorizer

def test_preprocess_and_vectorize(texts, vectorizer):
    return vectorizer.transform(texts)

# Main function to load data, train and evaluate the model
def train(jsonl_file_path, test_file_path):
    # Load the dataset
    texts, labels = load_data(jsonl_file_path)
    test_text, test_labels = load_data(test_file_path)
    
    # Preprocess and vectorize text data
    X, vectorizer = preprocess_and_vectorize(texts)
    y = pd.Series(labels)
    
    
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    X_test_gold = test_preprocess_and_vectorize(test_text,vectorizer)
    y_test_gold = pd.Series(test_labels)
    
    # Initialize and train a logistic regression classifier
    classifier = RandomForestClassifier()
    classifier.fit(X_train, y_train)
    
    # Make predictions on the test set
    predictions = classifier.predict(X_test)
    
    predictions_gold = classifier.predict(X_test_gold)
    
    # Evaluate the classifier
    print(f"Training Accuracy: {accuracy_score(y_test, predictions)}")
    print(f"Training Micro F1 Score:{f1_score(y_test, predictions, average='micro')} ")
    print("\nTraining Classification Report:")
    print(classification_report(y_test, predictions))
    print(f"\nTraining Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    
    print(f"\n\nTesting Gold Accuracy: {accuracy_score(y_test_gold, predictions_gold)}")
    print(f"Testing Gold Micro F1 Score:{f1_score(y_test_gold, predictions_gold, average='micro')} ")
    print("\Testing Gold Classification Report:")
    print(classification_report(y_test_gold, predictions_gold))
    print(f"\Testing Gold Confusion Matrix:")
    print(confusion_matrix(y_test_gold, predictions_gold))
    
    return classifier, test_text, X_test
    
    


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
jsonl_file_path = '/kaggle/input/text-dataset/subtaskA_train_monolingual.jsonl'  # Update this path
classifier, test_text, X_test = train(jsonl_file_path, "/kaggle/input/text-dataset/subtaskA_monolingual_gold.jsonl")

Training Accuracy: 0.8772127588510354
Training Micro F1 Score:0.8772127588510354 

Training Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.93      0.89     12496
           1       0.91      0.82      0.87     11456

    accuracy                           0.88     23952
   macro avg       0.88      0.87      0.88     23952
weighted avg       0.88      0.88      0.88     23952


Training Confusion Matrix:
[[11585   911]
 [ 2030  9426]]


Testing Gold Accuracy: 0.8691935107376284
Testing Gold Micro F1 Score:0.8691935107376284 
\Testing Gold Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.91      0.87     16272
           1       0.91      0.83      0.87     18000

    accuracy                           0.87     34272
   macro avg       0.87      0.87      0.87     34272
weighted avg       0.87      0.87      0.87     34272

\Testing Gold Confusion Matrix:
[[14772  15

In [29]:
data = [{"text" : test_text[index], "label" : classifier.predict(encodings).tolist(), 'id':index} for index, encodings in enumerate(X_test)]
    
with jsonlines.open('prediction.json', mode='w') as writer:
    for entry in data:
        writer.write(entry)