In [None]:
import torch
# Pick the first CUDA GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Using device:", device)
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Using device: cpu


In [None]:
# Colab-specific: mount Google Drive at /content/drive.
# The training TSV read later in this notebook lives under
# /content/drive/MyDrive, so this cell has to run before the data is loaded.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Export WANDB_DISABLED=true for this process.
# NOTE(review): presumably this switches off Weights & Biases logging in
# training libraries that honour the variable - confirm it is still needed.
os.environ.update(WANDB_DISABLED="true")

In [None]:
# Seed every random number generator this notebook has access to, so that a
# fresh "Restart & Run All" reproduces the same numbers.
import random  # imported here so the cell is self-contained

seed = 25
random.seed(seed)                 # Python standard-library RNG
np.random.seed(seed)              # NumPy legacy global RNG
torch.manual_seed(seed)           # PyTorch default generator
torch.cuda.manual_seed_all(seed)  # every CUDA device

In [None]:
# Read the tab-separated training file from the mounted Drive and make sure
# the frame carries a clean 0..n-1 index before previewing the first rows.
train_data = pd.read_csv(
    '/content/drive/MyDrive/datasets/subtask_1/en/train.tsv',
    sep='\t',
).reset_index(drop=True)
print(train_data.head())

      id                                               text      label
0  12322  you need to stop the engine and wait until it ...  generated
1   1682  The Commission shall publish the report; an in...  generated
2  22592  I have not been tweeting a lot lately, but I d...  generated
3  17390  I pass my exam and really thankgod for that bu...      human
4  30453  The template will have 3 parts: a mustache sha...      human


In [None]:
# Pull the text and label columns out of the frame as plain Python lists,
# then show which distinct label values occur.
train_data_texts = train_data['text'].tolist()
train_data_labels = train_data['label'].tolist()
print("Unique labels: ", set(train_data_labels))

Unique labels:  {'human', 'generated'}


In [None]:
from sklearn.model_selection import train_test_split
# Re-derive the raw lists from the frame so this cell only needs `train_data`.
train_data_texts = train_data['text'].tolist()
train_data_labels = train_data['label'].tolist()

# Step 1: hold out 10% of all rows as the test set.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_data_texts,
    train_data_labels,
    test_size=0.1,
    random_state=25,
)

# Step 2: hold out 10% of the remaining rows as the validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.1,
    random_state=25,
)

print('train data size: ', len(train_texts))
print('validation data size: ', len(val_texts))
print('test data size: ', len(test_texts))


train data size:  27414
validation data size:  3046
test data size:  3385


In [None]:
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Learn the tf-idf vocabulary and weights from the training texts only, then
# project the held-out test texts into that same feature space.
# The tuple is evaluated left to right, so the fit happens before the
# test-set transform.
tfidf_vectorizer = TfidfVectorizer()
train_tfidf, test_tfidf = (
    tfidf_vectorizer.fit_transform(train_texts),
    tfidf_vectorizer.transform(test_texts),
)

In [None]:
# Train a Random Forest classifier on the training tf-idf vectors.
# - random_state=seed pins the forest's own randomness, so re-running this
#   cell reproduces the same report instead of depending on how much of the
#   global NumPy RNG stream earlier cells have consumed.
# - n_jobs=-1 builds the trees on all available cores; results are unchanged.
# NOTE: `train_tfidf` / `test_tfidf` are reassigned by the later XGBoost cell.
# Re-run the vectoriser cell above before re-running this one out of order.
rf_classifier = RandomForestClassifier(random_state=seed, n_jobs=-1)
rf_classifier.fit(train_tfidf, train_labels)

# Predict labels for the held-out test tf-idf vectors
pred_labels = rf_classifier.predict(test_tfidf)

# Per-class precision / recall / F1 on the test split
report = classification_report(test_labels, pred_labels)
print(report)

              precision    recall  f1-score   support

   generated       0.73      0.85      0.78      1682
       human       0.82      0.68      0.75      1703

    accuracy                           0.77      3385
   macro avg       0.78      0.77      0.77      3385
weighted avg       0.78      0.77      0.77      3385



In [None]:
'''
  XGBoost is a gradient boosting algorithm: it builds a sequence of weak learners,
  where each new learner is fitted to correct the errors made by the previous ones.
  On the other hand, Random Forest is a bagging algorithm that builds multiple decision trees 
  in parallel and combines their predictions by taking the majority vote.


  XGBoost is a powerful algorithm that can handle complex relationships between features and 
  the target variable, especially for large datasets, while Random Forest is a reliable 
  algorithm that is easier to interpret and generally works well for smaller datasets with fewer features. 
'''

# Note: XGBoost is trained on the numeric tf-idf matrix, not on the raw text, so it has no
# character-set restriction of its own. The ASCII clean-up below is an optional
# text-normalisation step applied before the texts are vectorised.
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.metrics import classification_report

def preprocess_text(text):
    """Normalise ``text`` to lowercase ASCII letters, digits and whitespace.

    Steps:
      1. NFKD-decompose the string so accented letters split into a base
         letter plus a combining mark (``"é"`` -> ``"e"`` + U+0301).
      2. Encode to ASCII ignoring errors. This drops the combining marks and
         any character that has no ASCII decomposition (emoji, CJK, ...).
      3. Remove every character that is not ``[a-zA-Z0-9]`` or whitespace and
         lowercase the result.

    Args:
        text: The input string.

    Returns:
        The cleaned, lowercased ASCII string (possibly empty).
    """
    import unicodedata  # local import keeps this cell self-contained

    # Without the decomposition step "Café" would become "caf" rather than
    # "cafe", because a bare ASCII encode simply discards the accented letter.
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_text = decomposed.encode('ascii', 'ignore').decode('ascii')
    # Remove any remaining non-alphanumeric characters and convert to lowercase
    return re.sub(r'[^a-zA-Z0-9\s]', '', ascii_text).lower()

# Preprocess the train and test texts
train_texts_ascii = [preprocess_text(text) for text in train_texts]
test_texts_ascii = [preprocess_text(text) for text in test_texts]

# Convert train and test texts to tf-idf vectors.
# NOTE: this reassigns `train_tfidf` / `test_tfidf`, which the Random Forest
# cell above also uses. After this cell they hold the ASCII-cleaned features.
tfidf = TfidfVectorizer()
train_tfidf = tfidf.fit_transform(train_texts_ascii)
test_tfidf = tfidf.transform(test_texts_ascii)

# Map the string labels to integer class ids (a label missing from
# label_map raises KeyError here).
label_map = {'human': 0, 'generated': 1}
train_labels_numeric = np.array([label_map[label] for label in train_labels], dtype=np.int32)
test_labels_numeric = np.array([label_map[label] for label in test_labels], dtype=np.int32)

# Sanity output: show only a short preview of the labels. Printing the whole
# list would flood the cell output with tens of thousands of entries.
print(train_labels[:10])
print(train_tfidf.shape)
print(len(train_labels))
assert train_tfidf.shape[0] == len(train_labels_numeric), "feature rows and labels are misaligned"

# Train the XGBoost model on the tf-idf vectors (seeded so re-runs are reproducible)
xgb_classifier = xgb.XGBClassifier(random_state=seed)
xgb_classifier.fit(train_tfidf, train_labels_numeric)

# Evaluate the XGBoost model on the test set, reporting class names instead of 0/1
xgboost_predictions = xgb_classifier.predict(test_tfidf)
print(classification_report(
    test_labels_numeric,
    xgboost_predictions,
    labels=sorted(label_map.values()),
    target_names=sorted(label_map, key=label_map.get),
))






['generated', 'generated', 'generated', 'generated', 'human', 'generated', 'human', 'human', 'generated', 'human', 'generated', 'generated', 'generated', 'human', 'human', 'human', 'generated', 'human', 'generated', 'human', 'generated', 'generated', 'generated', 'human', 'generated', 'human', 'human', 'human', 'human', 'generated', 'generated', 'generated', 'generated', 'generated', 'human', 'human', 'generated', 'human', 'human', 'generated', 'human', 'human', 'generated', 'human', 'human', 'human', 'generated', 'human', 'human', 'generated', 'human', 'generated', 'generated', 'generated', 'human', 'generated', 'generated', 'human', 'generated', 'generated', 'human', 'generated', 'generated', 'generated', 'human', 'generated', 'generated', 'generated', 'human', 'generated', 'generated', 'generated', 'human', 'generated', 'generated', 'generated', 'human', 'human', 'generated', 'generated', 'generated', 'human', 'generated', 'generated', 'human', 'human', 'human', 'generated', 'human'