# Import Libraries:

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
%pip install -q datasets tqdm pandas numpy scikit-learn transformers tokenizers catboost

In [3]:
import sys
from datasets import Dataset
from tqdm.auto import tqdm

import pandas as pd
import numpy as np
from sklearn.metrics import classification_report


from transformers import PreTrainedTokenizerFast
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)


from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [5]:
# Load the HC3 question/answer table from a CSV in the working directory.
# The preview below shows two leftover "Unnamed" index columns next to
# id, question, human_answer and chatgpt_answers.
hc3_dataframe=pd.read_csv('HC3_Data.csv')

In [7]:
hc3_dataframe.head()

Unnamed: 0.1,Unnamed: 0,id,question,human_answer,chatgpt_answers
0,0,0,"Why is every book I hear about a "" NY Times # ...","['Basically there are many categories of "" Bes...",['There are many different best seller lists t...
1,1,1,"If salt is so bad for cars , why do we use it ...",['salt is good for not dying in car crashes an...,"[""Salt is used on roads to help melt ice and s..."
2,2,2,Why do we still have SD TV channels when HD lo...,"[""The way it works is that old TV stations got...","[""There are a few reasons why we still have SD..."
3,3,3,Why has nobody assassinated Kim Jong - un He i...,"[""You ca n't just go around assassinating the ...",['It is generally not acceptable or ethical to...
4,4,4,How was airplane technology able to advance so...,['Wanting to kill the shit out of Germans driv...,['After the Wright Brothers made the first pow...


In [8]:
hc3_dataframe.shape

(24322, 5)

In [9]:
# Parallel accumulators filled by the sampling loop in the next cell:
# one text per entry and its matching class label.
hc3_data_list, hc3_label = [], []

In [10]:
# Draw (text, label) samples with replacement from the HC3 table.
# Label 0 = human answer, label 1 = ChatGPT answer.
n_samples = 100000
# Derive the row count from the frame instead of hard-coding it, so the loop
# keeps working (and keeps covering every row) if the CSV changes.
n_rows = len(hc3_dataframe)
# Seeded generator so that Restart & Run All reproduces the same sample.
rng = np.random.default_rng(42)

# Start from empty accumulators so re-running this cell does not append a
# second batch of samples on top of the first one.
hc3_data_list.clear()
hc3_label.clear()

for _ in tqdm(range(n_samples)):
    row = rng.integers(0, n_rows)
    # Fair coin flip between the two answer columns.
    if rng.random() < 0.5:
        hc3_data_list.append(hc3_dataframe['human_answer'].iloc[row])
        hc3_label.append(0)
    else:
        hc3_data_list.append(hc3_dataframe['chatgpt_answers'].iloc[row])
        hc3_label.append(1)


  0%|          | 0/100000 [00:00<?, ?it/s]

In [11]:
suff_df=pd.DataFrame()

In [12]:
# Assemble the sampled texts and their labels (0 = human answer,
# 1 = ChatGPT answer) into one frame.
suff_df['text']=hc3_data_list
suff_df['label']=hc3_label
# Preview the first rows.
suff_df.head()

Unnamed: 0,text,label
0,"[""Well , we carry and die of rabies without go...",0
1,['3D shapes are usually described in 3D files ...,1
2,['Its all about bluff . Tracy Emin is notoriou...,0
3,"[""There are a few reasons why different car ma...",1
4,['You will need to see a tax expert. Your edit...,0


In [13]:
suff_df.shape

(100000, 2)

In [14]:
suff_df.label.values.sum()

49844

In [15]:
# Rows were sampled with replacement, so identical (text, label) pairs repeat;
# keep a single copy of each.
new_df=suff_df.drop_duplicates()

In [16]:
new_df.head()

Unnamed: 0,text,label
0,"[""Well , we carry and die of rabies without go...",0
1,['3D shapes are usually described in 3D files ...,1
2,['Its all about bluff . Tracy Emin is notoriou...,0
3,"[""There are a few reasons why different car ma...",1
4,['You will need to see a tax expert. Your edit...,0


In [17]:
new_df.reset_index(drop=True,inplace=True)

In [18]:
new_df.label.values.sum()

20283

# Now we have our shuffled dataframe

In [19]:
new_df.head()

Unnamed: 0,text,label
0,"[""Well , we carry and die of rabies without go...",0
1,['3D shapes are usually described in 3D files ...,1
2,['Its all about bluff . Tracy Emin is notoriou...,0
3,"[""There are a few reasons why different car ma...",1
4,['You will need to see a tax expert. Your edit...,0


In [20]:
# new_df.to_csv("new_suff_df.csv")

In [21]:
new_df.shape

(40058, 2)

In [22]:
from sklearn.model_selection import train_test_split

# 80/20 split. A fixed random_state makes the partition, and therefore every
# downstream metric, reproducible across kernel restarts.
train, test = train_test_split(new_df, test_size=0.2, random_state=42)
# Rebind to fresh frames with a 0..n-1 index instead of mutating the split
# results in place; later cells rely on this default index.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [23]:
train.head()

Unnamed: 0,text,label
0,['It is possible that the NSA\'s malware has n...,1
1,"[""The problem is the fact that the Placebo eff...",0
2,"[""Visitations to Chernobyl are highly regulate...",0
3,"[""It depends on what you mean by fire . The he...",0
4,['Desmanthus illinoensis (commonly known as Il...,0


In [24]:
train.shape

(32046, 2)

In [25]:
test.head()

Unnamed: 0,text,label
0,"['Hello, With available information,I could th...",0
1,['Before a minor surgery like getting your wis...,1
2,"[""Actually , you can stand close to it . It 's...",0
3,"[""CAPTCHAs are a way to tell if someone is a r...",1
4,"['From what i ve learned , Relativisim is the ...",0


In [26]:
test.shape

(8012, 2)

# Train your own tokenizer

In [27]:
# Whether the tokenizer's normalizer should lowercase text
# (read when the normalizer sequence is built in the next cell).
lower_case= False
# Target vocabulary size handed to the BPE trainer.
vocab_Size = 30522

In [28]:
# Byte-level BPE tokenizer; "[UNK]" is the token used for unknown input.
train_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
# The conditional must be parenthesized. Without the parentheses it applied to
# the whole `[NFC] + [Lowercase]` expression, so lower_case=False produced an
# empty normalizer sequence and silently dropped NFC normalization.
train_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if lower_case else [])
)
train_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=vocab_Size, special_tokens=special_tokens)
# NOTE(review): the tokenizer corpus is built from the held-out `test` split
# (text only, no labels). Confirm this is intended; if the test split is meant
# to stay unseen until evaluation, build the corpus from `train` instead.
dataset = Dataset.from_pandas(test[['text']])

In [29]:
def train_corp_iter(data=None, batch_size=1000):
    """Yield batches of non-null texts for tokenizer training.

    Args:
        data: Sized, sliceable collection for which ``data[a:b]["text"]``
            returns the list of texts in that slice. Defaults to the
            notebook-level ``dataset`` when omitted.
        batch_size: Maximum number of rows read per batch (default 1000).

    Yields:
        list: The texts of one slice with ``None`` entries removed. Slices
        that contain no usable text are skipped entirely.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when
            iteration starts, because this is a generator).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if data is None:
        # Fall back to the corpus prepared earlier in the notebook.
        data = dataset
    for start in range(0, len(data), batch_size):
        # Drop missing texts so the tokenizer trainer only sees real strings.
        batch = [
            text
            for text in data[start : start + batch_size]["text"]
            if text is not None
        ]
        if batch:
            yield batch

In [30]:
# Train the tokenizer on the batched corpus using the configured trainer.
train_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface,
# declaring which special token fills each role.
special_token_kwargs = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=train_tokenizer,
    **special_token_kwargs,
)

# Tokenize the dataset

In [31]:
# Tokenize every document of each split with the trained tokenizer
# (test first, then train); tqdm shows one progress bar per split.
tokenized_texts_test = [
    tokenizer.tokenize(document) for document in tqdm(test['text'].tolist())
]

tokenized_texts_train = [
    tokenizer.tokenize(document) for document in tqdm(train['text'].tolist())
]

  0%|          | 0/8012 [00:00<?, ?it/s]

  0%|          | 0/32046 [00:00<?, ?it/s]

In [32]:
tokenized_texts_train[0]

["Ġ['",
 'It',
 'Ġis',
 'Ġpossible',
 'Ġthat',
 'Ġthe',
 'ĠNSA',
 "\\'",
 's',
 'Ġmalware',
 'Ġhas',
 'Ġnot',
 'Ġbeen',
 'Ġdetected',
 'Ġon',
 'Ġmillions',
 'Ġof',
 'Ġcomputers',
 'Ġand',
 'Ġservers',
 'Ġbecause',
 'Ġthe',
 'Ġagency',
 'Ġuses',
 'Ġadvanced',
 'Ġtechniques',
 'Ġto',
 'Ġhide',
 'Ġits',
 'Ġpresence',
 'Ġand',
 'Ġactivities',
 '.',
 'ĠThese',
 'Ġtechniques',
 'Ġcan',
 'Ġinclude',
 'Ġusing',
 'Ġencryption',
 'Ġto',
 'Ġsecure',
 'Ġthe',
 'Ġcommunications',
 'Ġbetween',
 'Ġthe',
 'Ġmalware',
 'Ġand',
 'Ġthe',
 'Ġagency',
 "\\'",
 's',
 'Ġservers',
 ',',
 'Ġdisgu',
 'ising',
 'Ġthe',
 'Ġmalware',
 'Ġas',
 'Ġlegitimate',
 'Ġsoftware',
 ',',
 'Ġand',
 'Ġusing',
 'Ġservers',
 'Ġand',
 'Ġinfrastructure',
 'Ġthat',
 'Ġare',
 'Ġnot',
 'Ġeasily',
 'Ġtrace',
 'able',
 'Ġback',
 'Ġto',
 'Ġthe',
 'Ġagency',
 '.',
 'ĠAs',
 'Ġfor',
 'Ġwhether',
 'Ġit',
 'Ġis',
 'Ġpossible',
 'Ġfor',
 'Ġanyone',
 'Ġto',
 'Ġbe',
 'Ġ"',
 'N',
 'SA',
 '-',
 'free',
 ',"',
 'Ġit',
 'Ġis',
 'Ġimportant',
 'Ġto'

In [33]:
len(tokenized_texts_test[0])

61

In [34]:
def dummy(text):
    """Identity function: return the argument unchanged.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    so the already-tokenized documents are used as given.
    """
    return text

In [35]:
# TF-IDF over word-level 3- to 5-grams of pre-tokenized documents; `dummy`
# is supplied as tokenizer and preprocessor so the token lists pass through.
tfidf_settings = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# First pass: fit on the first 640 tokenized test documents. The vocabulary
# learned here is what the next cell reuses.
vectorizer.fit(tokenized_texts_test[:640])

In [36]:
# Keep the n-gram vocabulary from the first-pass vectorizer and build a new
# vectorizer restricted to exactly those features.
vocab = vectorizer.vocabulary_
vectorizer = TfidfVectorizer(
    vocabulary=vocab,
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)


In [37]:
# The vectorizer was created with a fixed `vocabulary`, so the feature set is
# already decided; fitting here computes the TF-IDF statistics from the
# training tokens and returns the training matrix.
tf_train = vectorizer.fit_transform(tokenized_texts_train)
# Map the test tokens onto the same features with the fitted statistics.
tf_test = vectorizer.transform(tokenized_texts_test)


In [38]:
tf_train[3]

<1x544385 sparse matrix of type '<class 'numpy.float64'>'
	with 48 stored elements in Compressed Sparse Row format>

In [39]:
tf_test

<8012x544385 sparse matrix of type '<class 'numpy.float64'>'
	with 1176207 stored elements in Compressed Sparse Row format>

# Making a Voting Classifier Model:

In [40]:
train_lable=np.array(train['label'])

In [41]:
train_lable

array([1, 0, 0, ..., 0, 0, 0])

In [42]:
# Linear classifier trained with SGD; used as the 'sgd' member of the voting
# ensemble defined later in the notebook.
# NOTE(review): presumably loss="modified_huber" was chosen so the model can
# provide probabilities for soft voting — confirm. No random_state is set.
sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber")

In [43]:
# Two multinomial naive Bayes models that differ only in the alpha value.
clf1 = MultinomialNB(alpha=0.02)
clf2 = MultinomialNB(alpha=0.01)

In [44]:
# CatBoost member of the ensemble. Hyperparameters are gathered in one mapping
# so they can be read (and tuned) at a glance.
catboost_params = {
    "iterations": 2000,
    "verbose": 0,
    "l2_leaf_reg": 6.6,
    "learning_rate": 0.0055,
    "subsample": 0.4,
    "allow_const_label": True,
    "loss_function": "CrossEntropy",
}
cat = CatBoostClassifier(**catboost_params)

In [45]:
# (name, estimator) pairs that make up the ensemble.
voting_members = [
    ('mnb1', clf1),
    ('mnb2', clf2),
    ('sgd', sgd_model),
    ('cat', cat),
]
# Soft-voting ensemble over the four members, run in a single job.
ensemble = VotingClassifier(estimators=voting_members, voting='soft', n_jobs=1)

In [46]:
ensemble.fit(tf_train, train_lable)

In [47]:
from sklearn.metrics import accuracy_score

# Report accuracy on the TRAINING data for each fitted member model.
# The estimators in `named_estimators_` were already trained by
# `ensemble.fit`, so they are only scored here. Refitting them (as this cell
# used to do) repeated the most expensive step of the notebook and replaced
# the models inside the fitted ensemble right before it was scored and saved.
# NOTE(review): the members are scored directly against `train_lable`; this
# assumes the labels are exactly 0 and 1, as in this notebook — confirm if
# the label set ever changes.
for name, model in ensemble.named_estimators_.items():
    predictions = model.predict(tf_train)
    accuracy = accuracy_score(train_lable, predictions)
    print(f"Accuracy of {name}: {accuracy:.4f}")

# Training accuracy of the combined soft-voting prediction.
ensemble_accuracy = accuracy_score(train_lable, ensemble.predict(tf_train))
print(f"\nAccuracy of the ensemble model: {ensemble_accuracy:.4f}")

Accuracy of mnb1: 0.9714
Accuracy of mnb2: 0.9729
Accuracy of sgd: 0.9991
Accuracy of cat: 0.9747

Accuracy of the ensemble model: 0.9884


In [48]:
import pickle

# Persist the fitted ensemble. The context manager guarantees the file is
# flushed and closed even if pickling raises.
# NOTE(review): only the classifier is saved; `tokenizer` and `vectorizer`
# are also needed to turn raw text into features at prediction time.
with open('HC3ClassfierModel.pkl', 'wb') as model_file:
    pickle.dump(ensemble, model_file)

# Predicting the Output for the Test Data

In [49]:
final_preds = ensemble.predict_proba(tf_test)

In [50]:
final_preds

array([[0.94685164, 0.05314836],
       [0.25907311, 0.74092689],
       [0.94749086, 0.05250914],
       ...,
       [0.99637806, 0.00362194],
       [0.85373869, 0.14626131],
       [0.99283522, 0.00716478]])

In [51]:
final_preds.shape

(8012, 2)

In [52]:
# Predicted class for each row = index of the column with the larger
# probability.
y_predic = np.argmax(final_preds, axis=1)
# Show the first 20 predictions.
y_predic[:20]

array([0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1])

In [53]:
# Per-class probabilities as named columns:
# probability column 0 -> human_generated, column 1 -> AI_generated.
sub = pd.DataFrame({
    'human_generated': final_preds[:, 0],
    'AI_generated': final_preds[:, 1],
})



In [54]:
# Put the true labels and the arg-max predictions next to the probabilities.
# Assigning a Series aligns on index labels; `test` had its index reset right
# after the split, so its rows line up with `sub`'s default index.
sub['test_label']=test['label']
sub['predicted_label']=y_predic

In [55]:
sub.head()

Unnamed: 0,human_generated,AI_generated,test_label,predicted_label
0,0.946852,0.053148,0,0
1,0.259073,0.740927,1,1
2,0.947491,0.052509,0,0
3,0.361503,0.638497,1,1
4,0.934555,0.065445,0,0


# Check the accuracy

In [56]:
print(classification_report(test['label'],y_predic))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97      3980
           1       0.95      1.00      0.97      4032

    accuracy                           0.97      8012
   macro avg       0.97      0.97      0.97      8012
weighted avg       0.97      0.97      0.97      8012



# Single Instance Prediction:

In [57]:
text='Cows also play a crucial role in the environment. They are ruminant animals with a complex digestive system that allows them to break down cellulose in plant materials. This digestive process produces methane, a potent greenhouse gas. While the environmental impact of methane emissions is a concern, cows can also contribute positively to the environment. Their grazing habits can help maintain grasslands and prevent the spread of invasive plant species.'

In [58]:
tokenized_text=[]

In [59]:
tokenized_text.append(tokenizer.tokenize(text))

In [60]:
tf_tex=vectorizer.transform(tokenized_text)

In [61]:
predict_prob=ensemble.predict_proba(tf_tex)

In [62]:
predict_prob

array([[0.27171108, 0.72828892]])