In [None]:
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.corpus import reuters
from nltk.corpus import brown
from nltk.corpus import gutenberg
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pickle
import joblib
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, f1_score, recall_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from utils import pad_sequences
from keraslayers import Activation, Dense, Embedding, LSTM, SpatialDropout1D, Dropout, Flatten, GRU, Conv1D, MaxPooling1D, Bidirectional
from wordcloud import WordCloud,ImageColorGenerator
from PIL import Image
import urllib
import requests
import re
import ktrain
from ktrain import text
sns.set()
%matplotlib inline
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('gutenberg')
nltk.download('brown')
nltk.download("reuters")
nltk.download('words')

In [2]:
from keras.utils import to_categorical

In [4]:
json_file = 'subtaskB_train.jsonl'
tdf = pd.read_json(json_file, lines=True)


In [5]:
tdf.head(10)

Unnamed: 0,text,model,source,label,id
0,Forza Motorsport is a popular racing game that...,chatGPT,wikihow,1,0
1,Buying Virtual Console games for your Nintendo...,chatGPT,wikihow,1,1
2,Windows NT 4.0 was a popular operating system ...,chatGPT,wikihow,1,2
3,How to Make Perfume\n\nPerfume is a great way ...,chatGPT,wikihow,1,3
4,How to Convert Song Lyrics to a Song'\n\nConve...,chatGPT,wikihow,1,4
5,How to Fix a Broken Window in a Wooden Frame\n...,chatGPT,wikihow,1,5
6,Publishing your WordPress theme on Themeforest...,chatGPT,wikihow,1,6
7,Building a Railroad Tie Retaining Wall can be ...,chatGPT,wikihow,1,7
8,Teaching your dog new tricks is a great way to...,chatGPT,wikihow,1,8
9,Remote Desktop is a useful tool that allows yo...,chatGPT,wikihow,1,9


In [6]:
df = tdf

In [7]:
df['text']=df['text'].fillna("")
df.isna().sum()

text      0
model     0
source    0
label     0
id        0
dtype: int64

In [8]:
x_train = df['text']
y_train = df['model']



In [9]:
vocabulary_size = 15000
max_text_len = 1000


In [10]:
def preprocess_text(text):

    text = re.sub('[^a-zA-Z]', ' ', text)

    words = text.lower().split()

    cleaned_text = ' '.join(words)
    return cleaned_text

df['cleaned_text'] = df['text'].apply(preprocess_text)

In [11]:
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(df['text'].values)

le = len(tokenizer.word_index) + 1
print(le)
sequences = tokenizer.texts_to_sequences(df['text'].values)
X_DeepLearning = pad_sequences(sequences, maxlen=max_text_len)

264198


In [12]:
df.loc[df['model'] == 'human' , 'LABEL'] = 0
df.loc[df['model'] == 'chatGPT', 'LABEL'] = 1
df.loc[df['model'] == 'cohere' , 'LABEL'] = 2
df.loc[df['model'] == 'davinci', 'LABEL'] = 3
df.loc[df['model'] == 'bloomz', 'LABEL'] = 4
df.loc[df['model'] == 'dolly', 'LABEL'] = 5

labels = to_categorical(df['LABEL'], num_classes=6)
XX_train, XX_test, y_train, y_test = train_test_split(X_DeepLearning , labels, test_size=0.25, random_state=42)
print((XX_train.shape, y_train.shape, XX_test.shape, y_test.shape))

((53270, 1000), (53270, 6), (17757, 1000), (17757, 6))


In [15]:
from keras.layers import GlobalMaxPooling1D

# Assuming X_DeepLearning has shape (num_samples, max_text_len) where max_text_len is the length of each text sequence
max_text_len = X_DeepLearning.shape[1]

epochs = 7
emb_dim = 256
batch_size = 128

model_cl = Sequential()
model_cl.add(Embedding(vocabulary_size, emb_dim, input_length=max_text_len))
model_cl.add(SpatialDropout1D(0.8))
model_cl.add(Conv1D(filters=64, kernel_size=6, padding='same', activation='relu'))
model_cl.add(MaxPooling1D(pool_size=2))
model_cl.add(Conv1D(filters=32, kernel_size=6, activation='relu'))
model_cl.add(MaxPooling1D(pool_size=2))
model_cl.add(Bidirectional(LSTM(100, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)))
model_cl.add(Dropout(0.5))
model_cl.add(Bidirectional(LSTM(400, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)))
model_cl.add(Dropout(0.5))
model_cl.add(GlobalMaxPooling1D())  # Global Max Pooling instead of Flatten
model_cl.add(Dense(64, activation='relu'))
model_cl.add(Dropout(0.5))
model_cl.add(Dense(6, activation='softmax'))  # Output layer with softmax for multi-class classification

model_cl.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(model_cl.summary())


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1000, 256)         3840000   
                                                                 
 spatial_dropout1d_1 (Spati  (None, 1000, 256)         0         
 alDropout1D)                                                    
                                                                 
 conv1d_2 (Conv1D)           (None, 1000, 64)          98368     
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, 500, 64)           0         
 g1D)                                                            
                                                                 
 conv1d_3 (Conv1D)           (None, 495, 32)           12320     
                                                                 
 max_pooling1d_3 (MaxPoolin  (None, 247, 32)          

In [16]:
checkpoint_callback = ModelCheckpoint(filepath="cnn+lastm-best_model.h5", save_best_only=True, monitor="val_acc", mode="max", verbose=1)

early_stopping_callback = EarlyStopping(monitor="val_acc", mode="max", patience=10, verbose=1, restore_best_weights=True)

reduce_lr_callback = ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=5, verbose=1, mode="min", min_delta=0.0001, cooldown=0, min_lr=0)

callbacks=[checkpoint_callback, early_stopping_callback, reduce_lr_callback]


In [17]:
history_cl = model_cl.fit(XX_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.2, callbacks=callbacks)

Epoch 1/7
Epoch 1: val_acc improved from -inf to 0.40436, saving model to cnn+lastm-best_model.h5


  saving_api.save_model(


Epoch 2/7
Epoch 2: val_acc improved from 0.40436 to 0.47184, saving model to cnn+lastm-best_model.h5
Epoch 3/7
Epoch 3: val_acc improved from 0.47184 to 0.50413, saving model to cnn+lastm-best_model.h5
Epoch 4/7
Epoch 4: val_acc improved from 0.50413 to 0.53698, saving model to cnn+lastm-best_model.h5
Epoch 5/7
Epoch 5: val_acc improved from 0.53698 to 0.60193, saving model to cnn+lastm-best_model.h5
Epoch 6/7
Epoch 6: val_acc improved from 0.60193 to 0.63319, saving model to cnn+lastm-best_model.h5
Epoch 7/7
Epoch 7: val_acc improved from 0.63319 to 0.65543, saving model to cnn+lastm-best_model.h5


In [19]:
history_cl = model_cl.fit(XX_train, y_train, epochs=1, batch_size=batch_size,validation_split=0.2, callbacks=callbacks)

Epoch 1: val_acc did not improve from 0.65543


In [20]:
results_4 = model_cl.evaluate(XX_test, y_test, verbose=False)
print(f'Test results - Loss: {results_4[0]} - Accuracy: {100*results_4[1]}%')

Test results - Loss: 0.9011300206184387 - Accuracy: 65.43335318565369%


In [None]:
import joblib

# Save the model to a file
model_file_path = 'NeuralNet(2).pkl'
joblib.dump(model_cl, model_file_path)

In [30]:
# Define a function to preprocess user input
def preprocess_user_input(user_text):
    # Clean and preprocess the text
    cleaned_text = preprocess_text(user_text)  # Assuming preprocess_text is defined as in your previous code

    # Tokenize and convert to sequence
    sequence = tokenizer.texts_to_sequences([cleaned_text])

    # Pad sequence to match model's input shape
    padded_sequence = pad_sequences(sequence, maxlen=max_text_len)

    return padded_sequence

index_to_label = {
    0: 'human',
    1: 'chatGPT',
    2: 'cohere',
    3: 'davinci',
    4: 'bloomz',
    5: 'dolly'
}

# Sample user input text
# user_text = """
# Martial arts, encompassing a diverse range of disciplines, are a fusion of physical prowess, mental discipline, and cultural heritage. They serve as a profound journey of self-discovery and personal growth, cultivating not only combat skills but also virtues like discipline, respect, and perseverance. From the fluid grace of Brazilian Jiu-Jitsu to the explosive power of Muay Thai and the meditative stillness of Tai Chi, martial arts offer a path to physical fitness, emotional balance, and spiritual harmony. Beyond the dojo, they inspire a deep respect for tradition and a global community of practitioners dedicated to the pursuit of excellence and self-improvement.
# """
user_text = """Blockchain technology, often touted as a revolutionary force, has permeated various industries, promising unparalleled security, transparency, and efficiency. At its core, blockchain is a decentralized ledger system, where transactions are recorded across a network of computers, or nodes, creating an immutable chain of data blocks.

One of blockchain's most notable applications is in cryptocurrencies like Bitcoin and Ethereum. These digital currencies leverage blockchain to enable peer-to-peer transactions without the need for intermediaries like banks. Blockchain's decentralized nature ensures that transactions are secure, transparent, and resistant to tampering, providing users with a level of trust previously unseen in traditional financial systems.

Beyond cryptocurrencies, blockchain finds utility in a myriad of sectors. In supply chain management, it offers a transparent and traceable record of goods from production to delivery, combating counterfeit products and ensuring authenticity. In healthcare, blockchain secures patient data, streamlines record-keeping, and facilitates interoperability between disparate systems, fostering more efficient and patient-centric care.

Moreover, blockchain has the potential to revolutionize voting systems, intellectual property rights, and even the energy sector through the concept of decentralized energy grids. Smart contracts, self-executing contracts with the terms of the agreement directly written into code, automate and enforce agreements without the need for intermediaries, reducing costs and mitigating disputes.

However, blockchain technology is not without its challenges. Scalability, energy consumption, and regulatory concerns are among the hurdles that must be addressed for widespread adoption. Additionally, while blockchain ensures data integrity, it does not guarantee data accuracy, as the information entered into the ledger is only as reliable as its source.

Despite these challenges, the potential of blockchain to transform industries and empower individuals is undeniable. As research and development continue to advance, and as society grapples with the complexities of a digital age, blockchain stands poised to redefine the way we transact, collaborate, and trust in the modern world."""
# Preprocess user input
user_input_sequence = preprocess_user_input(user_text)

# Make prediction using the trained model
predicted_probabilities = model_cl.predict(user_input_sequence)

# Determine the predicted model based on the highest probability
predicted_model_index = np.argmax(predicted_probabilities)
predicted_model_label = index_to_label[predicted_model_index]

# Print the predicted model label
print("Predicted Model:", predicted_model_label)

Predicted Model: chatGPT
