In [1]:
import os
import xml.etree.ElementTree as ET
import numpy as np
import random
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE  # Import SMOTE for oversampling

# Define paths
annotation_dir = "./InAra-Corpus/plagiarism-annotation"
text_dir = "./InAra-Corpus/suspicious-documents"
glove_path = "glove.6B.100d.txt"

# Load GloVe embeddings
embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char if char.isalnum() or char.isspace() else ' ' for char in text])  # Remove special characters
    text = ' '.join(text.split())  # Remove extra spaces
    return text

# Create a list to store data
data = []

# Iterate through annotation files
for filename in tqdm(os.listdir(annotation_dir)):
    if filename.endswith(".xml"):
        filepath = os.path.join(annotation_dir, filename)
        tree = ET.parse(filepath)
        root = tree.getroot()
        
        # Extract document reference
        doc_id = root.attrib['reference']
        
        # Extract plagiarism information (if present)
        plagiarism_segments = []
        for feature in root.findall('feature'):
            if feature.attrib['name'] == 'plagiarism':
                start = int(feature.attrib['this_offset'])
                end = start + int(feature.attrib['this_length'])
                plagiarism_segments.append((start, end))
        
        # Read the corresponding text file
        text_path = os.path.join(text_dir, doc_id)
        with open(text_path, 'r', encoding='cp1256') as f:
            text = f.read()
            
        # Preprocess the text
        preprocessed_text = preprocess_text(text)
        
        # Create a data entry
        data.append({
            'text': preprocessed_text,
            'plagiarism_segments': plagiarism_segments,
            'doc_id': doc_id
        })

# Define parameters
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100
NUM_FOLDS = 5
DROPOUT_RATE = 0.2
EPOCHS = 20
PATIENCE = 3

# Create tokenizer and convert text to sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts([d['text'] for d in data])
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences([d['text'] for d in data])
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Create embedding matrix
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Create labels
labels = np.array([1 if d['plagiarism_segments'] else 0 for d in data])

# Apply SMOTE to oversample the minority class
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(padded_sequences, labels)

# Stratified K-Fold Cross-Validation
kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
fold_accuracies = []

for fold, (train_idx, test_idx) in enumerate(kfold.split(X_resampled, y_resampled), 1):
    print(f"Training Fold {fold}/{NUM_FOLDS}")
    X_train, X_test = X_resampled[train_idx], X_resampled[test_idx]
    y_train, y_test = y_resampled[train_idx], y_resampled[test_idx]
    
    # Build CNN Model
    model = Sequential([
        Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False),
        Conv1D(128, 3, activation='relu'),
        MaxPooling1D(2),
        Dropout(DROPOUT_RATE),
        Conv1D(64, 3, activation='relu'),
        MaxPooling1D(2),
        Dropout(DROPOUT_RATE),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(DROPOUT_RATE),
        Dense(1, activation='sigmoid')
    ])
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=PATIENCE, restore_best_weights=True)
    
    # Train the model
    history = model.fit(
        X_train, y_train,
        epochs=EPOCHS,
        batch_size=32,
        validation_data=(X_test, y_test),
        callbacks=[early_stopping],
        verbose=1
    )
    
    # Evaluate the model
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    fold_accuracies.append(accuracy)
    print(f"Fold {fold} Accuracy: {accuracy:.4f}")
    
    # Additional metrics
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Average accuracy across folds
average_accuracy = np.mean(fold_accuracies)
print(f"Average Accuracy across {NUM_FOLDS} folds: {average_accuracy:.4f}")

# Train the final model on all data
final_model = Sequential([
    Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(2),
    Dropout(DROPOUT_RATE),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(2),
    Dropout(DROPOUT_RATE),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(1, activation='sigmoid')
])

final_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
final_model.fit(X_resampled, y_resampled, epochs=EPOCHS, batch_size=32, verbose=1)

# Final evaluation
final_loss, final_accuracy = final_model.evaluate(X_resampled, y_resampled, verbose=0)
print(f"Final Model Accuracy: {final_accuracy:.4f}")

2025-01-02 12:35:02.254933: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|██████████| 1024/1024 [00:38<00:00, 26.40it/s]


Training Fold 1/5
Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.4805 - loss: 0.6927 - val_accuracy: 0.5000 - val_loss: 0.6987
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.5035 - loss: 0.6834 - val_accuracy: 0.5061 - val_loss: 0.7123
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.4829 - loss: 0.6805 - val_accuracy: 0.5061 - val_loss: 0.7173
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.5168 - loss: 0.6651 - val_accuracy: 0.5061 - val_loss: 0.7340
Fold 1 Accuracy: 0.5000
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
              precision    recall  f1-score   support

           0       0.50      0.99      0.66       165
           1       0.50      0.01      0.02       165

    accuracy                           0.50       330
   macro avg       0.5

In [2]:
final_model.save("model.keras")

import pickle
with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle)