In [19]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Sample data (you should replace this with your own labeled dataset)
texts = [
    'This is an original sentence.',
    'Plagiarism is a serious academic offense.',
    'Innovation drives progress in society.',
    'Copying ideas without attribution is unethical.',
    'Academic integrity is essential for learning.',
    'Originality is a key aspect of creative writing.',
    'Cheating undermines the value of education.',
    'Attribution is crucial in scholarly writing.',
    'Ethical behavior is important in all aspects of life.',
    'The consequences of plagiarism can be severe.',
    'Education is about acquiring knowledge, not shortcuts.',
    'Respecting intellectual property fosters innovation.',
    'Students should take pride in their independent work.',
    'Copying and pasting is not a substitute for critical thinking.',
    'In academic writing, honesty is the best policy.',
    'The joy of learning comes from the pursuit of knowledge.',
    'Plagiarism is a stain on one\'s academic record.',
    'Academic success is built on a foundation of hard work and integrity.',
    'Cheating may offer short-term gains but leads to long-term consequences.',
    'Respecting the ideas of others fosters a culture of collaboration.',
    'Original thinking is a skill that can be developed with practice.',
    'Plagiarism is a breach of academic ethics.',
    'Innovation requires a willingness to explore new ideas.',
    'Copying without understanding is a missed opportunity for learning.',
    'Academic success is about more than just grades.',
    'The value of education extends beyond the classroom.',
    'Creativity and originality go hand in hand.',
    'Avoiding plagiarism is important in academia.',
    'Authors should give credit for borrowed ideas.',
    'This is a unique sentence.',
    'Innovation leads to novel ideas.',
    'A good education is important for success.',
    'Cheating on exams is a violation of academic integrity.',
    'The consequences of plagiarism extend beyond academic life.',
    'Proper citation is a skill that every student should develop.',
    'Creative thinking is essential for problem-solving.',
    'Plagiarism detection tools help maintain academic integrity.',
    'Original research contributes to the advancement of knowledge.',
    'Education is about learning, not copying.',
    'In the digital age, plagiarism is easier to detect.',
    'Developing original ideas is a mark of intellectual maturity.',
    'Citing sources accurately shows respect for others\' work.',
    'Students should be educated about the importance of academic integrity.',
    'Plagiarism is a betrayal of trust in academic settings.',
    'Authentic learning involves independent thinking.',
    'The value of education is diminished by plagiarism.',
    'Copying and pasting is not a substitute for critical thinking.',
    'Ethical behavior in academia is essential for a healthy learning environment.',
]

labels = [0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
sequences = tokenizer.texts_to_sequences(texts)
max_sequence_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)
labels = np.array(labels)

# Adjust split_index to ensure an even split of samples
split_index = len(padded_sequences) - len(padded_sequences) // 5  # 80-20 split

x_train, x_test = padded_sequences[:split_index], padded_sequences[split_index:]
y_train, y_test = labels[:split_index], labels[split_index:]

embedding_dim = 100
lstm_units = 64

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(units=lstm_units))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
epochs = 50
batch_size = 32

model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test Loss: {loss:.4f}")

# Make predictions
new_texts = [
    'This was a plagiarized sentence.'
]

new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length)
predictions = model.predict(new_padded_sequences)

for i, text in enumerate(new_texts):
    sentiment = "the text is not plagiarized " if predictions[i] >= 0.5 else "the text is plagiarized"
    print(f"'{text}' -> Predicted Sentiment: {sentiment}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 0.2412
'This was a plagiarized sentence.' -> Predicted Sentiment: the text is plagiarized
