In [4]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tqdm import tqdm
from IPython import get_ipython
from IPython.display import display

In [3]:
def load_txt_data(filepath):
    """Load data from .txt file with format: text1[TAB]text2[TAB]label"""
    texts1 = []
    texts2 = []
    labels = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Loading data"):
            parts = line.strip().split('\t')
            if len(parts) == 3:
                try:
                    texts1.append(parts[0])
                    texts2.append(parts[1])
                    labels.append(int(parts[2]))
                except ValueError:
                    continue  # Skip lines with invalid labels

    df = pd.DataFrame({
        'text1': texts1,
        'text2': texts2,
        'label': labels
    })

    print("Initial data shape:", df.shape)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    print("After cleaning data shape:", df.shape)

    return df


In [5]:
def preprocess_text(text):
    """Basic text preprocessing"""
    if isinstance(text, str):
        text = text.translate(str.maketrans("", "", string.punctuation))
        text = text.lower()
    return text

def prepare_data(df):
    """Tokenize and prepare sequences for model"""
    # Preprocess text
    df['text1'] = df['text1'].apply(preprocess_text)
    df['text2'] = df['text2'].apply(preprocess_text)

    # Tokenize texts
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(pd.concat([df['text1'], df['text2']]))

    # Convert texts to sequences
    seq1 = tokenizer.texts_to_sequences(df['text1'])
    seq2 = tokenizer.texts_to_sequences(df['text2'])

    # Pad sequences
    max_len = 100  # You can adjust this based on your data
    seq1 = pad_sequences(seq1, maxlen=max_len)
    seq2 = pad_sequences(seq2, maxlen=max_len)

    # Combine sequences
    X = np.concatenate([seq1, seq2], axis=1)
    y = df['label'].values

    return train_test_split(X, y, test_size=0.2, random_state=42), tokenizer


In [6]:
def build_model(input_shape):
    """Build LSTM model for text classification"""
    model = Sequential([
        Embedding(input_dim=10000, output_dim=128, input_length=input_shape),
        LSTM(128, return_sequences=True),
        Dropout(0.5),
        LSTM(64),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return model


In [7]:
# # Main Execution
# %%
if __name__ == "__main__":
    # File paths
    txt_file_path = "/content/train_snli.txt"  # Replace with your .txt file path

    # Load and prepare data
    df = load_txt_data(txt_file_path)
    (xtrain, xtest, ytrain, ytest), tokenizer = prepare_data(df)

    # Build model
    model = build_model(xtrain.shape[1])

    # Show model summary
    model.summary()

    # Train model
    history = model.fit(
        xtrain, ytrain,
        batch_size=32,
        epochs=1,
        validation_data=(xtest, ytest),
        callbacks=[EarlyStopping(monitor='val_loss', patience=3)]
    )

    # Evaluate
    loss, accuracy = model.evaluate(xtest, ytest)
    y_pred = (model.predict(xtest) > 0.5).astype(int)

    print("\nEvaluation Metrics:")
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision_score(ytest, y_pred):.4f}")
    print(f"Recall: {recall_score(ytest, y_pred):.4f}")
    print(f"F1 Score: {f1_score(ytest, y_pred):.4f}")


Loading data: 367373it [00:00, 671689.88it/s]


Initial data shape: (367373, 3)
After cleaning data shape: (366919, 3)




[1m9173/9173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4937s[0m 538ms/step - accuracy: 0.7310 - loss: 0.5259 - val_accuracy: 0.7715 - val_loss: 0.4689
[1m2294/2294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m332s[0m 145ms/step - accuracy: 0.7733 - loss: 0.4660
[1m2294/2294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m328s[0m 143ms/step

Evaluation Metrics:
Test Accuracy: 0.7715
Precision: 0.7613
Recall: 0.7893
F1 Score: 0.7751


In [10]:
# Colab-compatible interactive prediction
def predict_plagiarism(model, tokenizer, text1, text2, max_len=100):
    """
    Predicts if text2 is plagiarized from text1.
    """
    # Preprocess texts
    text1_processed = preprocess_text(text1)
    text2_processed = preprocess_text(text2)

    # Tokenize and pad sequences
    # Need to fit tokenizer on new texts if they contain new words,
    # but for a deployed model, you'd use the fitted tokenizer from training.
    # For simplicity here, we assume the tokenizer is already comprehensive.
    seq1 = tokenizer.texts_to_sequences([text1_processed])
    seq2 = tokenizer.texts_to_sequences([text2_processed])

    seq1_padded = pad_sequences(seq1, maxlen=max_len)
    seq2_padded = pad_sequences(seq2, maxlen=max_len)

    # Combine sequences
    X_new = np.concatenate([seq1_padded, seq2_padded], axis=1)

    # Predict
    prediction = model.predict(X_new)
    confidence = prediction[0][0]

    if confidence > 0.5:
        result = "Plagiarized"
    else:
        result = "Original"

    return result, confidence

print("\nPlagiarism Detection System")
print("Type your texts below (enter 'quit' to exit)")

while True:
    try:
        print("\n--- New Comparison ---")
        original_text = input("Enter the original text (or 'quit' to exit): ")
        if original_text.lower() == 'quit':
            break

        suspect_text = input("Enter the text to check for plagiarism: ")
        if suspect_text.lower() == 'quit':
            break

        # Pass the required arguments to the predict_plagiarism function
        result, confidence = predict_plagiarism(model, tokenizer, original_text, suspect_text)

        print(f"\nResult: {result} (Confidence: {confidence:.2%})")
        if result == "Plagiarized":
            print("Warning: This text appears to be plagiarized!")
        else:
            print(" This text appears to be original.")

    except Exception as e:
        print(f"Error: {e}\nPlease try again.")


Plagiarism Detection System
Type your texts below (enter 'quit' to exit)

--- New Comparison ---
Enter the original text (or 'quit' to exit): hi my name is sourav
Enter the text to check for plagiarism: hi sonu
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step

Result: Plagiarized (Confidence: 68.12%)

--- New Comparison ---
Enter the original text (or 'quit' to exit): quit


In [12]:
import pickle

In [14]:
import pickle

# Save the trained model
with open('plagiarism_model.pkl1', 'wb') as file:
    pickle.dump(model, file)

# Save the tokenizer (not a TF-IDF vectorizer as the variable name suggested)
# The tokenizer object was returned from the prepare_data function
with open('tokenizer.pkl1', 'wb') as file:
    pickle.dump(tokenizer, file)

print("Model and tokenizer saved successfully!")

Model and tokenizer saved successfully!


In [17]:
# Load the saved model
with open('/content/plagiarism_model.pkl1', 'rb') as file:
    loaded_model = pickle.load(file)
# Load the saved model
with open('/content/plagiarism_model.pkl1', 'rb') as file:
    loaded_model = pickle.load(file)

# Load the saved vectorizer
with open('/content/tokenizer.pkl1', 'rb') as file:
    loaded_vectorizer = pickle.load(file)

print("Model and vectorizer loaded successfully!")
print("Model and vectorizer loaded successfully!")


Model and vectorizer loaded successfully!
Model and vectorizer loaded successfully!
