In [None]:
# # 1. Upload dataset folder in Colab
# print("Please upload your 'xyzdataset' folder containing .txt files")
# uploaded = files.upload()  # Upload the folder as a zip and extract manually if needed

# If the dataset was uploaded as a zip, extract it with:

!unzip xyzdataset.zip -d xyzdataset


Archive:  xyzdataset.zip
  inflating: xyzdataset/train_snli.txt  


In [None]:
from sklearn.model_selection import train_test_split

# Read the tab-separated dataset (text1 <TAB> text2 <TAB> label) from train_snli.txt
with open("/content/xyzdataset/train_snli.txt", "r", encoding="utf-8") as file:
    # Iterate the file directly (no intermediate readlines() list); drop blank lines
    lines = [line.strip() for line in file if line.strip()]

# Extract labels and texts (split by tabs, label is last)
labels = []
texts = []
for line in lines:
    parts = line.split("\t")  # Split by tab
    if len(parts) < 3:  # Need at least text1, text2 and label
        print(f"Skipping malformed line: '{line}'")
        continue
    try:
        label = int(parts[-1])  # Last part is the integer label
    except ValueError:
        # A non-numeric label (e.g. a header row) is treated like any other
        # malformed line instead of aborting the whole run.
        print(f"Skipping malformed line: '{line}'")
        continue
    labels.append(label)
    texts.append("\t".join(parts[:-1]))  # Keep text segments with tab separator

# Split 80% train / 20% test; stratify on the label so both splits keep the
# same class balance, and fix the seed so the split is reproducible
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# Recombine into original format (text1\ttext2\tlabel)
train_lines = [f"{text}\t{label}" for text, label in zip(train_texts, train_labels)]
test_lines = [f"{text}\t{label}" for text, label in zip(test_texts, test_labels)]

# Save to files
with open("train.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(train_lines))
with open("test.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(test_lines))

# Verify sizes and distribution
print(f"Training set: {len(train_lines)} lines (0s: {train_labels.count(0)}, 1s: {train_labels.count(1)})")
print(f"Test set: {len(test_lines)} lines (0s: {test_labels.count(0)}, 1s: {test_labels.count(1)})")

Training set: 293898 lines (0s: 147173, 1s: 146725)
Test set: 73475 lines (0s: 36793, 1s: 36682)


In [None]:
import os
import string
import re
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Download NLTK resources (run once)
# word_tokenize needs the Punkt sentence tokenizer data: newer NLTK releases
# load it from 'punkt_tab', older ones from 'punkt', so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# 2. Function to read .txt files
def read_txt(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# 3. Load the tab-separated dataset (train_snli.txt) and split it
dataset_path = '/content/xyzdataset/train_snli.txt'  # Adjust path if needed
documents = []
labels = []

# Fail fast when the extracted dataset file is missing
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"{dataset_path} not found!")

# Keep only the non-blank lines, stripped of surrounding whitespace
with open(dataset_path, 'r', encoding='utf-8') as file:
    lines = [stripped for stripped in (raw.strip() for raw in file) if stripped]

for line in lines:
    # A usable record has at least two tabs: text1 <TAB> text2 <TAB> label
    if line.count('\t') < 2:
        continue
    # Everything before the last tab is the text; what follows is the label
    text, label_field = line.rsplit('\t', 1)
    label = int(label_field)
    documents.append(text)
    labels.append(label)

# Stratified 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    documents,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# 4. Data Preprocessing Function
# These helpers are built once rather than on every preprocess_text() call,
# which runs for every document in the dataset.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_RE = re.compile(r'\d+')
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stopword set, loading it on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalise ``text`` for TF-IDF vectorisation.

    The input is converted with ``str()``, lower-cased, stripped of ASCII
    punctuation and digit runs, tokenised with NLTK's ``word_tokenize`` and
    filtered against the NLTK English stopword list.

    Args:
        text: The raw text (any object; it is passed through ``str()``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Convert to lowercase
    text = str(text).lower()

    # Remove punctuation
    text = text.translate(_PUNCT_TABLE)

    # Remove numbers
    text = _DIGITS_RE.sub('', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords (set is cached, so this is a cheap membership test)
    stop_words = _english_stop_words()
    tokens = [word for word in tokens if word not in stop_words]

    # Join tokens back to text
    return ' '.join(tokens)

# 5. Clean every document in both splits
processed_docs_train = list(map(preprocess_text, X_train))
processed_docs_test = list(map(preprocess_text, X_test))

# 7. Fit the TF-IDF vectorizer on the training split only, then project the
#    test split with that already-fitted vectorizer
vectorizer = TfidfVectorizer()
tfidf_train = vectorizer.fit_transform(processed_docs_train)
tfidf_test = vectorizer.transform(processed_docs_test)

# 8. Plagiarism Checker Class
class PlagiarismChecker:
    """Flag text whose TF-IDF cosine similarity to a reference corpus is high.

    Args:
        vectorizer: Fitted vectorizer exposing ``transform``; used to vectorise
            incoming text, so it is expected to share its feature space with
            ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix of the reference documents, one row each.
        threshold: Minimum cosine similarity (inclusive) at which a text is
            reported as plagiarised.
    """

    def __init__(self, vectorizer, tfidf_matrix, threshold=0.8):
        self.vectorizer = vectorizer
        self.tfidf_matrix = tfidf_matrix
        self.threshold = threshold

    def check_plagiarism(self, text):
        """Compare ``text`` against every reference document.

        Args:
            text: Raw text; it is run through ``preprocess_text`` here.

        Returns:
            A dict with ``is_plagiarized`` (bool), ``similarity_score`` (float,
            the highest cosine similarity found) and
            ``most_similar_doc_index`` (int row index into the reference
            matrix, or ``None`` when the reference matrix has no rows).
        """
        # With no reference rows there is nothing to compare against; report
        # "no match" instead of letting the similarity/max computation raise.
        if np.shape(self.tfidf_matrix)[0] == 0:
            return {
                'is_plagiarized': False,
                'similarity_score': 0.0,
                'most_similar_doc_index': None,
            }

        # Preprocess input text
        processed_text = preprocess_text(text)

        # Transform text to TF-IDF
        text_tfidf = self.vectorizer.transform([processed_text])

        # Calculate cosine similarity with all documents
        similarities = cosine_similarity(text_tfidf, self.tfidf_matrix)[0]

        # Locate the best match once and read its score from that position;
        # convert to plain Python types so the result is easy to serialise.
        best_index = int(np.argmax(similarities))
        best_score = float(similarities[best_index])

        # Return result
        return {
            'is_plagiarized': best_score >= self.threshold,
            'similarity_score': best_score,
            'most_similar_doc_index': best_index,
        }

# 9. Training and Evaluation
checker = PlagiarismChecker(vectorizer, tfidf_train)

# Test the model.
# check_plagiarism() preprocesses its input itself, so it is given the raw
# test documents; passing processed_docs_test would run the cleaning pipeline
# a second time on already-cleaned text.
y_pred = []
for doc in X_test:
    result = checker.check_plagiarism(doc)
    y_pred.append(1 if result['is_plagiarized'] else 0)

# Calculate accuracy
# NOTE(review): each label presumably describes the relation between the two
# tab-joined texts of a line, while the prediction compares the whole line
# against the training corpus - confirm this metric measures what is intended.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# 10. Working Demo Function
def demo_plagiarism_checker(text):
    """Run ``text`` through the global ``checker`` and print a short report.

    The report shows the input, its best similarity score, the plagiarism
    verdict, and the index and raw content of the closest ``X_train`` document.
    """
    report = checker.check_plagiarism(text)

    print("\nPlagiarism Check Results:")
    print(f"Input Text: {text}")
    print(f"Similarity Score: {report['similarity_score']:.4f}")
    print(f"Plagiarism Detected: {report['is_plagiarized']}")

    # The reported index is used to look up the unprocessed training document
    best_index = report['most_similar_doc_index']
    print(f"Most Similar Document Index: {best_index}")
    print(f"Most Similar Document: {X_train[best_index]}")

# 11. Run the demo on a few hand-written sample sentences
test_texts = [
    "This is a test document about machine learning",
    "Completely unique and original content",
    "Machine learning powers modern technology",
]

print("\n=== Demo Tests ===")
for test_text in test_texts:
    demo_plagiarism_checker(test_text)

# 12. Dataset Statistics
# Summing the labels counts the 1-labelled rows when labels are 0/1
plagiarized_total = sum(labels)

print("\n=== xyz_dataset Statistics ===")
print(f"Total documents: {len(documents)}")
print(f"Number of training documents: {len(X_train)}")
print(f"Number of test documents: {len(X_test)}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
print(f"Plagiarized documents: {plagiarized_total}")
print(f"Original documents: {len(labels) - plagiarized_total}")

# Optional: persist the pieces needed to rebuild the checker later
model_state = {
    'vectorizer': vectorizer,
    'tfidf_matrix': tfidf_train,
    'threshold': checker.threshold,
}
with open('plagiarism_checker.pkl', 'wb') as f:
    pickle.dump(model_state, f)
print("\nModel saved as 'plagiarism_checker.pkl'")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
