<a href="https://colab.research.google.com/github/Umeshb21/Tamper_detection-/blob/main/Tamper_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Prototype that performs automatic tamper detection**

**Random Forest-based classifier**

In [None]:
# Install required libraries (if not already installed)
!pip install pandas scikit-learn

# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import re

# Step 2: Create Sample Certificate Dataset
# Records are written row by row so each certificate reads as one unit; they
# are then pivoted into the column-oriented dict the DataFrame is built from.
sample_columns = ('certificate_id', 'name', 'course', 'issue_date', 'valid')
sample_records = [
    # certificate_id, name, course, issue_date, valid (1 = valid, 0 = tampered)
    ('GL-12345', 'John Doe', 'AI Foundations', '2023-06-01', 1),
    ('GL-23456', 'Jane Smith', 'Machine Learning', '2023-07-15', 1),
    ('GL-34567', 'Alice Brown', 'Data Science', '2023-08-01', 1),
    ('GL-45678', 'Robert Gray', 'Python Basics', '2023-09-10', 1),
    ('GL-56789', 'Chris Blue', 'Cloud Fundamentals', '2023-10-20', 1),
    ('FAKE-123', 'Fake Name', 'Data Scam', '2025-10-15', 0),
    ('GL-11111', 'Tampered User', 'AI Foundations', '2019-01-01', 0),
    ('123-GL', 'Wrong Format', 'Random Stuff', '2022-03-03', 0),
    ('GL-00000', 'Invalid Entry', 'Fake Course', '2030-01-01', 0),
    ('GL-99999', 'Forged Name', 'Hacking 101', '1999-12-31', 0),
]
data = {
    column: [record[position] for record in sample_records]
    for position, column in enumerate(sample_columns)
}

df = pd.DataFrame(data)

# Step 3: Feature Engineering
def is_valid_id(cert_id):
    r"""Return 1 if ``cert_id`` is ``GL-`` followed by exactly five digits, else 0.

    The whole string must match. ``re.fullmatch`` is used because ``$`` in
    ``re.match`` also matches just before a trailing newline, so a value such
    as ``'GL-12345\n'`` would otherwise be accepted. Only ASCII digits count;
    ``\d`` in a ``str`` pattern also matches other Unicode decimal digits.

    Args:
        cert_id: Certificate identifier to check. Non-string values
            (``None``, ``NaN``, numbers) are treated as invalid.

    Returns:
        int: 1 when the identifier is well-formed, otherwise 0.
    """
    if not isinstance(cert_id, str):
        return 0
    return 1 if re.fullmatch(r'GL-[0-9]{5}', cert_id) else 0

def is_valid_course(course):
    """Flag whether ``course`` is one of the recognised course titles.

    The comparison is an exact, case-sensitive match against the fixed
    catalogue defined below.

    Returns:
        int: 1 if the title is in the catalogue, otherwise 0.
    """
    catalogue = (
        'AI Foundations',
        'Machine Learning',
        'Data Science',
        'Python Basics',
        'Cloud Fundamentals',
    )
    if course in catalogue:
        return 1
    return 0

def is_reasonable_date(date_str, min_year=2020, max_year=2025):
    """Return 1 if ``date_str`` is a real calendar date in the accepted year range, else 0.

    The value is parsed as a full ``YYYY-MM-DD`` date rather than only reading
    the year prefix, so impossible dates (``'2023-13-45'``) and other formats
    (``'12/08/2023'``) are reported as 0 instead of passing or raising.

    Args:
        date_str: Issue date as a ``YYYY-MM-DD`` string, or a ``date`` /
            ``datetime`` object. Any other type is treated as invalid.
        min_year: Earliest accepted year (inclusive).
        max_year: Latest accepted year (inclusive).

    Returns:
        int: 1 when the date parses and its year lies in
        ``[min_year, max_year]``, otherwise 0.
    """
    # Local import keeps this helper usable without touching the import cell.
    from datetime import date, datetime

    if isinstance(date_str, date):  # datetime is a subclass of date
        year = date_str.year
    elif isinstance(date_str, str):
        try:
            year = datetime.strptime(date_str.strip(), '%Y-%m-%d').year
        except ValueError:
            # Not a parseable calendar date in the expected format.
            return 0
    else:
        return 0
    return 1 if min_year <= year <= max_year else 0

# Derive one 0/1 indicator column per rule. Each entry maps the new feature
# column to the raw column it reads and the checker applied to every value.
rule_features = {
    'valid_id_format': ('certificate_id', is_valid_id),
    'valid_course': ('course', is_valid_course),
    'valid_date': ('issue_date', is_reasonable_date),
}
for feature_name, (source_column, checker) in rule_features.items():
    df[feature_name] = df[source_column].apply(checker)

# Integer-encode the holder's name so it can be fed to the model.
# NOTE(review): every name in the sample data is unique, so this code acts as
# an arbitrary per-row identifier rather than a meaningful category; consider
# dropping it (together with its use at prediction time).
le = LabelEncoder()
df['name_encoded'] = le.fit_transform(df['name'])

# Model inputs (rule indicators first, then the encoded name) and target.
features = list(rule_features) + ['name_encoded']
X = df[features]
y = df['valid']

# Step 4: Train Model
# Stratify the split on the label: with only 10 rows, an unstratified 80/20
# split can leave the 2-row test set (or the training set) skewed to a single
# class, which makes the report below meaningless.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = model.predict(X_test)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))



Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



# Predict on New User Input

In [None]:
def predict_certificate(certificate_id, name, course, issue_date):
    """Classify a single certificate as valid or tampered using the trained model.

    The raw fields are turned into the same feature columns the model was
    fitted on (ID-format, course and date indicators plus the encoded name),
    then passed to ``model.predict``.

    Args:
        certificate_id: Certificate identifier entered by the user.
        name: Holder name; names the encoder has not seen are encoded as -1.
        course: Course title.
        issue_date: Issue date as entered by the user.

    Returns:
        str: ``"Valid Certificate ✅"`` when the model predicts 1, otherwise
        ``"Tampered/Invalid ❌"``.
    """
    def _rule_flag(check, value):
        # User input may be malformed (e.g. a date in another format). A field
        # that a rule cannot even parse counts as failing that rule instead of
        # aborting the prediction with an exception.
        try:
            return check(value)
        except (AttributeError, TypeError, ValueError):
            return 0

    name_is_known = isinstance(name, str) and name in le.classes_
    features = {
        'valid_id_format': _rule_flag(is_valid_id, certificate_id),
        'valid_course': _rule_flag(is_valid_course, course),
        'valid_date': _rule_flag(is_reasonable_date, issue_date),
        'name_encoded': le.transform([name])[0] if name_is_known else -1
    }
    # Key order above matches the column order used when fitting the model.
    input_df = pd.DataFrame([features])
    prediction = model.predict(input_df)[0]
    return "Valid Certificate ✅" if prediction == 1 else "Tampered/Invalid ❌"

# 🎯 Example Usage: each tuple is (certificate_id, name, course, issue_date)
example_certificates = [
    ("GL-22222", "Jane Smith", "Machine Learning", "2023-08-12"),
    ("FAKE-999", "Hacker Man", "Malware Engineering", "2029-01-01"),
]
for example_fields in example_certificates:
    print(predict_certificate(*example_fields))


Valid Certificate ✅
Tampered/Invalid ❌


# PDF Metadata Analysis **(using PyPDF2)**

In [None]:
!pip install PyPDF2

from PyPDF2 import PdfReader

def analyze_pdf_metadata(file_path):
    """Print a PDF's document metadata and flag a creation/modification mismatch.

    Every metadata entry is printed as ``key: value``. When both
    ``/CreationDate`` and ``/ModDate`` are present their raw values are
    compared; differing values produce a warning.

    NOTE(review): this is a heuristic only. Equal dates are not proof that a
    file is untouched, and differing dates are not proof of tampering.

    Args:
        file_path: Path of the PDF to inspect; passed straight to ``PdfReader``.

    Returns:
        None. Findings are written to standard output, and any error raised
        while opening or parsing the file is printed rather than propagated.
    """
    try:
        reader = PdfReader(file_path)
        # ``reader.metadata`` is None when the PDF carries no document
        # information; treat that as "no entries" so it is reported as missing
        # metadata rather than surfacing as a misleading read error.
        metadata = reader.metadata or {}
        print("PDF Metadata:")
        if not metadata:
            print("(no metadata entries found)")
        for key, value in metadata.items():
            print(f"{key}: {value}")

        if '/ModDate' in metadata and '/CreationDate' in metadata:
            mod_date = metadata['/ModDate']
            create_date = metadata['/CreationDate']
            if mod_date != create_date:
                print("WARNING: PDF may have been modified after creation.")
            else:
                print("PDF creation and modification dates match.")
        else:
            print("Metadata missing creation or modification date.")
    except Exception as e:
        # Deliberately broad: this is a best-effort report, so unreadable or
        # missing files are described instead of stopping the notebook.
        print("Error reading PDF:", e)


# Certificate PDF to inspect; the path presumably points into a mounted
# Google Drive, so adjust it for your own environment.
CERTIFICATE_PDF_PATH = "/content/drive/MyDrive/AWS_Academy_Graduate___AWS_Academy_Cloud_Foundations_Badge20230916-28-cm7qfo.pdf"
analyze_pdf_metadata(CERTIFICATE_PDF_PATH)


PDF Metadata:
/Author: PDF Generator API
/Keywords: Automated document generation with PDF Generator API. PDFLib:
/Creator: PDF Generator API (https://pdfgeneratorapi.com)
/Producer: TCPDF 6.6.5 (http://www.tcpdf.org)
/CreationDate: D:20230916054729+00'00'
/ModDate: D:20230916054729+00'00'
/Trapped: /False
PDF creation and modification dates match.
