<a href="https://colab.research.google.com/github/Saxenaharsh12/PII_Detection_Correction_System-/blob/main/deployed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install streamlit opencv-python matplotlib scikit-learn joblib pillow numpy pyngrok

# Create a streamlit app file
# NOTE: everything between the opening and closing triple double quotes below
# is the source text that is later written to app.py. Because it lives in a
# regular (non-raw) string literal, that source must not contain an unescaped
# triple double quote, and any backslash escape in it is expanded by this
# notebook cell before app.py is written.
app_code = """
import streamlit as st
import cv2
import numpy as np
from PIL import Image, ImageChops
import matplotlib.pyplot as plt
import joblib
import os
import time
import random
from sklearn.ensemble import RandomForestClassifier

# Error Level Analysis function
def perform_ela_fixed(image, quality=90):
    '''Return an Error Level Analysis (ELA) map of an image.

    The image is re-encoded as JPEG at the requested quality and the
    per-pixel absolute difference between the input and the re-encoded
    copy is amplified into the 0-255 range.

    Args:
        image: numpy array. A 3-D array is converted with
            cv2.COLOR_BGR2RGB before being handed to PIL; any other
            array is passed to PIL unchanged.
        quality: JPEG quality used for the re-compression step.

    Returns:
        numpy array of the amplified difference, converted back with
        cv2.COLOR_RGB2BGR when it is 3-D.
    '''
    # Local import: keeps this function self-contained without touching
    # the module-level import list.
    import io

    # Convert to PIL format
    if len(image.shape) == 3:
        image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    else:
        image_pil = Image.fromarray(image)

    # Re-compress in memory. A fixed file name in the working directory is
    # shared by every concurrent Streamlit session (one session can read or
    # delete another session's file) and cannot be removed on some platforms
    # while PIL still holds it open.
    with io.BytesIO() as buffer:
        image_pil.save(buffer, format="JPEG", quality=quality)
        buffer.seek(0)
        compressed_image_pil = Image.open(buffer)

        # The difference forces the lazily opened JPEG to be decoded, so it
        # has to happen while the buffer is still open.
        ela_image = ImageChops.difference(image_pil, compressed_image_pil)

    # Convert to numpy and scale manually: normalise to the observed maximum,
    # then amplify by 10 and clip so that faint differences become visible.
    ela_array = np.array(ela_image)
    ela_max = np.max(ela_array)
    if ela_max > 0:
        ela_array = np.clip(ela_array * (255.0 / ela_max) * 10, 0, 255).astype(np.uint8)

    # Convert back to BGR if needed
    if len(ela_array.shape) == 3:
        ela_array = cv2.cvtColor(ela_array, cv2.COLOR_RGB2BGR)

    return ela_array

# Extract noise
def extract_noise_fixed(image):
    '''Return an amplified high-frequency residual of an image.

    A 3-D input is converted to grayscale with cv2.COLOR_BGR2GRAY; any
    other input is copied as-is. The residual is the absolute difference
    between that grayscale image and its 5x5 Gaussian blur, multiplied
    by 5 with cv2.multiply.
    '''
    is_color = len(image.shape) == 3
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if is_color else image.copy()

    # What the blur removes is treated as the noise component.
    smoothed = cv2.GaussianBlur(grayscale, (5, 5), 0)
    residual = cv2.absdiff(grayscale, smoothed)

    return cv2.multiply(residual, 5)

# Extract features
def extract_simple_features(image):
    '''Compute the tampering feature set for one image.

    Images wider than 1000 pixels are first resized to a width of 1000
    (height scaled proportionally). ELA, noise and Canny edge maps are
    then derived from that working image.

    Returns:
        dict with
        - 'feature_vector': list of 27 values: ELA mean/std/max, noise
          mean/std/max, edge density, then a 10-bin ELA histogram and a
          10-bin noise histogram (each normalised by its own sum),
        - 'ela_img', 'noise_img', 'edges': the intermediate maps,
        - 'features': the seven scalar statistics keyed by name.
    '''
    def _as_fractions(hist):
        # Normalise a histogram by its sum; an all-zero one is returned as-is.
        total = np.sum(hist)
        return hist / total if total > 0 else hist

    # Cap the working width at 1000 px
    height, width = image.shape[0], image.shape[1]
    if width > 1000:
        image = cv2.resize(image, (1000, int(1000 * height / width)))

    # Intermediate maps
    ela_img = perform_ela_fixed(image)
    noise_img = extract_noise_fixed(image)
    edges = cv2.Canny(image, 100, 200)

    # Fraction of pixels flagged as edges
    edge_density = np.count_nonzero(edges) / (edges.shape[0] * edges.shape[1])

    # Scalar statistics, in the order the model expects them
    stats = {
        'ela_mean': np.mean(ela_img),
        'ela_std': np.std(ela_img),
        'ela_max': np.max(ela_img),
        'noise_mean': np.mean(noise_img),
        'noise_std': np.std(noise_img),
        'noise_max': np.max(noise_img),
        'edge_density': edge_density
    }

    # Normalised 10-bin intensity histograms
    ela_hist = _as_fractions(np.histogram(ela_img.flatten(), bins=10, range=(0, 255))[0])
    noise_hist = _as_fractions(np.histogram(noise_img.flatten(), bins=10, range=(0, 255))[0])

    # Feature vector for prediction: scalars first, then both histograms
    feature_vector = list(stats.values())
    feature_vector.extend(ela_hist)
    feature_vector.extend(noise_hist)

    return {
        'feature_vector': feature_vector,
        'ela_img': ela_img,
        'noise_img': noise_img,
        'edges': edges,
        'features': stats
    }

# Check for suspicious areas
def visualize_simple_tampering(image, ela_img, noise_img):
    '''Outline regions whose ELA or noise response is unusually high.

    A pixel is flagged when its ELA (or noise) value exceeds the mean plus
    two standard deviations of the respective map. Connected flagged
    regions with a contour area above 100 (measured on the mask) are
    reported and outlined in red on a copy of the image.

    Args:
        image: image to annotate (not modified).
        ela_img: ELA map; converted to grayscale when it is 3-D.
        noise_img: single-channel noise map with the same size as ela_img.

    Returns:
        (visualization, suspicious_areas, combined_mask) where
        suspicious_areas is a list of (x, y, w, h) boxes expressed in the
        coordinate system of image, and combined_mask is the binary mask
        at the resolution of the ELA/noise maps.
    '''
    visualization = image.copy()

    # Binary masks for high ELA and noise values
    ela_gray = cv2.cvtColor(ela_img, cv2.COLOR_BGR2GRAY) if len(ela_img.shape) == 3 else ela_img
    _, ela_mask = cv2.threshold(
        ela_gray,
        np.mean(ela_img) + 2 * np.std(ela_img),
        255,
        cv2.THRESH_BINARY
    )
    _, noise_mask = cv2.threshold(
        noise_img,
        np.mean(noise_img) + 2 * np.std(noise_img),
        255,
        cv2.THRESH_BINARY
    )

    # Combine masks
    combined_mask = cv2.bitwise_or(ela_mask, noise_mask)

    # Find contours
    contours, _ = cv2.findContours(combined_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # The maps may have been computed on a downscaled copy of the image
    # (extract_simple_features caps the width at 1000 px). Boxes found on the
    # mask must be mapped back to the image's own resolution, otherwise they
    # are drawn in the wrong place on large uploads. Both factors are exactly
    # 1.0 when the sizes already match.
    scale_x = image.shape[1] / combined_mask.shape[1]
    scale_y = image.shape[0] / combined_mask.shape[0]
    # Keep the outline visible on large images as well.
    thickness = max(2, int(round(2 * max(scale_x, scale_y))))

    # Filter and draw contours
    suspicious_areas = []
    min_area = 100

    for contour in contours:
        if cv2.contourArea(contour) <= min_area:
            continue
        (x, y, w, h) = cv2.boundingRect(contour)
        x, w = int(round(x * scale_x)), int(round(w * scale_x))
        y, h = int(round(y * scale_y)), int(round(h * scale_y))
        suspicious_areas.append((x, y, w, h))
        cv2.rectangle(visualization, (x, y), (x + w, y + h), (0, 0, 255), thickness)

    return visualization, suspicious_areas, combined_mask

# Create a simple model for document tampering detection
def create_simple_model(seed=42):
    '''Train a demonstration RandomForest on simulated feature vectors.

    No real documents are involved: 100 'genuine' (label 0) and 100
    'tampered' (label 1) samples are drawn uniformly from hand-picked
    ranges for the seven scalar features, followed by 20 uniform [0, 1)
    values standing in for the two 10-bin histograms.

    Args:
        seed: seed for the simulated data. The classifier already uses a
            fixed random_state, so seeding the data as well makes the
            trained (and persisted) model reproducible across runs.

    Returns:
        A fitted RandomForestClassifier.
    '''
    rng = np.random.default_rng(seed)

    # (label, (low, high) ranges) in feature order:
    # ela_mean, ela_std, ela_max, noise_mean, noise_std, noise_max, edge_density
    class_ranges = (
        (0, ((5, 15), (10, 20), (50, 100), (5, 15), (10, 20), (50, 100), (0.05, 0.15))),
        (1, ((20, 40), (30, 50), (150, 250), (20, 40), (30, 50), (150, 250), (0.2, 0.4))),
    )
    samples_per_class = 100
    histogram_features = 20  # 10 ELA bins + 10 noise bins

    X = []
    y = []
    for label, ranges in class_ranges:
        for _ in range(samples_per_class):
            feature_vector = [rng.uniform(low, high) for low, high in ranges]
            feature_vector.extend(rng.random(histogram_features))
            X.append(feature_vector)
            y.append(label)

    # Create and train the model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)

    return model

# PII detection and masking function
def detect_and_mask_pii(image, options):
    '''Black out the sample region of every selected PII field.

    This is a demonstration stub: nothing is detected. Each known field
    name maps to one fixed (x, y, w, h) sample box, and a filled black
    rectangle is drawn over the box of every field found in options.

    Args:
        image: image to mask (not modified).
        options: container of selected field names.

    Returns:
        A copy of image with the selected sample boxes filled in black.
    '''
    # Fixed sample positions; a real implementation would locate these
    # with OCR or a dedicated PII detector.
    sample_boxes = (
        ("Permanent Account Number", (280, 225, 150, 25)),
        ("Name", (200, 310, 200, 25)),
        ("Father's Name", (200, 380, 200, 25)),
        ("Date of Birth", (200, 450, 120, 25)),
        ("Signature", (350, 500, 120, 40)),
    )

    masked = image.copy()
    for field_name, (left, top, width, height) in sample_boxes:
        if field_name not in options:
            continue
        cv2.rectangle(masked, (left, top), (left + width, top + height), (0, 0, 0), -1)

    return masked

# Set up the Streamlit app: page configuration first, then the page header.
st.set_page_config(
    page_title="PII Detection & Protection System",
    # Lock emoji; the previous value was the mis-decoded byte sequence of
    # this same character.
    page_icon="🔐",
    layout="wide"
)

st.title("PII Detection & Protection System")
st.markdown("### By Invincibles")
st.write("Upload a document to detect tampering and protect personally identifiable information (PII)")

# Create a model if it doesn't exist
@st.cache_resource
def get_model():
    '''Return the tampering classifier, training it on first use.

    A model persisted as simple_model.joblib in the working directory is
    loaded when present; otherwise a new one is trained and saved there.
    If anything in that process fails, the error is shown in the UI and a
    freshly trained, unsaved model is returned instead.
    '''
    try:
        if not os.path.exists("simple_model.joblib"):
            fresh_model = create_simple_model()
            joblib.dump(fresh_model, "simple_model.joblib")
            return fresh_model
        return joblib.load("simple_model.joblib")
    except Exception as e:
        # Best effort: fall back to an in-memory model rather than failing.
        st.error(f"Error loading model: {e}")
        return create_simple_model()

# Main page flow: upload -> analyze -> review results -> mask PII -> download.
#
# Streamlit re-runs this whole script on every widget interaction and a
# button only reports True during the run triggered by its own click. The
# analysis and masking results are therefore kept in st.session_state and
# rendered from there; nesting the second button inside the first one would
# make the results disappear as soon as any other widget is touched.
import hashlib

model = get_model()

# Create columns for layout
col1, col2 = st.columns([1, 1])

image = None
upload_digest = None

# Upload and analysis trigger in the first column
with col1:
    uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        # Decode in memory: the upload may contain PII, so it is never written
        # to a file name that other sessions share.
        upload_bytes = uploaded_file.getvalue()
        # Identifies the upload so that stored results are never shown for a
        # different document.
        upload_digest = hashlib.sha256(upload_bytes).hexdigest()
        image = cv2.imdecode(np.frombuffer(upload_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)

        if image is None:
            st.error("Could not read the uploaded file as an image.")
        else:
            st.image(uploaded_file, caption="Uploaded Document", use_container_width=True)

            # Analyze button
            if st.button("Analyze Document"):
                with st.spinner("Analyzing document..."):
                    # Extract features
                    features = extract_simple_features(image)

                    # Make prediction
                    sample = [features['feature_vector']]
                    prediction = model.predict(sample)[0]
                    probability = model.predict_proba(sample)[0][1]

                    # Get visualization
                    visualization, suspicious_areas, _ = visualize_simple_tampering(
                        image, features['ela_img'], features['noise_img'])

                st.session_state["analysis"] = {
                    "upload_digest": upload_digest,
                    "prediction": bool(prediction),
                    "probability": float(probability),
                    "visualization": visualization,
                    "suspicious_areas": suspicious_areas,
                }
                # A new analysis invalidates any earlier masking result.
                st.session_state.pop("masked", None)

# Results in the second column, rendered from session state on every run
analysis = st.session_state.get("analysis")
if image is not None and analysis is not None and analysis["upload_digest"] == upload_digest:
    probability = analysis["probability"]
    suspicious_areas = analysis["suspicious_areas"]

    with col2:
        st.subheader("Analysis Results")

        # Tampering status
        if analysis["prediction"]:
            st.error(f"⚠ *DOCUMENT TAMPERING DETECTED* ({probability*100:.1f}% confidence)")
        else:
            st.success(f"✅ No tampering detected ({(1-probability)*100:.1f}% confidence)")

        # Show visualization
        st.image(cv2.cvtColor(analysis["visualization"], cv2.COLOR_BGR2RGB),
                 caption="Tampering Detection Visualization",
                 use_container_width=True)

        # List suspicious areas
        if suspicious_areas:
            st.warning(f"Found {len(suspicious_areas)} suspicious areas in the document.")

        # Show PII protection options
        st.subheader("PII Protection")
        pii_options = st.multiselect(
            "Select PII to mask:",
            ["Permanent Account Number", "Name", "Father's Name", "Date of Birth", "Signature"],
            default=["Permanent Account Number"]
        )

        if st.button("Mask Selected PII"):
            masked_image = detect_and_mask_pii(image, pii_options)
            # Encode in memory instead of writing a shared file to disk.
            encoded_ok, encoded = cv2.imencode(".jpg", masked_image)
            if encoded_ok:
                st.session_state["masked"] = {
                    "upload_digest": upload_digest,
                    "jpeg_bytes": encoded.tobytes(),
                    "masked_count": len(pii_options),
                }
            else:
                st.session_state.pop("masked", None)
                st.error("Could not encode the protected document.")

        # Rendered from session state so the result survives the re-run that
        # the download button itself triggers.
        masked = st.session_state.get("masked")
        if masked is not None and masked["upload_digest"] == upload_digest:
            st.image(masked["jpeg_bytes"],
                     caption="Protected Document",
                     use_container_width=True)

            # Provide download link
            st.download_button(
                label="Download Protected Document",
                data=masked["jpeg_bytes"],
                file_name="protected_document.jpg",
                mime="image/jpeg"
            )

            # Show security information. Only facts the app can back up are
            # listed: nothing is stored or encrypted here, so the summary
            # reports the SHA-256 of the protected file instead.
            protected_digest = hashlib.sha256(masked["jpeg_bytes"]).hexdigest()
            summary_lines = [
                "📊 *Document Security Summary*",
                "",
                f"- PII elements masked: {masked['masked_count']}",
                f"- Tampering probability: {probability*100:.1f}%",
                f"- SHA-256 of protected file: `{protected_digest}`",
            ]
            # chr(10) is a newline. A backslash escape is avoided on purpose:
            # this source is stored inside a regular string literal, where the
            # escape would be expanded before app.py is written.
            st.info(chr(10).join(summary_lines))
"""
# Write the app source to disk. UTF-8 is explicit because the UI strings
# contain emoji.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)


# Expose the Streamlit app through an ngrok tunnel
import getpass
import os
import subprocess
import sys
import time

from IPython.display import HTML, display
from pyngrok import ngrok

STREAMLIT_PORT = 8501

# The auth token is a secret: take it from the environment or prompt for it
# instead of committing it. The token that used to be hard-coded here was
# published together with the notebook and should be revoked.
auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("ngrok auth token: ")
ngrok.set_auth_token(auth_token)

# Run Streamlit in the background. A blocking shell call would keep the cell
# busy until the server is stopped, so the tunnel below would never be opened.
# Server output goes to streamlit.log for troubleshooting.
with open("streamlit.log", "w") as log_file:
    streamlit_process = subprocess.Popen(
        [
            sys.executable, "-m", "streamlit", "run", "app.py",
            "--server.port", str(STREAMLIT_PORT),
            "--server.headless", "true",
        ],
        stdout=log_file,
        stderr=subprocess.STDOUT,
    )

# Give the server a moment to start, then make sure it is still alive.
time.sleep(5)
if streamlit_process.poll() is not None:
    raise RuntimeError("Streamlit exited during startup; see streamlit.log")

# Create tunnel. Current pyngrok returns a tunnel object whose public_url
# attribute holds the address; older releases returned the URL string itself.
tunnel = ngrok.connect(STREAMLIT_PORT)
public_url = getattr(tunnel, "public_url", tunnel)
print(f"Public URL: {public_url}")


# Show a clickable link to the running app
display(HTML(
    f'<a href="{public_url}" target="_blank">Click here to open the Streamlit app</a>'
))


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.23.159.77:8501[0m
[0m
2025-03-10 11:21:50.534 The `use_column_width` parameter has been deprecated and will be removed in a future release. Please utilize the `use_container_width` parameter instead.
2025-03-10 11:21:54.939 The `use_column_width` parameter has been deprecated and will be removed in a future release. Please utilize the `use_container_width` parameter instead.
2025-03-10 11:21:55.163 The `use_column_width` parameter has been deprecated and will be removed in a future release. Please utilize the `use_container_width` parameter instead.
