In [None]:
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import logging
import re
import traceback
from flask_cors import CORS
import pandas as pd
from google.cloud import storage
import tempfile
import os
import uuid
import json

# Flask application object; CORS(app) applies flask-cors with its default
# settings to every route.
app = Flask(__name__)
CORS(app)

# Upload policy.
ALLOWED_EXTENSIONS = {'csv'}  # accepted filename extensions (compared in lowercase)
MAX_FILE_SIZE = 2 * 1024 * 1024 * 1024  # 2 GiB, expressed in bytes

# Google Cloud Storage settings.
GCP_PROJECT_ID = 'decisive-sylph-449809-j4'
USER_BUCKET_MAP_FILE = 'user_buckets.json'  # local JSON file mapping username -> bucket name
BUCKET_LOCATION = 'asia-south1'  # Mumbai region

# Cap the size of an incoming request body at MAX_FILE_SIZE.
app.config['MAX_CONTENT_LENGTH'] = MAX_FILE_SIZE
# Prefer a secret injected by the deployment environment so the real key does
# not live in source control; the historical literal stays as the fallback so
# existing setups keep working unchanged.
app.secret_key = os.environ.get('FLASK_SECRET_KEY', 'prmis')

# Log INFO and above both to app.log and to the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

def allowed_file(filename):
    """Return True when *filename* carries an extension from ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared in lowercase.
    Names without a dot are rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[-1]
    return extension.lower() in ALLOWED_EXTENSIONS

def is_valid_csv(file_path):
    """Report whether pandas can parse the first rows of *file_path* as CSV.

    Only the first five data rows are read, so this is a cheap sanity check
    rather than a full validation. Any parsing failure is logged and reported
    as False instead of being raised.
    """
    try:
        pd.read_csv(file_path, nrows=5)
    except Exception as exc:
        logger.error(f"Invalid CSV format: {exc}")
        return False
    else:
        return True

# Matches a dot followed by an executable/script extension. The trailing \b
# requires the extension to end at that point, so ".js" does not fire on
# ".json" and ".sh" does not fire on "www.shop.com"; without it, ordinary
# data values caused whole uploads to be rejected.
pattern = re.compile(r'(\.(?:exe|bat|sh|cmd|msi|vbs|js|ps1))\b', re.IGNORECASE)


def scan_for_executables_in_chunk(chunk):
    """Return True if any cell of *chunk* references an executable file name.

    Args:
        chunk: a pandas DataFrame (one chunk of the uploaded CSV).

    Returns:
        True as soon as one non-null cell, rendered as text, matches
        ``pattern``; False when no cell matches. Column headers are not
        inspected, only cell values.
    """
    for col in chunk.columns:
        for value in chunk[col].dropna().astype(str):
            # Drop surrounding whitespace and all quote characters first so a
            # name broken up by quotes (e.g. run.e"xe) is still recognised.
            cleaned_value = value.strip().replace('"', '').replace("'", "")
            if pattern.search(cleaned_value):
                logger.warning(f"Executable reference detected in column '{col}': {value}")
                return True
    return False

def upload_to_gcs(bucket_name, file_path, destination_blob_name):
    """Upload the local file at *file_path* into a Cloud Storage bucket.

    Args:
        bucket_name: name of the target bucket.
        file_path: path of the local file to send.
        destination_blob_name: object name to store the file under.
    """
    client = storage.Client(project=GCP_PROJECT_ID)
    target_blob = client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(file_path)
    logger.info(f"File {file_path} uploaded to {bucket_name}/{destination_blob_name}.")

def create_gcs_bucket(bucket_name, location=BUCKET_LOCATION):
    """Create a Cloud Storage bucket and return its name.

    Args:
        bucket_name: name for the new bucket.
        location: region to create the bucket in; defaults to BUCKET_LOCATION.

    Returns:
        The name of the created bucket as reported by the storage client.

    Any error raised by the storage client (for example an invalid or already
    taken name) propagates to the caller.
    """
    storage_client = storage.Client(project=GCP_PROJECT_ID)
    # Pass the location to create_bucket directly: assigning to
    # Bucket.location before creation is deprecated and emits a warning on
    # every call.
    bucket = storage_client.create_bucket(bucket_name, location=location)
    logger.info(f"Bucket {bucket.name} created in {bucket.location}")
    return bucket.name

def get_user_bucket_map():
    """Load the username -> bucket-name mapping from USER_BUCKET_MAP_FILE.

    Returns an empty dict when the file does not exist yet. A file that exists
    but does not contain valid JSON still raises from ``json.load``.
    """
    try:
        map_file = open(USER_BUCKET_MAP_FILE, 'r')
    except FileNotFoundError:
        # First run: nothing has been persisted so far.
        return {}
    with map_file:
        return json.load(map_file)

def save_user_bucket_map(user_buckets):
    """Persist *user_buckets* to USER_BUCKET_MAP_FILE as indented JSON.

    The file is overwritten in full on every call.
    """
    with open(USER_BUCKET_MAP_FILE, mode='w') as map_file:
        json.dump(user_buckets, fp=map_file, indent=4)

# A new user's name is embedded in the bucket name "user-<username>-<uuid4>".
# Cloud Storage bucket names are limited to 63 characters drawn from lowercase
# letters, digits, '-', '_' and '.'; the fixed parts take 42 characters, which
# leaves 21 for the username. Dots are excluded because they carry extra rules.
_NEW_USERNAME_RE = re.compile(r'[a-z0-9_-]{1,21}')


def _csv_rejection_reason(file_path):
    """Return why the CSV at *file_path* must be rejected, or None if it is acceptable."""
    if not is_valid_csv(file_path):
        return "Invalid CSV format!"
    # Read everything as text, in large chunks, so big files are scanned
    # without loading them into memory at once.
    for chunk in pd.read_csv(file_path, chunksize=500000, dtype=str):
        if scan_for_executables_in_chunk(chunk):
            return "Executable references detected!"
    return None


@app.route("/upload", methods=["POST"])
def upload_file():
    """Validate uploaded CSV files and store them in the caller's GCS bucket.

    Expects a multipart form with one or more ``files`` parts and a
    ``username`` field. The first upload for a username creates a dedicated
    bucket and records it in the user/bucket map; later uploads reuse it.

    Returns:
        200 with the list of stored files when every file was accepted;
        400 when the request is malformed, the username cannot be used in a
        bucket name, a file has a disallowed extension, is not parseable as
        CSV, or references executables;
        500 when bucket creation or file processing fails unexpectedly.

    Processing stops at the first rejected file; files handled before it have
    already been uploaded.
    """
    if "files" not in request.files:
        logger.error("No files part in the request")
        return jsonify({"error": "No files part in the request"}), 400

    files = request.files.getlist("files")

    username = request.form.get('username')
    if not username:
        logger.error("Username is required")
        return jsonify({"error": "Username is required"}), 400

    user_buckets = get_user_bucket_map()

    if username not in user_buckets:
        # The username comes straight from the client and is about to become
        # part of a bucket name, so refuse anything that cannot form a valid
        # one instead of letting bucket creation fail. Users that already
        # have a bucket are not affected by this check.
        if not _NEW_USERNAME_RE.fullmatch(username):
            logger.error(f"Username not usable in a bucket name: {username!r}")
            return jsonify({
                "error": "Invalid username: use 1-21 lowercase letters, digits, '-' or '_'"
            }), 400

        # Create a new bucket if the user doesn't have one yet
        bucket_name = f"user-{username}-{uuid.uuid4()}"
        try:
            create_gcs_bucket(bucket_name)
        except Exception as e:
            logger.exception(f"Error creating bucket {bucket_name}: {e}")
            return jsonify({"error": "Internal server error"}), 500
        user_buckets[username] = bucket_name
        save_user_bucket_map(user_buckets)
    else:
        bucket_name = user_buckets[username]

    uploaded_files = []
    for file in files:
        if not (file and allowed_file(file.filename)):
            return jsonify({"error": f"Invalid file type uploaded: {file.filename}"}), 400

        file_path = None
        try:
            # Only reserve a unique path here; the handle is closed before
            # the path is written to, read back and finally deleted.
            with tempfile.NamedTemporaryFile(delete=False) as temp_file:
                file_path = temp_file.name
            file.save(file_path)
            logger.info(f"File saved to temporary location: {file_path}")

            reason = _csv_rejection_reason(file_path)
            if reason is not None:
                return jsonify({"error": f"File '{file.filename}' rejected: {reason}"}), 400

            # Use the original filename for the destination
            destination_filename = secure_filename(file.filename)
            upload_to_gcs(bucket_name, file_path, destination_filename)

            uploaded_files.append({
                "filename": destination_filename,
                "bucket_name": bucket_name
            })
        except Exception as e:
            logger.exception(f"Error processing file {file.filename}: {e}")
            return jsonify({"error": "Internal server error"}), 500
        finally:
            # Runs on success, rejection and failure alike, so the temporary
            # copy never outlives the request.
            if file_path is not None:
                try:
                    os.remove(file_path)
                except FileNotFoundError:
                    # Already gone; nothing left to clean up.
                    pass

    return jsonify({
        "message": "Files uploaded successfully to GCP",
        "uploaded_files": uploaded_files
    }), 200

@app.route("/files", methods=["GET"])
def list_files():
    """List the CSV objects stored in every bucket of the project.

    Returns:
        200 with ``files`` (one ``{"bucket", "file"}`` entry per CSV object)
        and ``user_buckets`` (the stored username -> bucket mapping);
        500 when listing fails.

    NOTE(review): this exposes every user's bucket mapping and file names to
    any caller; confirm that is intended before exposing the endpoint.
    """
    try:
        storage_client = storage.Client(project=GCP_PROJECT_ID)
        all_files = []
        for bucket in storage_client.list_buckets():
            # Compare the extension case-insensitively, matching what
            # allowed_file accepts at upload time; otherwise a file stored
            # as "DATA.CSV" would be uploaded but never listed.
            all_files.extend(
                {"bucket": bucket.name, "file": blob.name}
                for blob in bucket.list_blobs()
                if blob.name.lower().endswith(".csv")
            )
        logger.info("Listing uploaded files from GCS")
        return jsonify({"files": all_files, "user_buckets": get_user_bucket_map()}), 200
    except Exception as e:
        # logger.exception keeps the traceback, which the plain error message
        # alone does not provide.
        logger.exception(f"Error listing files from GCS: {e}")
        return jsonify({"error": "Internal server error"}), 500

@app.route('/webhook', methods=['POST'])
def webhook():
    """Acknowledge an incoming webhook call with a fixed 200 response.

    The route is registered for POST only, so no other method reaches this
    function; the former method check and its 400 branch were unreachable.
    The request payload is not inspected.
    """
    return 'Webhook received!', 200

if __name__ == "__main__":
    # Entry point when the module is executed directly: announce startup, then
    # serve on port 5000 of every network interface with debug mode disabled.
    logger.info("Starting Flask application")
    app.run(debug=False, port=5000, host="0.0.0.0")

2025-03-10 10:35:15,671 - INFO - Starting Flask application


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://10.160.0.3:5000
2025-03-10 10:35:15,680 - INFO - Press CTRL+C to quit
2025-03-10 10:35:25,005 - INFO - 47.247.157.210 - - [10/Mar/2025 10:35:25] "GET /upload HTTP/1.1" 405 -
2025-03-10 10:35:25,775 - INFO - 47.247.157.210 - - [10/Mar/2025 10:35:25] "GET /upload HTTP/1.1" 405 -
  bucket.location = location
2025-03-10 10:35:39,886 - INFO - Bucket user-rohan-f1525898-95e7-4ebc-bb14-a10630830bb9 created in ASIA-SOUTH1
2025-03-10 10:35:39,889 - INFO - File saved to temporary location: /var/tmp/tmpyi6jbf_3
2025-03-10 10:35:40,188 - INFO - File /var/tmp/tmpyi6jbf_3 uploaded to user-rohan-f1525898-95e7-4ebc-bb14-a10630830bb9/House_Price_India.csv.
2025-03-10 10:35:40,191 - INFO - 47.247.157.210 - - [10/Mar/2025 10:35:40] "POST /upload HTTP/1.1" 200 -
2025-03-10 11:48:22,784 - INFO - 64.62.197.25 - - [10/Mar/2025 11:48:22] "GET / HTTP/1.1" 404 -
2025-03-10 1