In [None]:
import re
import psycopg2
import argparse
import json
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Connection parameters that the database helpers in this file unpack as
# keyword arguments into psycopg2.connect().
# NOTE(review): the credentials are hard-coded; consider reading them from the
# environment or a secrets store before using this beyond local testing.
DATABASE_CONFIG = {
    'dbname': 'sms_firewall',
    'user': 'postgres',
    'password': 'postgres',
    'host': 'localhost',
    'port': '5432'
}

# Shared spaCy pipeline, loaded once when this module/cell runs.
# score_sentences() uses it to split a message into sentences.
nlp = spacy.load("en_core_web_sm")

# Keyword -> weight table used to score sentences and to rank candidate digit
# chunks. Keys are lowercase English phrases and their Bengali counterparts.
# score_sentences() tests every key as a substring of the lowercased sentence
# and adds up the weights of all keys that occur, so overlapping keys (for
# example "otp" and "otp is:") both contribute to the same sentence.
# find_closest_chunk() divides the keyword-to-chunk distance by the weight, so
# a higher weight pulls a candidate "closer".
otp_type_sms_keywords = {
    "code": 2, "pin": 2, "otp": 3, "otp is:": 3, "otp is :": 3, "otp is -": 3, "otp is-": 3, "code is:": 3,
    "code is :": 3, "code is -": 3, "code is-": 3, "login otp": 2, "verification": 1, "authentication": 1,
    "credential": 1, "password": 2, "reference": 1, "number": 1, "vrn": 1, "login": 1, "confirmation": 1,
    "secret": 1, "security code": 2, "auth code": 2, "কোড": 2, "পিন": 2, "ওটিপি": 3, "লগইন ওটিপি": 2,
    "যাচাইকরণ": 1, "প্রমাণীকরণ": 1, "শংসাপত্র": 1, "পাসওয়ার্ড": 2, "রেফারেন্স": 1, "নম্বর": 1,
    "ভিআরএন": 1, "লগইন": 1, "নিশ্চিতকরণ": 1, "নতুন পাসওয়ার্ড": 1, "গোপন": 1
}

def retrieve_url(text):
    """Return the first URL-like substring of *text*, or None.

    Only the first regex hit in *text* is considered. It is returned when it
    ends in a dot followed by two or three word characters (e.g. ``.com``);
    any other hit, such as one ending in a slash, is rejected and None is
    returned. Later hits in *text* are never examined.
    """
    url_regex = (
        r'(?:[A-Za-z0-9\-]+\.[A-Za-z]{2,}|'
        r'(?:http|ftp)s?://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
        r'localhost|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+))'
    )

    hit = re.search(url_regex, text, re.IGNORECASE)
    if hit is None:
        return None

    candidate = hit.group()
    # Keep the candidate only when its tail looks like ".xx" or ".xxx".
    return candidate if re.search(r'\.\w{2,3}$', candidate) else None

def remove_urls(text):
    """Strip every occurrence of the URL found by retrieve_url() from *text*.

    Only the single URL reported by retrieve_url() is removed (wherever it
    appears); *text* is returned unchanged when no URL is reported.
    """
    url = retrieve_url(text)
    return text.replace(url, '') if url else text

def score_sentences(text):
    """Return the lowercased sentence of *text* with the highest keyword score.

    Each sentence produced by the spaCy pipeline is lowercased and scored as
    the sum of the weights of all otp_type_sms_keywords entries it contains as
    substrings. The earliest sentence with the highest positive score wins.
    If no sentence scores above zero, a notice is printed and None is returned.
    """
    top_score = 0
    top_sentence = None

    for span in nlp(text).sents:
        lowered = span.text.lower()
        total = sum(
            weight
            for phrase, weight in otp_type_sms_keywords.items()
            if phrase in lowered
        )
        # Strict comparison keeps the earliest sentence on ties.
        if total > top_score:
            top_score, top_sentence = total, lowered

    if top_sentence is None:
        print("It doesn't seem like an OTP SMS.")
        return None

    return top_sentence

def find_closest_chunk(digit_chunks, best_sentence):
    """Pick the chunk that lies nearest to a keyword, favouring heavy keywords.

    For every (chunk, keyword) pair the distance between their first
    occurrences in *best_sentence* is divided by the keyword's weight; the
    chunk of the pair with the smallest result is returned. Ties go to the
    pair met first (chunks in list order, keywords in table order). Returns
    None when there are no chunks or no keyword occurs in the sentence.
    """
    # First-occurrence offset of every keyword present in the sentence.
    anchors = {
        phrase.lower(): best_sentence.find(phrase.lower())
        for phrase in otp_type_sms_keywords
        if phrase.lower() in best_sentence
    }

    def weighted_pairs():
        for chunk in digit_chunks:
            chunk_at = best_sentence.find(chunk)
            for phrase, phrase_at in anchors.items():
                yield abs(chunk_at - phrase_at) / otp_type_sms_keywords[phrase], chunk

    # min() returns the first minimal element, matching a strict "<" scan.
    winner = min(weighted_pairs(), key=lambda pair: pair[0], default=None)
    return None if winner is None else winner[1]

def extract_otp(text):
    """Extract the most likely OTP token from an SMS body.

    The detected URL is removed, the best-scoring sentence is selected, and
    tokens ending in at least four digits are collected from it. A single
    candidate is returned directly; otherwise the choice is delegated to
    find_closest_chunk(), whose result is also printed. Returns None when no
    sentence qualifies or no candidate can be chosen.
    """
    sentence = score_sentences(remove_urls(text))
    if not sentence:
        return None

    candidates = re.findall(r'\b\w*[\w@-]*\d{4,}\b', sentence)

    if len(candidates) != 1:
        chosen = find_closest_chunk(candidates, sentence)
        print("Closest Chunk:", chosen)
        return chosen

    return candidates[0]

def init_db():
    """Create the ``rules`` table if it does not exist yet.

    Connects with DATABASE_CONFIG, issues ``CREATE TABLE IF NOT EXISTS`` and
    commits. The connection is closed in a ``finally`` block so it is released
    even when creating the cursor, executing the statement or committing
    raises.

    Raises:
        psycopg2.Error: Propagated unchanged from the driver.
    """
    conn = psycopg2.connect(**DATABASE_CONFIG)
    try:
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS rules (
                        id SERIAL PRIMARY KEY,
                        sms_type TEXT NOT NULL,
                        rule_type TEXT NOT NULL CHECK(rule_type IN ('block', 'pass')),
                        regex_pattern TEXT NOT NULL,
                        status BOOLEAN NOT NULL)''')
        conn.commit()
    finally:
        # Always release the connection, mirroring add_rule().
        conn.close()

def regex_length(length, comparison='greater'):
    """Build an anchored regex that matches all-digit strings by length.

    Args:
        length: Reference number of digits.
        comparison: One of
            'greater' - more than *length* digits,
            'less'    - between 1 and *length* - 1 digits,
            'equals'  - exactly *length* digits ('equal' is accepted as an
                        alias, since earlier error messages advertised it).

    Returns:
        The regex pattern as a string.

    Raises:
        ValueError: If *comparison* is not recognised, or if 'less' is
            requested with a *length* below 2 (no digit string is shorter
            than one digit, and ``{1,0}`` is not a valid quantifier).
    """
    if comparison == 'greater':
        return r'^\d{' + str(length + 1) + r',}$'

    if comparison == 'less':
        upper_bound = length - 1
        if upper_bound < 1:
            raise ValueError("'less' requires a length of at least 2.")
        return r'^\d{1,' + str(upper_bound) + r'}$'

    if comparison in ('equals', 'equal'):
        return r'^\d{' + str(length) + r'}$'

    raise ValueError("Invalid comparison. Use 'greater', 'less', or 'equals'.")

def regex_sequence(sequence, comparison='starts with'):
    """Build a regex that looks for the literal *sequence* at a given place.

    *sequence* is escaped first, so regex metacharacters in it match
    themselves. *comparison* selects the anchoring: 'starts with' (``^seq``),
    'ends with' (``seq$``) or 'contains' (``.*seq.*``). Any other value raises
    ValueError.
    """
    literal = re.escape(sequence)
    layouts = (
        ('starts with', r'^' + literal),
        ('ends with', literal + r'$'),
        ('contains', r'.*' + literal + r'.*'),
    )
    for mode, pattern in layouts:
        if comparison == mode:
            return pattern
    raise ValueError("Invalid comparison. Use 'starts with', 'ends with', 'contains'.")

def generate_regex(sms_type, category, features):
    """Turn a JSON rule description into a regex pattern string.

    *features* is a JSON document. For category 'length' it must provide
    ``length`` (optional ``comparison``, default 'greater'); for category
    'sequence' it must provide ``sequence`` (optional ``comparison``, default
    'starts with'). Only the 'OTP' sms_type is supported.

    Raises ValueError for an unsupported sms_type or category; an unsupported
    sms_type is reported before the category is looked at.
    """
    options = json.loads(features)

    if sms_type != 'OTP':
        raise ValueError("Invalid sms_type. Only 'OTP' is supported.")

    if category == 'length':
        return regex_length(options['length'], options.get('comparison', 'greater'))
    if category == 'sequence':
        return regex_sequence(options['sequence'], options.get('comparison', 'starts with'))

    raise ValueError("Invalid category. Use 'length' or 'sequence'.")

def add_rule(sms_type, rule_type, category, features, status=True):
    """Store a generated regex rule unless an identical one already exists.

    The pattern is built by generate_regex() before any database work. The
    patterns already stored for the same sms_type and rule_type are fetched;
    the new pattern is inserted only when it is not among them. Either outcome
    is reported on stdout. Database errors are printed rather than raised, and
    the connection is always closed.
    """
    regex_pattern = generate_regex(sms_type, category, features)
    conn = psycopg2.connect(**DATABASE_CONFIG)
    cursor = conn.cursor()

    try:
        cursor.execute('''
            SELECT regex_pattern FROM rules
            WHERE sms_type = %s AND rule_type = %s
        ''', (sms_type, rule_type))
        already_stored = any(record[0] == regex_pattern for record in cursor.fetchall())

        if already_stored:
            print(f"Pattern already exists: {regex_pattern}")
        else:
            cursor.execute('''
                INSERT INTO rules (sms_type, rule_type, regex_pattern, status)
                VALUES (%s, %s, %s, %s)
            ''', (sms_type, rule_type, regex_pattern, status))
            print(f"Added new rule: {rule_type} - {regex_pattern} with status {status}")

        conn.commit()
    except psycopg2.Error as e:
        print(f"Database error: {e}")
    finally:
        conn.close()

def get_rules(sms_type):
    """Fetch the active rules for *sms_type*, split by rule type.

    Args:
        sms_type: Value matched against the ``sms_type`` column.

    Returns:
        A ``(block_rules, pass_rules)`` tuple of lists of regex pattern
        strings, taken from rows whose ``status`` is TRUE.

    Raises:
        psycopg2.Error: Propagated unchanged from the driver. The connection
            is closed in a ``finally`` block, so it is released on failure too.
    """
    query = '''
        SELECT rule_type, regex_pattern
        FROM rules
        WHERE sms_type = %s AND status = TRUE
    '''
    conn = psycopg2.connect(**DATABASE_CONFIG)
    try:
        cursor = conn.cursor()
        cursor.execute(query, (sms_type,))
        rules = cursor.fetchall()
    finally:
        # Always release the connection, mirroring add_rule().
        conn.close()

    block_rules = []
    pass_rules = []

    for rule_type, regex_pattern in rules:
        if rule_type == 'block':
            block_rules.append(regex_pattern)
        elif rule_type == 'pass':
            pass_rules.append(regex_pattern)
        else:
            # The table's CHECK constraint should make this unreachable.
            print("No rule_type matched")
    return block_rules, pass_rules

def tfidf_vectorizer(sentences):
    """Fit a fresh TfidfVectorizer on *sentences*.

    Returns a ``(matrix, vectorizer)`` pair: the result of ``fit_transform``
    and the fitted vectorizer itself.
    """
    model = TfidfVectorizer()
    return model.fit_transform(sentences), model

def nlp_based_regex_generation(texts):
    """Collect the text of every CARDINAL entity found in *texts*.

    Despite its name, this function builds no regex: it runs each text through
    the spaCy pipeline and returns the entity strings labelled ``CARDINAL``,
    in order of appearance.

    The module-level ``nlp`` pipeline (the same ``en_core_web_sm`` model) is
    reused instead of reloading the model on every call.

    Args:
        texts: Iterable of message strings.

    Returns:
        A list of entity text strings; empty when nothing is labelled CARDINAL.
    """
    return [
        ent.text
        for text in texts
        for ent in nlp(text).ents
        if ent.label_ == "CARDINAL"
    ]

# Example usage: runs whenever this cell/module is executed.
# Two sample OTP-style messages are pushed through the entity-based extractor
# and the TF-IDF helper, and the results are printed.
texts = [
    "Your OTP is 123456. Use this to verify your login.",
    "Please use the code 987654 to complete your transaction."
]

# Entity strings labelled CARDINAL by spaCy (may be empty).
digit_chunks = nlp_based_regex_generation(texts)
# TF-IDF matrix over the two messages, plus the fitted vectorizer.
tfidf_matrix, vectorizer = tfidf_vectorizer(texts)
print("Digit Chunks:", digit_chunks)
# .toarray() densifies the matrix so it prints as a plain 2-D array.
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())
print("Feature Names:", vectorizer.get_feature_names_out())



Digit Chunks: []
TF-IDF Matrix:
 [[0.33264172 0.         0.         0.         0.33264172 0.33264172
  0.33264172 0.         0.         0.33264172 0.23667732 0.
  0.23667732 0.33264172 0.47335464]
 [0.         0.36469323 0.36469323 0.36469323 0.         0.
  0.         0.36469323 0.36469323 0.         0.25948224 0.36469323
  0.25948224 0.         0.25948224]]
Feature Names: ['123456' '987654' 'code' 'complete' 'is' 'login' 'otp' 'please' 'the'
 'this' 'to' 'transaction' 'use' 'verify' 'your']
