In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import re
from collections import Counter
from math import log
import spacy
from typing import List, Dict

In [None]:
# Global configuration: regular expressions shared across the notebook.
# Emoticon: eyes (one of : ; = 8), an optional nose (- or ^), then a mouth
# (one of ) D p P ( ).
EMOTICON_PATTERN = r'[:;=8][\-^]?[)DpP(]'
# URL-like token: 'http', 'www' or 'https' followed by one or more
# non-whitespace characters. The 'https' alternative is already covered by
# the 'http' one.
URL_PATTERN = r'http\S+|www\S+|https\S+'
# Mention: '@' followed by one or more word characters.
MENTION_PATTERN = r'@\w+'
# Hashtag: '#' followed by one or more word characters.
HASHTAG_PATTERN = r'#\w+'

# Initialise spaCy by loading the "en_core_web_sm" pipeline once at module level.
nlp = spacy.load("en_core_web_sm")

In [None]:
# User input: prompt on stdin for the text to classify (the prompt itself is
# shown to the user in Portuguese).
text_input = input("Digite um texto para classificação: ")

In [None]:
# Specific n-grams to count as features. Each entry is a tuple of tokens
# (1-tuples for unigrams, 2-tuples for bigrams) that extract_features compares
# against consecutive lower-cased tokens of the input text.
SPECIFIC_NGRAMS = [
    ('https',), ('modi',), ('co',), ('https', 'co'), ('the',), ('of',)
]

def extract_features(text: str) -> Dict:
    """Compute the selected features and specific n-gram counts for a text.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Lexical,
    syntactic, stylistic and structural measurements are collected first,
    followed by one occurrence count per entry of ``SPECIFIC_NGRAMS``.

    Args:
        text: Raw input string to analyse.

    Returns:
        A dict mapping feature names to numeric values. Keys are inserted in
        a fixed order: ``lexical_*``, ``syntactic_*``, ``stylistic_*``,
        ``structural_*`` and finally ``ngram_<tokens joined by '_'>``.
    """
    doc = nlp(text)

    # Lower-cased tokens that are neither punctuation nor whitespace.
    words = [tok.text.lower() for tok in doc if not (tok.is_punct or tok.is_space)]
    word_lengths = [len(w) for w in words]
    word_count = len(words)
    # Shared denominator for per-word ratios; clamped so it is never zero.
    words_or_one = max(1, word_count)

    # POS tags are taken over every token, punctuation and whitespace included.
    pos_tags = [tok.pos_ for tok in doc]
    pos_bigrams = list(zip(pos_tags[:-1], pos_tags[1:]))

    sentences = list(doc.sents)
    # Per-sentence count of non-punctuation, non-whitespace tokens.
    sentence_lengths = [
        sum(1 for tok in sent if not (tok.is_punct or tok.is_space))
        for sent in sentences
    ]

    def shannon_entropy(items):
        # Natural-log entropy of the empirical distribution of `items`;
        # an empty sequence yields 0.
        total = len(items)
        if not total:
            return 0
        probabilities = (freq / total for freq in Counter(items).values())
        return -sum(p * log(p) for p in probabilities)

    if word_count > 1:
        length_variance = np.var(word_lengths)
    else:
        length_variance = 0

    if sentence_lengths:
        mean_sentence_length = np.mean(sentence_lengths)
    else:
        mean_sentence_length = 0

    comma_total = sum(1 for tok in doc if tok.text == ',')
    punct_total = sum(1 for tok in doc if tok.is_punct)

    # Whitespace-separated chunks that are not all-upper, all-lower or title case.
    odd_case_total = sum(
        1
        for chunk in text.split()
        if not (chunk.isupper() or chunk.islower() or chunk.istitle())
    )

    url_hits = re.findall(URL_PATTERN, text)
    hashtag_hits = re.findall(HASHTAG_PATTERN, text)
    space_runs = re.findall(r'\s{2,}', text)

    features = {}

    # Lexical
    features['lexical_avg_word_length'] = sum(word_lengths) / words_or_one
    features['lexical_word_length_variance'] = length_variance

    # Syntactic
    features['syntactic_pos_tag_entropy'] = shannon_entropy(pos_tags)
    features['syntactic_pos_bigram_entropy'] = shannon_entropy(pos_bigrams)
    features['syntactic_avg_sentence_length'] = mean_sentence_length
    features['syntactic_comma_ratio'] = comma_total / max(1, len(sentences))
    features['syntactic_punct_ratio'] = punct_total / max(1, len(doc))

    # Stylistic
    features['stylistic_random_uppercase'] = int(
        bool(re.search(r'\b[a-zA-Z]*[A-Z][a-zA-Z]*[A-Z][a-zA-Z]*\b', text))
    )
    features['stylistic_capitalization_inconsistency'] = odd_case_total / words_or_one

    # Structural
    features['structural_has_url'] = int(bool(url_hits))
    features['structural_has_hashtag'] = int(bool(hashtag_hits))
    features['structural_hashtag_density'] = len(hashtag_hits) / words_or_one
    features['structural_extra_spaces'] = len(space_runs) / words_or_one
    features['structural_url_density'] = len(url_hits) / words_or_one

    # Specific n-grams: slide a window of the n-gram's size over the words
    # and count exact matches.
    for ngram in SPECIFIC_NGRAMS:
        size = len(ngram)
        windows = zip(*(words[offset:] for offset in range(size)))
        occurrences = sum(1 for window in windows if window == ngram)
        features['ngram_' + '_'.join(ngram)] = occurrences

    return features