In [1]:
import re
from collections import Counter
import spacy
from math import log
from typing import Dict

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f20c5a62190>>
Traceback (most recent call last):
  File "/home/tammy.kojima/miniconda3/envs/meu_ambiente/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [11]:
def extract_lexical_features(doc, words) -> Dict:
    """Compute vocabulary statistics and 1- to 3-gram counts for a token list.

    Args:
        doc: Accepted for signature symmetry with the other extractors; it is
            not read by this function.
        words: List of word strings (must support ``len`` and slicing).

    Returns:
        Dict with type/token ratio, word count, unique-word count, mean and
        population variance of word length, stop-word ratio, and one
        ``lexical_{n}gram_{tuple}`` entry per distinct n-gram (n = 1, 2, 3)
        whose value is that n-gram's frequency. The n-gram part of the key is
        the string form of the tuple of words.

    Note:
        Stop words come from the module-level ``nlp`` object
        (``nlp.Defaults.stop_words``), which therefore has to exist before
        this function is called with a non-empty ``words``.
    """
    total = len(words)
    denom = max(1, total)  # guards every ratio against empty input
    vocabulary = set(words)
    lengths = [len(w) for w in words]
    total_chars = sum(lengths)
    mean_length = total_chars / denom
    squared_deviation = sum((length - mean_length) ** 2 for length in lengths)
    # The stop-word set is looked up inside the generator so that `nlp` is only
    # touched when there is at least one word to check.
    stopword_hits = sum(1 for w in words if w.lower() in nlp.Defaults.stop_words)

    features = {
        'lexical_type_token_ratio': len(vocabulary) / denom,
        'lexical_word_count': total,
        'lexical_unique_words': len(vocabulary),
        'lexical_avg_word_length': mean_length,
        'lexical_word_length_variance': squared_deviation / denom,
        'lexical_stopword_ratio': stopword_hits / denom,
    }

    for n in (1, 2, 3):
        # Zipping n progressively shifted copies of the list yields every
        # window of n consecutive words.
        shifted = [words[offset:] for offset in range(n)]
        for gram, freq in Counter(zip(*shifted)).items():
            features[f'lexical_{n}gram_{gram}'] = freq

    return features

In [12]:
def extract_syntactic_features(doc) -> Dict:
    """Summarise part-of-speech and punctuation patterns of a parsed document.

    Args:
        doc: Parsed document. It must be iterable over tokens exposing
            ``pos_``, ``dep_``, ``text``, ``is_punct`` and ``is_space``,
            support ``len``, and provide a ``sents`` iterable of sentences.

    Returns:
        Dict with the entropy (natural log) of the POS-tag distribution and of
        the POS-bigram distribution, the mean number of non-punctuation,
        non-space tokens per sentence, the number of ``mark`` dependencies per
        sentence, the number of comma tokens per sentence, and the share of
        tokens flagged as punctuation.
    """
    def entropy(counter, total):
        # An empty distribution has no entropy; also avoids dividing by zero.
        if not total:
            return 0
        return -sum((count / total) * log(count / total) for count in counter.values())

    tags = [token.pos_ for token in doc]
    tag_pairs = list(zip(tags, tags[1:]))  # zip stops at the shorter sequence
    sents = list(doc.sents)
    content_lengths = [
        sum(1 for token in sent if not (token.is_punct or token.is_space))
        for sent in sents
    ]
    per_sentence = max(1, len(sents))

    mark_total = 0
    comma_total = 0
    punct_total = 0
    for token in doc:
        if token.dep_ == 'mark':
            mark_total += 1
        if token.text == ',':
            comma_total += 1
        if token.is_punct:
            punct_total += 1

    return {
        'syntactic_pos_tag_entropy': entropy(Counter(tags), len(tags)),
        'syntactic_pos_bigram_entropy': entropy(Counter(tag_pairs), len(tag_pairs)),
        'syntactic_avg_sentence_length': sum(content_lengths) / per_sentence,
        'syntactic_subordinating_conj': mark_total / per_sentence,
        'syntactic_comma_ratio': comma_total / per_sentence,
        'syntactic_punct_ratio': punct_total / max(1, len(doc)),
    }

In [13]:
# Western-style text emoticons: eyes, an optional nose, then a mouth, e.g. ":)" or ";-D".
EMOTICON_PATTERN = r'[:;=8][\-^]?[)DpP(]'


def extract_stylistic_features(text, word_count) -> Dict:
    """Measure informal-writing cues in the raw text.

    Args:
        text: The raw, untokenised string.
        word_count: Number of words used as the denominator of every density;
            values below 1 are treated as 1.

    Returns:
        Dict with three 0/1 flags (a word containing at least two uppercase
        letters, any character immediately repeated, a word immediately
        repeated ignoring case), exclamation and question mark densities, the
        count of non-overlapping three-dot sequences, emoji and emoticon
        densities, and the share of whitespace-separated chunks that are not
        entirely upper case, lower case or title case. Chunks with no cased
        letters at all (digits, punctuation, emoji) fall into that last
        group too.
    """
    denom = max(1, word_count)

    def flag(pattern, haystack):
        # 1 when the pattern occurs anywhere in the haystack, else 0.
        return int(bool(re.search(pattern, haystack)))

    emoji_hits = re.findall(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF]', text)
    emoticon_hits = re.findall(EMOTICON_PATTERN, text)

    mixed_case_chunks = 0
    for chunk in text.split():
        if not (chunk.isupper() or chunk.islower() or chunk.istitle()):
            mixed_case_chunks += 1

    return {
        'stylistic_random_uppercase': flag(r'\b[a-zA-Z]*[A-Z][a-zA-Z]*[A-Z][a-zA-Z]*\b', text),
        # Fires as soon as any character appears twice or more in a row.
        'stylistic_repeated_chars': flag(r'(.)\1{1,}', text),
        'stylistic_repeated_words': flag(r'\b(\w+)\s+\1\b', text.lower()),
        'stylistic_exclamation_density': text.count('!') / denom,
        'stylistic_question_density': text.count('?') / denom,
        'stylistic_ellipsis_count': text.count('...'),
        'stylistic_emoji_density': len(emoji_hits) / denom,
        'stylistic_emoticon_density': len(emoticon_hits) / denom,
        'stylistic_capitalization_inconsistency': mixed_case_chunks / denom,
    }

In [14]:
def extract_structural_features(text, words, word_count) -> Dict:
    """Detect URLs, mentions, hashtags, retweets and related layout cues.

    Args:
        text: The raw, untokenised string.
        words: Iterable of word strings, used only for the temporal-marker
            count (compared case-insensitively).
        word_count: Denominator of every density; values below 1 are treated
            as 1.

    Returns:
        Dict with 0/1 presence flags for URL, mention and hashtag, a 0/1
        retweet flag, URL/mention/hashtag densities, the density of runs of
        two or more whitespace characters, and the density of temporal words
        ("today", "yesterday", "tomorrow", "now", "later").

    Notes:
        * The retweet flag requires the text, after leading whitespace, to
          start with the standalone token ``RT``. A bare prefix test would
          also flag words that merely begin with those letters
          (e.g. "RTX ...", "RTFM ...").
        * The mention pattern matches any ``@`` followed by word characters,
          so the domain part of an e-mail address counts as a mention.
    """
    denom = max(1, word_count)

    # Each pattern is scanned once; the presence flags are derived from the
    # counts. `http\S+` already covers https URLs, so no separate alternative
    # is needed for them.
    url_total = len(re.findall(r'http\S+|www\S+', text))
    mention_total = len(re.findall(r'@\w+', text))
    hashtag_total = len(re.findall(r'#\w+', text))

    temporal_words = {'today', 'yesterday', 'tomorrow', 'now', 'later'}
    temporal_total = sum(1 for word in words if word.lower() in temporal_words)

    # `\b` makes sure "RT" is a whole token rather than the start of a longer word.
    is_retweet = re.match(r'RT\b', text.lstrip()) is not None

    return {
        'structural_has_url': int(url_total > 0),
        'structural_has_mention': int(mention_total > 0),
        'structural_has_hashtag': int(hashtag_total > 0),
        'structural_is_retweet': int(is_retweet),
        'structural_url_density': url_total / denom,
        'structural_mention_density': mention_total / denom,
        'structural_hashtag_density': hashtag_total / denom,
        'structural_extra_spaces': len(re.findall(r'\s{2,}', text)) / denom,
        'structural_temporal_markers': temporal_total / denom,
    }

In [15]:
# Load the spaCy pipeline once at module level. `nlp` is called by
# extract_all_features to parse text, and extract_lexical_features reads its
# stop-word list (nlp.Defaults.stop_words), so this statement must have run
# before either function is called.
nlp = spacy.load("en_core_web_sm")
def extract_all_features(text) -> Dict:
    """Run every feature extractor on ``text`` and merge the results.

    The text is parsed with the module-level ``nlp`` pipeline. Words are the
    lower-cased texts of all tokens that are neither punctuation nor
    whitespace, and their number is the word count handed to the stylistic
    and structural extractors.

    Args:
        text: The raw string to analyse.

    Returns:
        A single dict combining the lexical, syntactic, stylistic and
        structural features, merged in that order (a later extractor would
        overwrite an earlier one on a key clash).
    """
    doc = nlp(text)

    words = []
    for token in doc:
        if token.is_punct or token.is_space:
            continue
        words.append(token.text.lower())
    word_count = len(words)

    return {
        **extract_lexical_features(doc, words),
        **extract_syntactic_features(doc),
        **extract_stylistic_features(text, word_count),
        **extract_structural_features(text, words, word_count),
    }

In [17]:
# Smoke-test the full pipeline on a short informal sample (caps, repeated
# punctuation, emoji, a hashtag and a mention) and print the resulting
# feature dictionary.
features = extract_all_features("OMG!!! I can't believe this is happening... 😱😱 #shocked @friend")
print(features)

{'lexical_type_token_ratio': 0.9166666666666666, 'lexical_word_count': 12, 'lexical_unique_words': 11, 'lexical_avg_word_length': 3.9166666666666665, 'lexical_word_length_variance': 7.409722222222222, 'lexical_stopword_ratio': 0.4166666666666667, "lexical_1gram_('omg',)": 1, "lexical_1gram_('i',)": 1, "lexical_1gram_('ca',)": 1, 'lexical_1gram_("n\'t",)': 1, "lexical_1gram_('believe',)": 1, "lexical_1gram_('this',)": 1, "lexical_1gram_('is',)": 1, "lexical_1gram_('happening',)": 1, "lexical_1gram_('😱',)": 2, "lexical_1gram_('shocked',)": 1, "lexical_1gram_('@friend',)": 1, "lexical_2gram_('omg', 'i')": 1, "lexical_2gram_('i', 'ca')": 1, 'lexical_2gram_(\'ca\', "n\'t")': 1, 'lexical_2gram_("n\'t", \'believe\')': 1, "lexical_2gram_('believe', 'this')": 1, "lexical_2gram_('this', 'is')": 1, "lexical_2gram_('is', 'happening')": 1, "lexical_2gram_('happening', '😱')": 1, "lexical_2gram_('😱', '😱')": 1, "lexical_2gram_('😱', 'shocked')": 1, "lexical_2gram_('shocked', '@friend')": 1, "lexical_3g