In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
import string
from unidecode import unidecode
import emoji

# import torchtext.vocab
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
# Training CSV; a bare file name, so it is resolved against the current working directory.
file_train = 'train_spam.csv'

# Load the full training set into a DataFrame.
data_train_all = pd.read_csv(file_train)

In [3]:
# Message texts: the 'text' column of the training frame.
texts_train_all = data_train_all['text']
# Class labels as stored in the 'text_type' column.
labels_all = data_train_all['text_type']
# Integer-encode the labels in the listed category order: 'ham' -> 0, 'spam' -> 1.
# pandas assigns code -1 to any label that is missing or not in `categories`.
target_all = pd.Categorical(labels_all, categories=['ham', 'spam']).codes

In [4]:
# Shared lemmatizer instance used by word_lemmatizer().
# NOTE(review): presumably requires the NLTK 'wordnet' corpus to be downloaded - confirm.
lemmatizer = WordNetLemmatizer()

# Shared tokenizer; its .tokenize method is used as a step in the preprocessing pipelines.
tokenizer = TweetTokenizer()

# English stop words as a set, so membership tests in remove_stop_words() are cheap.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded - confirm.
ENGLISH_STOP_WORDS = set(stopwords.words('english'))

In [8]:
def remove_hyperlink(words):
    """Delete every run of non-whitespace characters that starts with 'http'.

    Args:
        words: Input text.

    Returns:
        The text with each such run removed (surrounding whitespace is kept).
    """
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", words)

def to_lower(words):
    """Return the lower-cased version of the text ``words``."""
    return words.lower()


def remove_underscore(words):
    """Replace each underscore in ``words`` with a single space.

    Args:
        words: Input text.

    Returns:
        The text with every underscore turned into a space.
    """
    underscore = re.compile(r'_')
    return underscore.sub(' ', words)

def remove_punctuation(words):
    """Replace punctuation characters in ``words`` with a space.

    The character class leaves the underscore, the dollar sign and the
    backslash untouched. Inside the class, the sequence comma-hyphen-dot is
    parsed as a range; that range spans exactly comma, hyphen and dot, so all
    three characters are replaced.

    Args:
        words: Input text.

    Returns:
        The text with each matched punctuation character replaced by ' '.
    """
    punctuation_class = r'[!"#%&\'()*+,-./:;<=>?@\[\]^`{|}~]'
    return re.sub(punctuation_class, ' ', words)


def emoji_to_text(words):
    """Return ``emoji.demojize(words)``: emoji rewritten as text codes by the ``emoji`` package."""
    demojized = emoji.demojize(words)
    return demojized

def replace_emoji(words,  replace=' <EMOJI> '):
    """Substitute emoji in ``words`` with a placeholder via ``emoji.replace_emoji``.

    Args:
        words: Input text.
        replace: Replacement passed through to ``emoji.replace_emoji``.

    Returns:
        The text returned by ``emoji.replace_emoji``.
    """
    placeholder = replace
    return emoji.replace_emoji(words, replace=placeholder)

def remove_emoji(words):
    """Strip emoji from ``words`` by delegating to replace_emoji with an empty replacement."""
    return replace_emoji(words, replace='')


def replace_non_ascii(words, replace=' <NONASCII> '):
    """Replace word-boundary-delimited runs of non-ASCII characters.

    A run is only matched when a regex word boundary sits on both of its
    sides, so e.g. a lone non-ASCII symbol surrounded by spaces is left as is.

    Args:
        words: Input text.
        replace: Replacement text for each matched run.

    Returns:
        The text with each matched run substituted by ``replace``.
    """
    return re.compile(r'\b[^\x00-\x7F]+\b').sub(replace, words)


def replace_number(words, replace=' <NUMBER> '):
    """Replace every word that contains at least one digit.

    Args:
        words: Input text.
        replace: Replacement text for each digit-bearing word.

    Returns:
        The text with each whole word containing a digit substituted by ``replace``.
    """
    digit_word = re.compile(r'\b\w*\d\w*\b')
    return digit_word.sub(replace, words)

def remove_number(words):
    """Drop digit-bearing words by delegating to replace_number with an empty replacement."""
    return replace_number(words, replace='')


def replace_currency(words, replace=' <CURRENCY> '):
    """Replace each pound, dollar, euro or rupee sign in ``words``.

    Args:
        words: Input text.
        replace: Replacement text for each currency sign.

    Returns:
        The text with every listed currency sign substituted by ``replace``.
    """
    return re.compile(r'[£$€₹]').sub(replace, words)

def remove_currency(words):
    """Strip currency signs by delegating to replace_currency with an empty replacement."""
    return replace_currency(words, replace='')


def replace_tag(words, replace=' <TAG> '):
    """Replace '@'-tags (both '@name' mentions and 'name@word' forms).

    The previous pattern began with a word-boundary assertion. A word
    boundary can never occur directly before '@' when the preceding character
    is a space, other punctuation, or the start of the string, so stand-alone
    '@name' mentions were silently left in place. The leading assertion is
    dropped; the greedy word-character prefix still captures any word glued to
    the left of the '@'.

    Args:
        words: Input text.
        replace: Replacement text for each matched tag.

    Returns:
        The text with each optional-word + '@' + word sequence substituted
        by ``replace``.
    """
    return re.sub(r'\w*@\w+\b', replace, words)

def replace_exclamation(words, replace=' <EXCLAMATION> '):
    """Replace every exclamation mark in ``words`` with ``replace``.

    Args:
        words: Input text.
        replace: Replacement text for each exclamation mark.

    Returns:
        The text after substitution.
    """
    exclamation = re.compile(r'!')
    return exclamation.sub(replace, words)

def replace_question(words, replace=' <QUESTION> '):
    """Replace every question mark in ``words`` with ``replace``.

    The question mark is a regex quantifier, so it must be escaped to match
    literally; an unescaped lone '?' makes ``re.sub`` raise ``re.error``
    ("nothing to repeat") on every call.

    Args:
        words: Input text.
        replace: Replacement text for each question mark.

    Returns:
        The text after substitution.
    """
    return re.sub(r'\?', replace, words)

def replace_slash(words, replace=' <SLASH> '):
    """Replace every forward slash in ``words`` with ``replace``.

    Args:
        words: Input text.
        replace: Replacement text for each slash.

    Returns:
        The text after substitution.
    """
    slash = re.compile(r'/')
    return slash.sub(replace, words)

def replace_colon(words, replace=' <COLON> '):
    """Replace every colon in ``words`` with ``replace``.

    Args:
        words: Input text.
        replace: Replacement text for each colon.

    Returns:
        The text after substitution.
    """
    colon = re.compile(r':')
    return colon.sub(replace, words)


def remove_whitespace(words):
    """Trim leading and trailing whitespace from ``words`` (inner whitespace is kept)."""
    trimmed = words.strip()
    return trimmed

def replace_newline(words, replace=' '):
    """Replace every newline character in ``words`` with ``replace``.

    Newlines used to be deleted outright, which glued the last word of one
    line to the first word of the next and corrupted the tokens seen by later
    pipeline steps. The default now substitutes a single space; pass
    ``replace=''`` to get the old deleting behaviour.

    Args:
        words: Input text.
        replace: Text substituted for each newline character (default: one space).

    Returns:
        The text with every newline character substituted.
    """
    return words.replace('\n', replace)

def remove_stop_words(words):
    """Filter out tokens that appear in ENGLISH_STOP_WORDS.

    Args:
        words: Iterable of tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    kept = []
    for token in words:
        if token in ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

def word_lemmatizer(words):
    """Lemmatize each token with the shared module-level ``lemmatizer``.

    Args:
        words: Iterable of tokens.

    Returns:
        A list holding ``lemmatizer.lemmatize(token)`` for each token, in order.
    """
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [6]:
def preprocess_pipeline(sentence, preprocess_utils=None):
    """Run ``sentence`` through a sequence of preprocessing callables.

    Each callable receives the output of the previous one; the first receives
    ``sentence`` itself.

    Args:
        sentence: Initial value fed to the first step.
        preprocess_utils: Ordered list of single-argument callables. When
            ``None``, the default cleaning / tokenizing / lemmatizing chain
            listed below is used.

    Returns:
        The value returned by the last step (``sentence`` unchanged if the
        list of steps is empty).
    """
    steps = preprocess_utils
    if steps is None:
        # Default chain: text-level clean-up first, then tokenization,
        # then token-level filtering and lemmatization.
        steps = [
            remove_hyperlink,
            replace_newline,
            to_lower,
            remove_underscore,
            replace_currency,
            replace_number,
            replace_emoji,
            replace_non_ascii,
            remove_punctuation,
            remove_whitespace,
            tokenizer.tokenize,
            remove_stop_words,
            word_lemmatizer,
        ]

    result = sentence
    for step in steps:
        result = step(result)
    return result

Features:
- Number of characters in original sentences
- Number of words in each sentence after preprocessing (with keyword replacement) and word-level tokenization
- Number of various keywords (EMOJI, NUMBER, NONASCII, CURRENCY, TAG, QUESTION, EXCLAMATION, COLON, SLASH, etc.)
- Number of special words

In [None]:
def featurize(texts):
    """Compute per-text features for ``texts``.

    The visible code measures each text's length in characters and its token
    count after two different preprocessing pipelines.

    NOTE(review): as far as this cell shows, the function has no ``return``
    statement and the ``n_*`` lists are created but never filled - this looks
    like work in progress; confirm before relying on it.

    Args:
        texts: Collection of strings. It is iterated three times below, so it
            must be re-iterable (not a one-shot generator).
    """
    ### Character length of each raw text
    length_in_charactes = []
    for text in texts:
        length_in_charactes.append(len(text))
    
    ### Word (token) count of each text
    length_in_words = []

    # Pipeline 1: clean-up without keyword placeholders for numbers, currency, etc.
    preprocess_utils_1 = [
        remove_hyperlink,
        replace_newline,
        to_lower,
        remove_underscore,
        replace_emoji,
        unidecode,       # just normalize them (presumably the non-ASCII characters)
        remove_punctuation,
        remove_whitespace,
        tokenizer.tokenize,
        remove_stop_words,
        word_lemmatizer,
    ]
    texts_tokenized_1 = [
        preprocess_pipeline(sent, preprocess_utils_1) for sent in texts
    ]
    # One token count per text, taken from pipeline 1.
    for text in texts_tokenized_1:
        length_in_words.append(len(text))
    
    ### Preprocess for keywords count
    # Pipeline 2: same clean-up, but special items are first replaced by placeholder keywords.
    preprocess_utils_2 = [
        remove_hyperlink,
        replace_newline,
        to_lower,
        remove_underscore,
        replace_currency,
        replace_number,
        replace_emoji,
        replace_non_ascii,
        replace_tag,
        replace_exclamation,
        replace_question,
        replace_slash,
        replace_colon,
        remove_punctuation,
        remove_whitespace,
        tokenizer.tokenize,
        remove_stop_words,
        word_lemmatizer,
    ]
    texts_tokenized_2 = [
        preprocess_pipeline(sent, preprocess_utils_2) for sent in texts
    ]

    # Per-keyword counters; none of these is appended to in the visible code.
    n_emojis = []
    n_nonascii = []
    n_currancy = []
    n_number = []
    n_tags = []
    n_excamation = []
    n_question = []
    n_slash = []
    n_colon = []
    # NOTE(review): this appends to length_in_words a second time, so the list ends up
    # with two entries per text (pipeline-1 counts followed by pipeline-2 counts).
    # Presumably the keyword counters above were meant to be filled here - confirm.
    for text in texts_tokenized_2:
        length_in_words.append(len(text))

    
