In [26]:
import pandas as pd
from nltk import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pyap
import time
import textstat
from prettytable import PrettyTable

In [19]:

def sentence_tokenizer(text):
    tokenizer = PunktSentenceTokenizer()
    return tokenizer.tokenize(text)

def word_tokenizer(text):
    return word_tokenize(text)

def find_emoticons(text):
    return re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

def find_stop_words(text):
    stop_words = set(stopwords.words('english'))
    return [word for word in text if word in stop_words]

def find_address(text):
    address = pyap.parse(text, country='US')
    return [str(addr) for addr in address]

def find_phone_numbers(text):
    return re.findall(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', text)

def find_account_numbers(text):
    return re.findall(r'\b\d{9}\b', text)

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

def stem_lem(text):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    return [stemmer.stem(lemmatizer.lemmatize(word)) for word in text]


In [20]:
train_df = pd.read_csv('./data/train.csv')
train_df.columns = ['0', 'Id', 'Timestamp', 'Query', 'User', 'Text']

train_df_cleaned = train_df.copy()

## Mark time
normal_start = time.time()

train_df['Sentences'] = train_df['Text'].apply(sentence_tokenizer)
train_df['Words'] = train_df['Text'].apply(word_tokenizer)
train_df['Emoticons'] = train_df['Text'].apply(find_emoticons)
train_df['StopWords'] = train_df['Words'].apply(find_stop_words)
train_df['Addresses'] = train_df['Text'].apply(find_address)
train_df['PhoneNumbers'] = train_df['Text'].apply(find_phone_numbers)
train_df['AccountNumbers'] = train_df['Text'].apply(find_account_numbers)

normal_end = time.time()

In [None]:
cleaned_start = time.time()

train_df_cleaned['Text'] = train_df_cleaned['Text'].apply(preprocess_text)

train_df_cleaned['Sentences'] = train_df_cleaned['Text'].apply(sentence_tokenizer)
train_df_cleaned['Words'] = train_df_cleaned['Text'].apply(word_tokenizer)
train_df_cleaned['Emoticons'] = train_df_cleaned['Text'].apply(find_emoticons)
train_df_cleaned['StopWords'] = train_df_cleaned['Words'].apply(find_stop_words)
train_df_cleaned['Addresses'] = train_df_cleaned['Text'].apply(find_address)
train_df_cleaned['PhoneNumbers'] = train_df_cleaned['Text'].apply(find_phone_numbers)
train_df_cleaned['AccountNumbers'] = train_df_cleaned['Text'].apply(find_account_numbers)

train_df_cleaned['Words'] = train_df_cleaned['Words'].apply(stem_lem)
cleaned_end = time.time()

In [21]:
stats_before = {
    'avg_sent_length': train_df['Sentences'].apply(len).mean(),
    'max_sent_length': train_df['Sentences'].apply(len).max(),
    'min_sent_length': train_df['Sentences'].apply(len).min(),
    'sent_count': train_df['Sentences'].apply(len).sum(),
    'word_count': train_df['Words'].apply(len).sum(),
    'vocab_size': len(set([word for words in train_df['Words'] for word in words])),
    'max_word_length': max([len(word) for words in train_df['Words'] for word in words]),
    'num_emoticons': train_df['Emoticons'].apply(len).sum(),
    'num_stop_words': train_df['StopWords'].apply(len).sum(),
    'num_lowercase': train_df['Words'].apply(lambda x: len([word for word in x if word.islower()])).sum(),
    'num_special_chars': train_df['Text'].apply(lambda x: len([char for char in x if not char.isalnum()])).sum(),
    'num_addresses': train_df['Addresses'].apply(len).sum(),
    'num_phone_numbers': train_df['PhoneNumbers'].apply(len).sum(),
    'num_account_numbers': train_df['AccountNumbers'].apply(len).sum(),
    'processing_time': normal_end - normal_start
    
}

print(stats_before)

{'avg_sent_length': 1.7032660645412903, 'max_sent_length': 64, 'min_sent_length': 1, 'sent_count': 2725224, 'word_count': 26247410, 'vocab_size': 874199, 'max_word_length': 136, 'num_emoticons': 14450, 'num_stop_words': 7556112, 'num_lowercase': 18166684, 'num_special_chars': 27811922, 'num_addresses': 731, 'num_phone_numbers': 946, 'num_account_numbers': 114, 'processing_time': 315.4849579334259}


In [22]:
stats_after = {
    'avg_sent_length': train_df_cleaned['Sentences'].apply(len).mean(),
    'max_sent_length': train_df_cleaned['Sentences'].apply(len).max(),
    'min_sent_length': train_df_cleaned['Sentences'].apply(len).min(),
    'sent_count': train_df_cleaned['Sentences'].apply(len).sum(),
    'word_count': train_df_cleaned['Words'].apply(len).sum(),
    'vocab_size': len(set([word for words in train_df_cleaned['Words'] for word in words])),
    'max_word_length': max([len(word) for words in train_df_cleaned['Words'] for word in words]),
    'num_emoticons': train_df_cleaned['Emoticons'].apply(len).sum(),
    'num_stop_words': train_df_cleaned['StopWords'].apply(len).sum(),
    'num_lowercase': train_df_cleaned['Words'].apply(lambda x: len([word for word in x if word.islower()])).sum(),
    'num_special_chars': train_df_cleaned['Text'].apply(lambda x: len([char for char in x if not char.isalnum()])).sum(),
    'num_addresses': train_df_cleaned['Addresses'].apply(len).sum(),
    'num_phone_numbers': train_df_cleaned['PhoneNumbers'].apply(len).sum(),
    'num_account_numbers': train_df_cleaned['AccountNumbers'].apply(len).sum(),
    'processing_time': cleaned_end - cleaned_start
}

print(stats_after)

{'avg_sent_length': 0.9997556248472655, 'max_sent_length': 1, 'min_sent_length': 0, 'sent_count': 1599608, 'word_count': 12350113, 'vocab_size': 726849, 'max_word_length': 123, 'num_emoticons': 0, 'num_stop_words': 6053, 'num_lowercase': 12350113, 'num_special_chars': 10698244, 'num_addresses': 15, 'num_phone_numbers': 0, 'num_account_numbers': 0, 'processing_time': 1176.6899199485779}


In [27]:
## Output Stats

table = PrettyTable()
table.field_names = ['Stats', 'Before', 'After', 'Diff']
table.add_row(['Avg Sentence Length', stats_before['avg_sent_length'], stats_after['avg_sent_length'], stats_after['avg_sent_length'] - stats_before['avg_sent_length']])
table.add_row(['Max Sentence Length', stats_before['max_sent_length'], stats_after['max_sent_length'], stats_after['max_sent_length'] - stats_before['max_sent_length']])
table.add_row(['Min Sentence Length', stats_before['min_sent_length'], stats_after['min_sent_length'], stats_after['min_sent_length'] - stats_before['min_sent_length']])
table.add_row(['Sentence Count', stats_before['sent_count'], stats_after['sent_count'], stats_after['sent_count'] - stats_before['sent_count']])
table.add_row(['Word Count', stats_before['word_count'], stats_after['word_count'], stats_after['word_count'] - stats_before['word_count']])
table.add_row(['Vocab Size', stats_before['vocab_size'], stats_after['vocab_size'], stats_after['vocab_size'] - stats_before['vocab_size']])
table.add_row(['Max Word Length', stats_before['max_word_length'], stats_after['max_word_length'], stats_after['max_word_length'] - stats_before['max_word_length']])
table.add_row(['Num Emoticons', stats_before['num_emoticons'], stats_after['num_emoticons'], stats_after['num_emoticons'] - stats_before['num_emoticons']])
table.add_row(['Num Stop Words', stats_before['num_stop_words'], stats_after['num_stop_words'], stats_after['num_stop_words'] - stats_before['num_stop_words']])
table.add_row(['Num Lowercase Words', stats_before['num_lowercase'], stats_after['num_lowercase'], stats_after['num_lowercase'] - stats_before['num_lowercase']])
table.add_row(['Num Special Chars', stats_before['num_special_chars'], stats_after['num_special_chars'], stats_after['num_special_chars'] - stats_before['num_special_chars']])
table.add_row(['Num Addresses', stats_before['num_addresses'], stats_after['num_addresses'], stats_after['num_addresses'] - stats_before['num_addresses']])
table.add_row(['Num Phone Numbers', stats_before['num_phone_numbers'], stats_after['num_phone_numbers'], stats_after['num_phone_numbers'] - stats_before['num_phone_numbers']])
table.add_row(['Num Account Numbers', stats_before['num_account_numbers'], stats_after['num_account_numbers'], stats_after['num_account_numbers'] - stats_before['num_account_numbers']])
table.add_row(['Processing Time', stats_before['processing_time'], stats_after['processing_time'], stats_after['processing_time'] - stats_before['processing_time']])
print("Table 1: Stats Before and After Preprocessing: - indicates decreased value")
print(table)

+---------------------+--------------------+--------------------+---------------------+
|        Stats        |       Before       |       After        |         Diff        |
+---------------------+--------------------+--------------------+---------------------+
| Avg Sentence Length | 1.7032660645412903 | 0.9997556248472655 | -0.7035104396940247 |
| Max Sentence Length |         64         |         1          |         -63         |
| Min Sentence Length |         1          |         0          |          -1         |
|    Sentence Count   |      2725224       |      1599608       |       -1125616      |
|      Word Count     |      26247410      |      12350113      |      -13897297      |
|      Vocab Size     |       874199       |       726849       |       -147350       |
|   Max Word Length   |        136         |        123         |         -13         |
|    Num Emoticons    |       14450        |         0          |        -14450       |
|    Num Stop Words   |      755

In [28]:
avg_reading_ease = train_df_cleaned['Text'].apply(textstat.flesch_reading_ease).mean()
avg_grade_level = train_df_cleaned['Text'].apply(textstat.flesch_kincaid_grade).mean()
lexical_diversity = stats_after['vocab_size'] / stats_after['word_count']

print(f'Average Reading Ease: {avg_reading_ease}')
print(f'Average Grade Level: {avg_grade_level}')
print(f'Lexical Diversity: {lexical_diversity}')

Average Reading Ease: 66.28469337793335
Average Grade Level: 5.922668264167667
Lexical Diversity: 0.05885363154167091


In [30]:
## Run Time

print(f'Normal Run Time: {round((normal_end - normal_start) / 60, 2)} minutes')
print(f'Cleaned Run Time: {round((cleaned_end - cleaned_start) / 60, 2)} minutes')
print(f'Total Run Time: {round((cleaned_end - normal_start) / 60, 2)} minutes')

Normal Run Time: 5.26 minutes
Cleaned Run Time: 19.61 minutes
Total Run Time: 24.87 minutes
