# Text Pre-processing 

In [11]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
import glob
from IPython.display import display, HTML


# Turn off pandas' chained-assignment check for the whole session
# (None = neither warn nor raise when assigning into a possible copy).
pd.set_option("mode.chained_assignment", None)


In [27]:
# Locate the raw data: every CSV file directly inside the sk_data1 folder.
# NOTE(review): this is an absolute, machine-specific path — adjust it to where your copy lives.
sk_data1_folder = "/Users/nacosta/Documents/research_topic/sk_data1"
# glob.glob() returns matches in arbitrary (filesystem-dependent) order. Files are
# selected by position further down, so sort to get the same file on every run.
sk_data1_files = sorted(glob.glob(sk_data1_folder + "/*.csv"))

In [28]:
# Load the first CSV of the folder into two independent DataFrames.
if not sk_data1_files:
    # Fail with an explicit message instead of a bare IndexError on [0].
    raise FileNotFoundError(f"No CSV files found in {sk_data1_folder!r}")
least_vic_df = pd.read_csv(sk_data1_files[0])
# A chained `full_df = least_vic_df = ...` would bind both names to the SAME object,
# so an in-place edit through one name would silently change the other.
full_df = least_vic_df.copy()


In [29]:
# Inspect the column labels of the loaded table.
full_df.columns

Index(['Name', 'Country', 'Years active', 'Proven victims', 'Possible victims',
       'Notes'],
      dtype='object')

In [30]:
# Preview the first rows of the loaded table (head() defaults to 5 rows).
display(full_df.head())

Unnamed: 0,Name,Country,Years active,Proven victims,Possible victims,Notes
0,Charlie Brandt,United States,1971 to 2004,4,29,Committed suicide by hanging after murdering h...
1,Robert Black,United Kingdom\r\nIreland (suspected)\r\nWest ...,1981 to 1986,4,18+,"Convicted of kidnapping, raping and murdering ..."
2,Max Gufler,Austria,1946 to 1958,4,18,"Poisoned and drowned four women, but suspected..."
3,Ernesto Picchioni,Italy,1949 and earlier,4,16,Murdered people who approached his home; died ...
4,Baekuni,Indonesia,1993 to 2010,4,14,Pedophile who raped and killed young boys; ini...


In [31]:
# Keep only the free-text 'Notes' column. Take an explicit copy so that columns added
# to `df` later are written to an independent frame rather than to a slice of `full_df`.
df = full_df[['Notes']].copy()

In [34]:
# Build the working frame with a single string column, 'text'.
# It is derived from `full_df` rather than from df['Notes'] so the cell can be re-run:
# reading df['Notes'] and then dropping it would raise KeyError on a second run.
# Missing notes become "" — astype(str) alone would turn NaN into the literal word "nan".
df = pd.DataFrame({"text": full_df["Notes"].fillna("").astype(str)})

---

## Lower Casing  
- common text processing technique  
- by converting text (e.g. 'string', 'String', 'STRING') to a uniform case, we can treat all variants of a word the same way  
- helpful for text featurization techniques like **frequency** and **term frequency-inverse document frequency (tf-idf)**, as it merges different casings of the same word, which reduces duplication and gives correct counts / tf-idf values.  
- may not be helpful for tasks like **Part of Speech tagging** (where proper casing gives some information about nouns and so on) and **Sentiment Analysis** (where upper casing can signal anger and so on)  
- by default, lower casing is done by most modern vectorizers and tokenizers, such as sklearn's TfidfVectorizer and the Keras Tokenizer. *You may need to turn it off (e.g. `lowercase=False` / `lower=False`) depending on the use case.*

In [35]:
# Add a lower-cased version of the text next to the original column, then preview.
lowercase_text = df["text"].str.lower()
df["text_lower"] = lowercase_text
df.head()

Unnamed: 0,text,text_lower
0,Committed suicide by hanging after murdering h...,committed suicide by hanging after murdering h...
1,"Convicted of kidnapping, raping and murdering ...","convicted of kidnapping, raping and murdering ..."
2,"Poisoned and drowned four women, but suspected...","poisoned and drowned four women, but suspected..."
3,Murdered people who approached his home; died ...,murdered people who approached his home; died ...
4,Pedophile who raped and killed young boys; ini...,pedophile who raped and killed young boys; ini...


## Punctuation Removal  
- carefully choose the list of punctuation characters to exclude, depending on the use case.  
        `string.punctuation` in Python contains the following punctuation symbols: `` !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ ``



In [36]:
# Remove the 'text_lower' demo column added by the lower-casing cell.
# errors="ignore" keeps the cell re-runnable: once the column is gone, a plain drop
# would raise KeyError on the next run.
df = df.drop(columns=["text_lower"], errors="ignore")
# Default set of characters stripped by remove_punctuation().
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text, punct_to_remove=None):
    """Return `text` with punctuation characters deleted.

    Args:
        text: the string to clean.
        punct_to_remove: optional string whose characters are removed. When omitted,
            the module-level PUNCT_TO_REMOVE is used (looked up at call time).

    Returns:
        A new string with those characters removed; all other characters are kept as-is.
    """
    if punct_to_remove is None:
        punct_to_remove = PUNCT_TO_REMOVE
    # Three-argument maketrans: every character of the third argument maps to None,
    # which makes translate() delete it.
    return text.translate(str.maketrans('', '', punct_to_remove))

# Strip punctuation from every row of 'text' into a new column, then preview the result.
df["text_wo_punct"] = df["text"].apply(remove_punctuation)
df.head()

Unnamed: 0,text,text_wo_punct
0,Committed suicide by hanging after murdering h...,Committed suicide by hanging after murdering h...
1,"Convicted of kidnapping, raping and murdering ...",Convicted of kidnapping raping and murdering f...
2,"Poisoned and drowned four women, but suspected...",Poisoned and drowned four women but suspected ...
3,Murdered people who approached his home; died ...,Murdered people who approached his home died o...
4,Pedophile who raped and killed young boys; ini...,Pedophile who raped and killed young boys init...


---