## References

Pre-Processing for NLP: https://towardsdatascience.com/text-cleaning-methods-for-natural-language-processing-f2fc1796e8c7

In [118]:
import pandas as pd
import numpy as np

## Load the data

In [119]:
data=pd.read_table('doj_api_data_new.csv',delimiter=',')

In [120]:
print(data)

       Unnamed: 0                                               body
0               0  WASHINGTON - INTERPOL Washington, the United S...
1               1  WASHINGTON - A joint investigation between the...
2               2  USDOJ: INTERPOL Washington: Updates\n\n \n\nDe...
3               3  USDOJ: INTERPOL Washington: Updates\n\n \n\n\n...
4               4  WASHINGTON - The United States has extradited ...
...           ...                                                ...
48045       48045  DENVER – Francis Hector Calar, age 54, of Colo...
48046       48046  WICHITA, KAN. – A former director of the Montg...
48047       48047  WASHINGTON –William Smallwood, 23, has been se...
48048       48048  SPRINGFIELD, Mo. – Tammy Dickinson, United Sta...
48049       48049  SPRINGFIELD, Mo. – Tammy Dickinson, United Sta...

[48050 rows x 2 columns]


In [121]:
data.head()

Unnamed: 0.1,Unnamed: 0,body
0,0,"WASHINGTON - INTERPOL Washington, the United S..."
1,1,WASHINGTON - A joint investigation between the...
2,2,USDOJ: INTERPOL Washington: Updates\n\n \n\nDe...
3,3,USDOJ: INTERPOL Washington: Updates\n\n \n\n\n...
4,4,WASHINGTON - The United States has extradited ...


In [122]:
print(f'There are {data.shape[0]} rows and {data.shape[1]} columns.')  # f-string

There are 48050 rows and 2 columns.


In [102]:
data.head

<bound method NDFrame.head of        Unnamed: 0                                               body
0               0  WASHINGTON - INTERPOL Washington, the United S...
1               1  WASHINGTON - A joint investigation between the...
2               2  USDOJ: INTERPOL Washington: Updates\n\n \n\nDe...
3               3  USDOJ: INTERPOL Washington: Updates\n\n \n\n\n...
4               4  WASHINGTON - The United States has extradited ...
...           ...                                                ...
48045       48045  DENVER – Francis Hector Calar, age 54, of Colo...
48046       48046  WICHITA, KAN. – A former director of the Montg...
48047       48047  WASHINGTON –William Smallwood, 23, has been se...
48048       48048  SPRINGFIELD, Mo. – Tammy Dickinson, United Sta...
48049       48049  SPRINGFIELD, Mo. – Tammy Dickinson, United Sta...

[48050 rows x 2 columns]>

## Describe the data
This function returns the count, mean, standard deviation, minimum and maximum values and the quantiles of the data.

In [103]:
data.describe()

Unnamed: 0.1,Unnamed: 0
count,48050.0
mean,24024.5
std,13870.984554
min,0.0
25%,12012.25
50%,24024.5
75%,36036.75
max,48049.0


In [104]:
print(f'There are {data.shape[0]} rows and {data.shape[1]} columns.')  # f-string

There are 48050 rows and 2 columns.


In [105]:
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48050 entries, 0 to 48049
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  48050 non-null  int64 
 1   body        48050 non-null  object
dtypes: int64(1), object(1)
memory usage: 750.9+ KB


In [106]:
data.drop(data.columns[0], axis=1)

Unnamed: 0,body
0,"WASHINGTON - INTERPOL Washington, the United S..."
1,WASHINGTON - A joint investigation between the...
2,USDOJ: INTERPOL Washington: Updates\n\n \n\nDe...
3,USDOJ: INTERPOL Washington: Updates\n\n \n\n\n...
4,WASHINGTON - The United States has extradited ...
...,...
48045,"DENVER – Francis Hector Calar, age 54, of Colo..."
48046,"WICHITA, KAN. – A former director of the Montg..."
48047,"WASHINGTON –William Smallwood, 23, has been se..."
48048,"SPRINGFIELD, Mo. – Tammy Dickinson, United Sta..."


In [107]:
data.isnull().sum() # lots of columns don't have missingness

Unnamed: 0    0
body          0
dtype: int64

## Pre-Processing
### Normalization

In [108]:
train_data = pd.read_csv('doj_api_data_new.csv')

In [109]:
import re

def  clean_text(data, text_field, new_text_field_name):
    data[new_text_field_name] = data[text_field].str.lower()
    data[new_text_field_name] = data[new_text_field_name].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    # remove numbers
    data[new_text_field_name] = data[new_text_field_name].apply(lambda elem: re.sub(r"\d+", "", elem))
    
    return data

In [110]:
data_clean = clean_text(train_data, 'body', 'text_clean')

data_clean.head()

Unnamed: 0.1,Unnamed: 0,body,text_clean
0,0,"WASHINGTON - INTERPOL Washington, the United S...",washington interpol washington the united sta...
1,1,WASHINGTON - A joint investigation between the...,washington a joint investigation between the ...
2,2,USDOJ: INTERPOL Washington: Updates\n\n \n\nDe...,usdoj interpol washington updatesdepartment of...
3,3,USDOJ: INTERPOL Washington: Updates\n\n \n\n\n...,usdoj interpol washington updatesdepartment of...
4,4,WASHINGTON - The United States has extradited ...,washington the united states has extradited s...


### Stop Words

In [111]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
data_clean['text_clean'] = data_clean['text_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
data_clean.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/starrcorbin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0.1,Unnamed: 0,body,text_clean
0,0,"WASHINGTON - INTERPOL Washington, the United S...",washington interpol washington united states n...
1,1,WASHINGTON - A joint investigation between the...,washington joint investigation national oceani...
2,2,USDOJ: INTERPOL Washington: Updates\n\n \n\nDe...,usdoj interpol washington updatesdepartment ju...
3,3,USDOJ: INTERPOL Washington: Updates\n\n \n\n\n...,usdoj interpol washington updatesdepartment ju...
4,4,WASHINGTON - The United States has extradited ...,washington united states extradited sulejman m...


### Stemming and Tokenization
Use the PorterStemmer to stem words to reduce words to their root form. 

In [112]:
import nltk 
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
data_clean['text_tokens'] = data_clean['text_clean'].apply(lambda x: word_tokenize(x))
data_clean.head()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/starrcorbin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0.1,Unnamed: 0,body,text_clean,text_tokens
0,0,"WASHINGTON - INTERPOL Washington, the United S...",washington interpol washington united states n...,"[washington, interpol, washington, united, sta..."
1,1,WASHINGTON - A joint investigation between the...,washington joint investigation national oceani...,"[washington, joint, investigation, national, o..."
2,2,USDOJ: INTERPOL Washington: Updates\n\n \n\nDe...,usdoj interpol washington updatesdepartment ju...,"[usdoj, interpol, washington, updatesdepartmen..."
3,3,USDOJ: INTERPOL Washington: Updates\n\n \n\n\n...,usdoj interpol washington updatesdepartment ju...,"[usdoj, interpol, washington, updatesdepartmen..."
4,4,WASHINGTON - The United States has extradited ...,washington united states extradited sulejman m...,"[washington, united, states, extradited, sulej..."


The code below uses the PorterStemmer method from NLTK to apply stemming to the text_tokens and outputs the processed text to a new column.

In [113]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize

In [115]:
def word_stemmer(text):
    stem_text = [PorterStemmer().stem(i) for i in text]
    return stem_text
data_clean['text_tokens_stem'] = data_clean['text_tokens'].apply(lambda x: word_stemmer(x))
data_clean.head()

Unnamed: 0.1,Unnamed: 0,body,text_clean,text_tokens,text_tokens_stem
0,0,"WASHINGTON - INTERPOL Washington, the United S...",washington interpol washington united states n...,"[washington, interpol, washington, united, sta...","[washington, interpol, washington, unit, state..."
1,1,WASHINGTON - A joint investigation between the...,washington joint investigation national oceani...,"[washington, joint, investigation, national, o...","[washington, joint, investig, nation, ocean, a..."
2,2,USDOJ: INTERPOL Washington: Updates\n\n \n\nDe...,usdoj interpol washington updatesdepartment ju...,"[usdoj, interpol, washington, updatesdepartmen...","[usdoj, interpol, washington, updatesdepart, j..."
3,3,USDOJ: INTERPOL Washington: Updates\n\n \n\n\n...,usdoj interpol washington updatesdepartment ju...,"[usdoj, interpol, washington, updatesdepartmen...","[usdoj, interpol, washington, updatesdepart, j..."
4,4,WASHINGTON - The United States has extradited ...,washington united states extradited sulejman m...,"[washington, united, states, extradited, sulej...","[washington, unit, state, extradit, sulejman, ..."


### Lemmatization
Reduce words to their root ("lemma") form. 

In [116]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def word_lemmatizer(text):
    lem_text = [WordNetLemmatizer().lemmatize(i) for i in text]
    return lem_text

data_clean['text_tokens_lemma'] = data_clean['text_tokens'].apply(lambda x: word_lemmatizer(x))
data_clean.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/starrcorbin/nltk_data...


Unnamed: 0.1,Unnamed: 0,body,text_clean,text_tokens,text_tokens_stem,text_tokens_lemma
0,0,"WASHINGTON - INTERPOL Washington, the United S...",washington interpol washington united states n...,"[washington, interpol, washington, united, sta...","[washington, interpol, washington, unit, state...","[washington, interpol, washington, united, sta..."
1,1,WASHINGTON - A joint investigation between the...,washington joint investigation national oceani...,"[washington, joint, investigation, national, o...","[washington, joint, investig, nation, ocean, a...","[washington, joint, investigation, national, o..."
2,2,USDOJ: INTERPOL Washington: Updates\n\n \n\nDe...,usdoj interpol washington updatesdepartment ju...,"[usdoj, interpol, washington, updatesdepartmen...","[usdoj, interpol, washington, updatesdepart, j...","[usdoj, interpol, washington, updatesdepartmen..."
3,3,USDOJ: INTERPOL Washington: Updates\n\n \n\n\n...,usdoj interpol washington updatesdepartment ju...,"[usdoj, interpol, washington, updatesdepartmen...","[usdoj, interpol, washington, updatesdepart, j...","[usdoj, interpol, washington, updatesdepartmen..."
4,4,WASHINGTON - The United States has extradited ...,washington united states extradited sulejman m...,"[washington, united, states, extradited, sulej...","[washington, unit, state, extradit, sulejman, ...","[washington, united, state, extradited, sulejm..."


### Part of Speech (POS) tagging and chunking
Part of speech (POS) tagging is a method to categorise words which gives some information relating to the way in which that word is used in speech.

In [117]:
def word_pos_tagger(text):
    pos_tagged_text = nltk.pos_tag(text)
    return pos_tagged_text

nltk.download('averaged_perceptron_tagger')

data_clean['text_tokens_pos_tagged'] = data_clean['text_tokens'].apply(lambda x: word_pos_tagger(x))
data_clean.head()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/starrcorbin/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Unnamed: 0.1,Unnamed: 0,body,text_clean,text_tokens,text_tokens_stem,text_tokens_lemma,text_tokens_pos_tagged
0,0,"WASHINGTON - INTERPOL Washington, the United S...",washington interpol washington united states n...,"[washington, interpol, washington, united, sta...","[washington, interpol, washington, unit, state...","[washington, interpol, washington, united, sta...","[(washington, NN), (interpol, VBZ), (washingto..."
1,1,WASHINGTON - A joint investigation between the...,washington joint investigation national oceani...,"[washington, joint, investigation, national, o...","[washington, joint, investig, nation, ocean, a...","[washington, joint, investigation, national, o...","[(washington, NN), (joint, NN), (investigation..."
2,2,USDOJ: INTERPOL Washington: Updates\n\n \n\nDe...,usdoj interpol washington updatesdepartment ju...,"[usdoj, interpol, washington, updatesdepartmen...","[usdoj, interpol, washington, updatesdepart, j...","[usdoj, interpol, washington, updatesdepartmen...","[(usdoj, JJ), (interpol, NN), (washington, NN)..."
3,3,USDOJ: INTERPOL Washington: Updates\n\n \n\n\n...,usdoj interpol washington updatesdepartment ju...,"[usdoj, interpol, washington, updatesdepartmen...","[usdoj, interpol, washington, updatesdepart, j...","[usdoj, interpol, washington, updatesdepartmen...","[(usdoj, JJ), (interpol, NN), (washington, NN)..."
4,4,WASHINGTON - The United States has extradited ...,washington united states extradited sulejman m...,"[washington, united, states, extradited, sulej...","[washington, unit, state, extradit, sulejman, ...","[washington, united, state, extradited, sulejm...","[(washington, NN), (united, JJ), (states, NNS)..."
