# Description of Notebook

This notebook preprocesses and cleans the scraped Wikipedia pages to prepare them for generating TF-IDF vectors and word embeddings.

In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string
import numpy as np
from collections import Counter

# Data loading

Load the scraped Wikipedia CSV file.

In [15]:
# Load the scraped pages from wiki.csv (path is relative to the working directory).
df = pd.read_csv('wiki.csv')

In [16]:
df  # bare expression: renders the loaded frame as this cell's output

Unnamed: 0,crime,content
0,Organized_crime,Organized crime is a category of transnational...
1,Violent_crime,A violent crime or crime of violence is a crim...
2,Money_laundering,\nMoney laundering is the illegal process of c...
3,International_sanctions,International sanctions are political and econ...
4,Terrorism_financing,Terrorism financing is the provision of funds ...
5,Illegal_drug_trade,The illegal drug trade or drug trafficking is ...
6,Political_corruption,\nPolitical corruption is the use of powers by...
7,Human_trafficking,\nHuman trafficking is the trade of humans fo...
8,Sex_trafficking,Sex trafficking is human trafficking for the p...
9,Underground_poker,Underground poker is poker played in a venue t...


# Pre-processing steps

Perform all the pre-processing steps, such as lemmatizing, lower-case conversion, and removing stop words and punctuation.

In [2]:
def lemmatize_stemming(data):
    """Lemmatize every token of a wikipedia page's content.

    Despite the name, no stemming is performed: the text is tokenized with
    ``word_tokenize`` and each token is run through ``WordNetLemmatizer``
    with its default arguments.

    Args:
        data: Page content. It is coerced with ``str()``, so a plain string
            or a 0-d numpy string array both work.

    Returns:
        str: The lemmatized tokens, each preceded by a single space. The
        result therefore starts with a space when there is at least one
        token and is ``""`` when there are none.
    """
    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(str(data))
    # Each token keeps its leading space so the returned text is exactly
    # what a token-by-token " " + lemma concatenation would produce.
    return "".join(" " + lemmatizer.lemmatize(w) for w in tokens)

In [18]:
def convert_lower_case(data):
    """Return the wikipedia page content with every character lower-cased.

    The value comes straight from ``np.char.lower``, so it is a numpy
    string value rather than a plain ``str``.
    """
    lowered = np.char.lower(data)
    return lowered

In [19]:
def remove_numbers(data):
    """Drop every whitespace-separated token that contains a digit.

    The input is coerced with ``str()`` and split on whitespace; a token is
    discarded as soon as any of its characters is a digit (so "1999" and
    "2nd" are both removed). Surviving tokens are re-joined with single
    spaces.
    """
    text = str(data)
    kept = []
    for token in text.split():
        if any(ch.isdigit() for ch in token):
            continue
        kept.append(token)
    return ' '.join(kept)

In [20]:
def remove_stop_words(data):
    """Remove English stop words and one-character tokens from a page's content.

    Args:
        data: Page content. It is coerced with ``str()`` before tokenizing.

    Returns:
        str: The surviving tokens, each preceded by a single space (so the
        result starts with a space when anything survives, and is ``""``
        otherwise).
    """
    # A set gives constant-time membership checks; testing against the
    # stop-word list would scan it once for every token.
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(str(data))
    # Tokens of length 1 are dropped together with the stop words.
    return "".join(" " + w for w in words if w not in stop_words and len(w) > 1)

In [21]:
def remove_punctuation(data):
    """Replace punctuation in a page's content with spaces and delete commas.

    Args:
        data: Page content as a string (or numpy string array) accepted by
            ``np.char.replace``.

    Returns:
        The result of the chained ``np.char.replace`` calls, i.e. a numpy
        string array rather than a plain ``str``.
    """
    # The backslash is escaped explicitly ("\\]"): a bare "\]" is an invalid
    # escape sequence that Python warns about, although it denotes the same
    # two characters.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        # A single collapsing pass per symbol; longer runs of spaces are only
        # shortened, not guaranteed to be reduced to one.
        data = np.char.replace(data, "  ", " ")
    # Commas are deleted instead of being turned into spaces, so "1,000"
    # becomes "1000".
    data = np.char.replace(data, ',', '')
    return data

In [22]:
def remove_apostrophe(data):
    """Delete every apostrophe (') from the wikipedia page content.

    The value is returned as produced by ``np.char.replace``.
    """
    apostrophe, nothing = "'", ""
    return np.char.replace(data, apostrophe, nothing)

In [23]:
def preprocess(data):
    """Run the complete cleaning pipeline on one page's content.

    The steps are applied strictly in the order listed below. Punctuation
    removal, lemmatization and stop-word removal appear more than once
    because the lemmatizing step can leave a few stop words behind again.
    """
    pipeline = (
        convert_lower_case,
        remove_numbers,
        remove_punctuation,
        remove_apostrophe,
        remove_stop_words,
        lemmatize_stemming,
        remove_punctuation,
        lemmatize_stemming,
        remove_punctuation,
        remove_stop_words,
    )
    cleaned = data
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

Add a new column to the dataframe that contains the cleaned version of the `content` column.

In [24]:
# Clean each page's text and store the result next to the raw content.
df['processed_content'] = df['content'].apply(preprocess)

In [25]:
df  # bare expression: renders the frame, now including processed_content

Unnamed: 0,crime,content,processed_content
0,Organized_crime,Organized crime is a category of transnational...,organized crime category transnational nation...
1,Violent_crime,A violent crime or crime of violence is a crim...,violent crime crime violence crime offender p...
2,Money_laundering,\nMoney laundering is the illegal process of c...,money laundering illegal process concealing o...
3,International_sanctions,International sanctions are political and econ...,international sanction political economic dec...
4,Terrorism_financing,Terrorism financing is the provision of funds ...,terrorism financing provision fund providing ...
5,Illegal_drug_trade,The illegal drug trade or drug trafficking is ...,illegal drug trade drug trafficking global bl...
6,Political_corruption,\nPolitical corruption is the use of powers by...,political corruption use power government off...
7,Human_trafficking,\nHuman trafficking is the trade of humans fo...,human trafficking trade human purpose forced ...
8,Sex_trafficking,Sex trafficking is human trafficking for the p...,sex trafficking human trafficking purpose sex...
9,Underground_poker,Underground poker is poker played in a venue t...,underground poker poker played venue operatin...


Save the processed wikipedia file for further use

In [41]:
# Write the frame that carries `processed_content`; the row index is not saved.
df.to_csv('processed_wiki.csv', index=False)