In [25]:
import numpy as np
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [32]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
df = pd.read_csv('chunk4.csv')

In [3]:
df1_lower = df.copy()
df1_lower[['article','summary']] = df[['article','summary']].applymap(lambda x: x.lower())
df1_lower.head()

Unnamed: 0,article,summary
0,by . jonathan petre . and jo knowsley . warnin...,education secretary michael gove warned about ...
1,an austrian girl has said she feels she 'can r...,sabina selimovic spoke via text message to par...
2,british muslims are being urged to wear a new ...,£22 headscarf is backed by muslim groups as a ...
3,by . associated press reporter . officials say...,a woman's body was found on mount rainer sunda...
4,"more issues in ferguson: former ferguson, miss...","jaris hayden, 29, will be arraigned december 3..."


# Remove punctuations

In [21]:
#remove punctuation
df1_lower[['article','summary']] = df[['article','summary']].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '' , str(x)))
df1_lower.head()

# Remove Stopwords

In [14]:
stop_words = set(stopwords.words('english'))
stop_words.add('subject')
stop_words.add('http')
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
df1_lower[['article','summary']] = df[['article','summary']].apply(lambda x: remove_stopwords(x))
df1_lower.head()

Unnamed: 0,article,summary
0,0 By . Jonathan Petre . Jo Knowsley . Warnin.....,0 Education Secretary Michael Gove warned ... ...
1,0 By . Jonathan Petre . Jo Knowsley . Warnin.....,0 Education Secretary Michael Gove warned ... ...
2,0 By . Jonathan Petre . Jo Knowsley . Warnin.....,0 Education Secretary Michael Gove warned ... ...
3,0 By . Jonathan Petre . Jo Knowsley . Warnin.....,0 Education Secretary Michael Gove warned ... ...
4,0 By . Jonathan Petre . Jo Knowsley . Warnin.....,0 Education Secretary Michael Gove warned ... ...


# Rephrase text

In [40]:
df1_lower["article"] = df1_lower["article"].apply(lambda x: re.sub('b[w-]+?@w+?.w{2,4}b', 'emailadd',x))
#url
df1_lower["article"] = df1_lower["article"].apply(lambda x:re.sub('(http[s]?S+)|(w+.[A-Za-z]{2,4}S*)', 'urladd', x))
df1_lower.head()

Unnamed: 0,article,summary
0,0 0 by jonathan petr an jo knourladd ... 1 0 b...,0 0 eucat secretari michael gove urladd... 1 0...
1,0 0 by jonathan petr an jo knourladd ... 1 0 b...,0 0 eucat secretari michael gove urladd... 1 0...
2,0 0 by jonathan petr an jo knourladd ... 1 0 b...,0 0 eucat secretari michael gove urladd... 1 0...
3,0 0 by jonathan petr an jo knourladd ... 1 0 b...,0 0 eucat secretari michael gove urladd... 1 0...
4,0 0 by jonathan petr an jo knourladd ... 1 0 b...,0 0 eucat secretari michael gove urladd... 1 0...


In [39]:
df1_lower["summary"] = df1_lower["summary"].apply(lambda x: re.sub('b[w-]+?@w+?.w{2,4}b', 'emailadd',x))
#url
df1_lower["summary"] = df1_lower["summary"].apply(lambda x:re.sub('(http[s]?S+)|(w+.[A-Za-z]{2,4}S*)', 'urladd', x))
df1_lower.head()

Unnamed: 0,article,summary
0,0 0 by jonathan petr an jo knowsley ... 1 0 by...,0 0 eucat secretari michael gove urladd... 1 0...
1,0 0 by jonathan petr an jo knowsley ... 1 0 by...,0 0 eucat secretari michael gove urladd... 1 0...
2,0 0 by jonathan petr an jo knowsley ... 1 0 by...,0 0 eucat secretari michael gove urladd... 1 0...
3,0 0 by jonathan petr an jo knowsley ... 1 0 by...,0 0 eucat secretari michael gove urladd... 1 0...
4,0 0 by jonathan petr an jo knowsley ... 1 0 by...,0 0 eucat secretari michael gove urladd... 1 0...


# Remove words and digits containing digits

In [24]:
df1_lower[['article','summary']] = df1_lower[['article','summary']].apply(lambda x: re.sub('W*dw*','',str(x)))
df1_lower.head()

Unnamed: 0,article,summary
0,0 0 By Jonathan Petre an Jo Kn...,0 0 Eucation Secretary Michael G...
1,0 0 By Jonathan Petre an Jo Kn...,0 0 Eucation Secretary Michael G...
2,0 0 By Jonathan Petre an Jo Kn...,0 0 Eucation Secretary Michael G...
3,0 0 By Jonathan Petre an Jo Kn...,0 0 Eucation Secretary Michael G...
4,0 0 By Jonathan Petre an Jo Kn...,0 0 Eucation Secretary Michael G...


# Stemming and Lemmatization

**#stemming**

In [34]:
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])
df1_lower["article"] = df1_lower["article"].apply(lambda x: stem_words(x))
df1_lower["summary"] = df1_lower["summary"].apply(lambda x: stem_words(x))

#Lemmatization



In [35]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
df1_lower["article"] = df1_lower["article"].apply(lambda text: lemmatize_words(text))
df1_lower["summary"] = df1_lower["summary"].apply(lambda text: lemmatize_words(text))

# Remove Extra Spaces

In [46]:
df1_lower["article"] = df1_lower["article"].apply(lambda text: re.sub(' +', ' ', str(text)))
df1_lower["summary"] = df1_lower["summary"].apply(lambda text: re.sub(' +', ' ', str(text)))