In [4]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy

In [5]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [6]:
train_data.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [7]:
test_data.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


In [8]:
train_data = train_data[['excerpt']]
test_data = test_data[['excerpt']]

In [9]:
train_data['excerpt_lower'] = train_data['excerpt'].str.lower()   # First we need to convert the given texts to string and then apply case conversion methods
test_data['excerpt_lower'] = test_data['excerpt'].str.lower()

In [10]:
train_data.head()

Unnamed: 0,excerpt,excerpt_lower
0,When the young people returned to the ballroom...,when the young people returned to the ballroom...
1,"All through dinner time, Mrs. Fayre was somewh...","all through dinner time, mrs. fayre was somewh..."
2,"As Roger had predicted, the snow departed as q...","as roger had predicted, the snow departed as q..."
3,And outside before the palace a great garden w...,and outside before the palace a great garden w...
4,Once upon a time there were Three Bears who li...,once upon a time there were three bears who li...


In [11]:
test_data.head()

Unnamed: 0,excerpt,excerpt_lower
0,My hope lay in Jack's promise that he would ke...,my hope lay in jack's promise that he would ke...
1,Dotty continued to go to Mrs. Gray's every nig...,dotty continued to go to mrs. gray's every nig...
2,It was a bright and cheerful scene that greete...,it was a bright and cheerful scene that greete...
3,Cell division is the process by which a parent...,cell division is the process by which a parent...
4,Debugging is the process of finding and resolv...,debugging is the process of finding and resolv...


In [12]:
import requests

data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html')
content = data.text
print(content[2745:3948])

 <body> level - not in <div> or <table>  */
    text-align: justify;
    /* or left?? */
    text-indent: 1em;
    /* first-line indent */
    }
/* suppress indentation on paragraphs following heads  */
h2 + p, h3 + p, h4 + p {
    text-indent: 0
    }
/* tighter spacing for list item paragraphs */
dd, li {
    margin-top: 0.25em;
    margin-bottom: 0;
    line-height: 1.2em;
    /* a bit closer than p's */
    }
/* ************************************************************************
 * Head 2 is for chapter heads. 
 * ********************************************************************** */
h2 {
    /* text-align:center;  left-aligned by default. */
    margin-top: 3em;
    /* extra space above.. */
    margin-bottom: 2em;
    /* ..and below */
    clear: both;
    /* don't let sidebars overlap */
    }
/* ************************************************************************
 * Head 3 is for main-topic heads.
 * ***********************************

In [13]:
import re
from bs4 import BeautifulSoup

def strip_html_tags(text):
    """
    This function will remove the HTML tags & noise from the scraped data.
    """
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text

clean_content = strip_html_tags(content)
print(clean_content[1163:1957])

 darkness was
           upon the face of the deep. And the Spirit of God moved upon
           the face of the waters.
01:001:003 And God said, Let there be light: and there was light.
01:001:004 And God saw the light, that it was good: and God divided the
           light from the darkness.
01:001:005 And God called the light Day, and the darkness he called
           Night. And the evening and the morning were the first day.
01:001:006 And God said, Let there be a firmament in the midst of the
           waters, and let it divide the waters from the waters.
01:001:007 And God made the firmament, and divided the waters which were
           under the firmament from the waters which were above the
           firmament: and it was so.
01:001:008 And God called the firmament Heaven. A


In [14]:
train_data['excerpt_lower'] = train_data['excerpt_lower'].apply(strip_html_tags)
test_data['excerpt_lower'] = test_data['excerpt_lower'].apply(strip_html_tags)

In [15]:
import unicodedata

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [16]:
s = 'Sómě Áccěntěd těxt'
s

'Sómě Áccěntěd těxt'

In [17]:
remove_accented_chars(s)

'Some Accented text'

In [18]:
train_data['excerpt_lower'] = train_data['excerpt_lower'].apply(remove_accented_chars)
test_data['excerpt_lower'] = test_data['excerpt_lower'].apply(remove_accented_chars)

In [19]:
import re

def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

In [20]:
s = "Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂"
s

'Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂'

In [21]:
remove_special_characters(s, remove_digits=False)

'Well this was fun See you at 730 What do you think 9318 '

In [22]:
remove_special_characters(s)

'Well this was fun See you at  What do you think  '

In [23]:
train_data['excerpt_lower'] = train_data['excerpt_lower'].apply(remove_special_characters)
test_data['excerpt_lower'] = test_data['excerpt_lower'].apply(remove_special_characters)

In [24]:
s = "Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"
s

"Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"

In [25]:
import contractions

list(contractions.contractions_dict.items())[:10]

[("I'm", 'I am'),
 ("I'm'a", 'I am about to'),
 ("I'm'o", 'I am going to'),
 ("I've", 'I have'),
 ("I'll", 'I will'),
 ("I'll've", 'I will have'),
 ("I'd", 'I would'),
 ("I'd've", 'I would have'),
 ('Whatcha', 'What are you'),
 ("amn't", 'am not')]

In [26]:
contractions.fix(s)

'You all cannot expand contractions I would think! You would not be able to. How did you do it?'

In [27]:
train_data['excerpt_lower'] = train_data['excerpt_lower'].apply(contractions.fix)
test_data['excerpt_lower'] = test_data['excerpt_lower'].apply(contractions.fix)

In [28]:
# Porter Stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')

('jump', 'jump', 'jump')

In [29]:
ps.stem('lying')

'lie'

In [30]:

ps.stem('strange')

'strang'

In [31]:
train_data['excerpt_lower'] = train_data['excerpt_lower'].apply(ps.stem)
test_data['excerpt_lower'] = test_data['excerpt_lower'].apply(ps.stem)

In [41]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [42]:
help(wnl.lemmatize)

Help on method lemmatize in module nltk.stem.wordnet:

lemmatize(word: str, pos: str = 'n') -> str method of nltk.stem.wordnet.WordNetLemmatizer instance
    Lemmatize `word` using WordNet's built-in morphy function.
    Returns the input word unchanged if it cannot be found in WordNet.
    
    :param word: The input word to lemmatize.
    :type word: str
    :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns,
        `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
        for satellite adjectives.
    :param pos: str
    :return: The lemma of `word`, for the given `pos`.



In [44]:
# lemmatize nouns
print(wnl.lemmatize)
#print(wnl.lemmatize('cars', 'n'))
#print(wnl.lemmatize('boxes', 'n'))

<bound method WordNetLemmatizer.lemmatize of <WordNetLemmatizer>>


In [None]:
s = 'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'
tokens = nltk.word_tokenize(s)
print(tokens)

In [None]:
train_data['excerpt_lower'] = train_data['excerpt_lower'].apply(nltk.word_tokenize)
test_data['excerpt_lower'] = test_data['excerpt_lower'].apply(nltk.word_tokenize)

In [38]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [39]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

train_data["excerpt_lower"] = train_data["excerpt_lower"].apply(lambda text: remove_stopwords(text))
test_data["excerpt_lower"] = test_data["excerpt_lower"].apply(lambda text: remove_stopwords(text))