In [1]:
import pandas as pd
import numpy as np
import nltk
import re

In [2]:
train_set = pd.read_csv('dataset/train.csv')
test_set = pd.read_csv('dataset/test.csv')

In [3]:
train_set.comment_text.map(lambda x: len(x)).describe()

count    159571.000000
mean        394.073221
std         590.720282
min           6.000000
25%          96.000000
50%         205.000000
75%         435.000000
max        5000.000000
Name: comment_text, dtype: float64

most of sentences are less than 400 words, so we only use first 400 words to reduce computation load

In [4]:
text = train_set.comment_text[3]
text

'"\nMore\nI can\'t make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It\'s listed in the relevant form eg Wikipedia:Good_article_nominations#Transport  "'

In [5]:
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stop_words = stopwords.words('english')
stop_words = set(stop_words)
porter = PorterStemmer()

## when using glove, we don't use stop words and stem

In [6]:
def clean_text(text, text_len=400, stop=True, stem=True):
    
    # lower text
    text = text.lower()
    
    if len(text) > 400:
        text =text[:400]
    
    # Restore word
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    
    res = ""
    tokens = word_tokenize(text)
    for word in tokens:
        if word.isalpha():
#             if stop and word in stop_words:
#                 continue
#             if stem:
#                 word = porter.stem(word)
            res = res + word + " "
    if len(res) > 0:
        return res[:-1]
    else:
        return "unknown"

In [7]:
clean_text(text)

'more i can not make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if else does first if you have any preferences for formatting style on references or want to do it you'

In [8]:
train_set.shape

(159571, 8)

In [12]:
i = 0
while i < len(train_set):
    train_set['comment_text'][i: i+5000] = train_set['comment_text'][i:i+5000].map(lambda x: clean_text(x))
    i = i + 5000
    print(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000


In [13]:
train_set['comment_text'][:5]

0    explanation why the edits made under my userna...
1    he matches this background colour i am seeming...
2    hey man i am really not trying to edit war it ...
3    more i can not make any real suggestions on im...
4    you sir are my hero any chance you remember wh...
Name: comment_text, dtype: object

In [14]:
train_set.to_csv('dataset/rnn_cleaned_train.csv', index=False)

In [15]:
i = 0
while i < len(test_set):
    test_set['comment_text'][i: i+5000] =test_set['comment_text'][i:i+5000].map(lambda x: clean_text(x))
    i = i + 5000

In [16]:
test_set['comment_text']

0         yo bitch ja rule is more succesful then you wi...
1                   from rfc the title is fine as it is imo
2                            sources zawe ashton on lapland
3         if you have a look back at the source the info...
4                 i do not anonymously edit articles at all
                                ...                        
153159            i totally agree this stuff is nothing but
153160    throw from out field to home plate does it get...
153161    okinotorishima categories i see your changes a...
153162    one of the founding nations of the eu germany ...
153163    stop already your bullshit is not welcome here...
Name: comment_text, Length: 153164, dtype: object

In [17]:
test_set.to_csv('dataset/rnn_cleaned_test.csv', index=False)

In [18]:
test_set.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule is more succesful then you wi...
1,0000247867823ef7,from rfc the title is fine as it is imo
2,00013b17ad220c46,sources zawe ashton on lapland
3,00017563c3f7919a,if you have a look back at the source the info...
4,00017695ad8997eb,i do not anonymously edit articles at all
