In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install dataprep
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker
from dataprep.clean import clean_text
import pandas as pd
from nltk import SnowballStemmer
import time

<a id=0></a>
## <p style="background-color:lightblue; font-family:newtimeroman; font-size:120%; text-align:left; border-radius: 15px 50px;">Table of Content</p>
* [1. Loading Data 💎](#1)
* [2. Data Preprocessing](#2)
    * [2.1 Remove 92 duplicated rows](#2.1)
    * [2.2 Cleaning text](#2.2)
    * [2.3 Spelling Checker](#2.3)
    * [2.4 Stemming](#2.4)
* [3. Saving Data](#3)

<a id='1'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">1. Loading Data 💎</p>

Load the training and test datasets and print their shape and memory usage.

[Content](#0)

In [None]:
train_full = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_full = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# Report the shape and in-memory size (in MiB) of each split, training first.
for label, frame in (("Training", train_full), ("Test", test_full)):
    print(f"{label} Set Shape = {frame.shape}")
    print(f"{label} Set Memory Usage = {frame.memory_usage().sum() / 2**20:.2f}MB")

<a id='2'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">2. Data Pre-processing </p>

Now we are going to engineer the data to make it easier for the model to classify.

This section is very important to reduce the dimensions of the problem.




[Content](#0)

<a id=2.1 ></a>
## <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 15px 50px;">2.1 Remove 92 duplicated Rows</p>


[Content](#0)

In [None]:
# Flag every row whose (text, target) pair occurs more than once; keep=False
# marks all members of a duplicated group rather than only the repeats.
is_duplicated = train_full.duplicated(subset=['text', 'target'], keep=False)
duplicate_data = train_full[is_duplicated]

In [None]:
# Drop repeated (text, target) pairs and renumber the index from 0.
# The result is rebound to the same name so later cells see the
# de-duplicated frame.
train_full = train_full.drop_duplicates(subset=['text', 'target'], ignore_index=True)
train_full.shape

<a id=2.2 ></a>
## <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 15px 50px;">2.2 Cleaning text</p>


[Content](#0)

In this kernel, I use the **dataprep** library to clean the text efficiently. The available operators are:
    1. fillna: Replace all null values with an empty string (`value=""`).

    2. lowercase: Convert all characters to lowercase.

    3. remove_digits: Remove numbers.

    4. remove_html: Remove HTML tags.

    5. remove_urls: Remove URLs.

    6. remove_punctuation: Remove punctuation marks. (do not use)

    7. remove_accents: Remove accent marks. 

    8. remove_stopwords: Remove stopwords. (do not use)

    9. remove_whitespace: Remove extra spaces, and tabs and newlines.

In [None]:
# Operators that take no parameters, in pipeline order.
_PARAMETERLESS_OPERATORS = (
    "lowercase",
    "remove_digits",
    "remove_html",
    "remove_urls",
    "remove_accents",
    "remove_whitespace",
)

# The pipeline starts by filling nulls with an empty string, followed by the
# parameterless operators above.
custom_pipeline = [{"operator": "fillna", "parameters": {"value": ""}}]
custom_pipeline.extend({"operator": name} for name in _PARAMETERLESS_OPERATORS)

In [None]:
# Run the same cleaning pipeline over the "text" column of both splits
# (training first, then test).
df_train, df_test = (
    clean_text(frame, "text", pipeline=custom_pipeline)
    for frame in (train_full, test_full)
)

<a id=2.3 ></a>
   ## <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">2.3 Spelling Checker</p>
   
Applying the spelling checker takes a very long time.

So I will use the committed dataset (df_train/df_test, already spell-checked and saved during a previous run) to save time.

[Content](#0)

In [None]:
spell = SpellChecker()

# Memo of word -> correction so each distinct misspelling is resolved only once
# across all rows.
_correction_cache = {}


def correct_spelling(text):
    """Return `text` with every word unknown to the spell checker replaced.

    The text is split on whitespace; words that `spell.unknown` reports are
    replaced by `spell.correction(word)`, all other words are kept as they are.
    The words are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Input sentence.

    Returns
    -------
    str
        The sentence with corrected words.
    """
    words = text.split()  # split once and reuse for both lookup and rebuild
    misspelled_words = spell.unknown(words)

    corrected_text = []
    for w in words:
        if w in misspelled_words:
            if w not in _correction_cache:
                # correction() may return None when no candidate exists;
                # fall back to the original word so " ".join() never sees None.
                _correction_cache[w] = spell.correction(w) or w
            w = _correction_cache[w]
        corrected_text.append(w)
    return " ".join(corrected_text)

### Running SpellChecker on the training data takes around 1 hour

In [None]:
# Spell-correct the training text and record the elapsed wall-clock seconds.
t1 = time.time()
df_train["text"] = df_train["text"].apply(correct_spelling)
deltaT1 = time.time() - t1

In [None]:
# Spell-correct the test text and record the elapsed wall-clock seconds.
t2 = time.time()
df_test["text"] = df_test["text"].apply(correct_spelling)
deltaT2 = time.time() - t2

<a id=2.4 ></a>
## <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 15px 50px;">2.4 Stemming</p>


[Content](#0)

In [None]:
stemmer = SnowballStemmer("english")


def stemming_text(text):
    """Stem each space-separated token of `text` and re-join with single spaces.

    The text is split on the literal space character, so consecutive spaces
    produce empty tokens that are passed through the stemmer as well.
    """
    stems = [stemmer.stem(token) for token in text.split(' ')]
    return ' '.join(stems)

# Replace the text column of both splits with its stemmed version.
for frame in (df_train, df_test):
    frame["text"] = frame["text"].apply(stemming_text)

<a id='3'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">3. Saving Data 💎</p>

In [None]:
# Write the prepared text (plus the target for the training split) to CSV,
# omitting the row index.
train_columns = ['text', 'target']
df_train[train_columns].to_csv('train_prepared.csv', index=False)
df_test['text'].to_csv('test_prepared.csv', index=False)

In [None]:
# Reload both exported files and preview the first rows as a sanity check.
# display() renders each frame as a rich table; a bare tuple of frames would
# fall back to the plain-text repr of both frames.
display(
    pd.read_csv('train_prepared.csv').head(),
    pd.read_csv('test_prepared.csv').head(),
)