# Preprocessing for the questions

In [29]:
import time as time
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tag.util import untag
import contractions
# import pycontractions # Alternative better package for removing contractions
from autocorrect import Speller



In [30]:
dtypes_questions = {'Id':'int32', 'Score': 'int16', 'Title': 'str', 'Body': 'str'}

In [31]:
df_questions = pd.read_csv('data/Questions.csv', usecols=['Id', 'Score', 'Title', 'Body'], dtype=dtypes_questions, nrows=100)

In [32]:
df_questions[['Title', 'Body']] = df_questions[['Title', 'Body']]\
    .applymap(lambda x: str(x)\
              .encode("utf-8", errors='surrogatepass')\
              .decode("ISO-8859-1", errors='surrogatepass'))

In [33]:
# remove all the questions with a negative score
df_questions = df_questions[df_questions['Score'] > 0]

In [34]:
spell = Speller()
token = ToktokTokenizer()
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
charac = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~0123456789'
stop_words = set(stopwords.words("english"))
adjective_tag_list = set(['JJ','JJR', 'JJS', 'RBR', 'RBS']) # List of Adjective's tag from nltk package

In [35]:
df_questions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Id      100 non-null    int32 
 1   Score   100 non-null    int16 
 2   Title   100 non-null    object
 3   Body    100 non-null    object
dtypes: int16(1), int32(1), object(2)
memory usage: 2.9+ KB


### Now let's remove HTML tags from the questions

In [36]:
# Parse question and title then return only the text
df_questions['Body'] = df_questions['Body'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
df_questions['Title'] = df_questions['Title'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())



In [37]:
df_questions['Body'][13]

'In many places, (1,2,3) and [1,2,3] can be used interchangeably.\nWhen should I use one or the other, and why?\n'

### every other things to removes such as \n

In [38]:
def clean_text(text):
    text = re.sub(r"\'", "'", text) # apostrophe characters to whitespace
    text = re.sub(r"\n", " ", text) # newlines to whitespace
    text = re.sub(r"\xa0", " ", text) # non-breakable to whitespace
    text = re.sub('\s+', ' ', text) # more than one whitespace character to a single whitespace
    text = text.strip(' ')
    return text

In [39]:
df_questions['Title'] = df_questions['Title'].apply(lambda x: clean_text(x))
df_questions['Body'] = df_questions['Body'].apply(lambda x: clean_text(x))

In [40]:
df_questions['Body'][43]

'I\'m creating an ZIP file with ZipFile in Python 2.5, it works ok so far: import zipfile, os locfile = "test.txt" loczip = os.path.splitext (locfile)[0] + ".zip" zip = zipfile.ZipFile (loczip, "w") zip.write (locfile) zip.close() but I couldn\'t find how to encrypt the files in the ZIP file. I could use system and call PKZIP -s, but I suppose there must be a more "Pythonic" way. I\'m looking for an open source solution.'

In [41]:
# expand_contractions (i.e "wasn't", don't', isn't, 'i've')
def expand_contractions(text):
    text = contractions.fix(text)
    return text

In [42]:
df_questions['Title'] = df_questions['Title'].apply(lambda x: expand_contractions(x))
df_questions['Body'] = df_questions['Body'].apply(lambda x: expand_contractions(x))

In [43]:
df_questions['Body'][42]

'What is the best way to sanitise user input for a Python-based web application? Is there a single function to remove HTML characters and any other necessary characters combinations to prevent an XSS or SQL injection attack?'

## Spelling Correction

In [44]:
def autocorrect(text):
    words = token.tokenize(text)
    words_correct = [spell(w) for w in words]
    return ' '.join(map(str, words_correct)) # Return the text untokenize

In [45]:
df_questions['Title'] = df_questions['Title'].apply(lambda x: autocorrect(x))
df_questions['Body'] = df_questions['Body'].apply(lambda x: autocorrect(x))

In [46]:
df_questions['Title'][11]

'How do I make a menu that does not require the user to press [ enter ] to make a selection ?'

In [25]:
df_questions.to_csv('data/questions_preprocessed.csv', encoding='utf-8', errors='surrogatepass')

## Lowering

In [47]:
df_questions['Title'] = df_questions['Title'].str.lower()
df_questions['Body'] = df_questions['Body'].str.lower()

In [None]:
df_questions['Body'][11]

### remove all non alphabetical characters

In [26]:
def remove_punctuation_and_number(text):
    """remove all punctuation and number"""
    return text.translate(str.maketrans(" ", " ", charac))



def remove_non_alphabetical_character(text):
    """remove all non-alphabetical character"""
    text = re.sub("[^a-z]+", " ", text) # remove all non-alphabetical character
    text = re.sub("\s+", " ", text) # remove whitespaces left after the last operation
    return text

In [27]:
df_questions['Title'] = df_questions['Title'].apply(lambda x: remove_non_alphabetical_character(x))
df_questions['Body'] = df_questions['Body'].apply(lambda x: remove_non_alphabetical_character(x))

In [28]:
df_questions['Title'][11]


' ow do make a menu that does not require the user to press enter to make a selection '

## STOPWORDS

In [33]:
def remove_stopwords(text):
    words = token.tokenize(text)
    filtered = [w for w in words if not w in stop_words]

    return ' '.join(map(str, filtered)) # Return the text untokenize

In [34]:
df_questions['Title'] = df_questions['Title'].apply(lambda x: remove_stopwords(x))
df_questions['Body'] = df_questions['Body'].apply(lambda x: remove_stopwords(x))

## Stemming and Lemmatization

Can help wth calculations: smaller vocabulary
make verbs go back to the radical root

In [35]:
def stem_text(text):
    words = nltk.word_tokenize(text) # tokenize the text then return a list of tuple (token, nltk_tag)
    stem_text = []
    for word in words:
        stem_text.append(stemmer.stem(word)) # Stem every words
    return " ".join(stem_text) # Return the text untokenize

In [36]:
df_questions['Title'] = df_questions['Title'].apply(lambda x: stem_text(x))
df_questions['Body'] = df_questions['Body'].apply(lambda x: stem_text(x))

In [37]:
def lemmatize_text(text):
    tokens_tagged = nltk.pos_tag(nltk.word_tokenize(text))
    lemmatized_text = []
    for word, tag in tokens_tagged:
        if tag.startswith('J'):
            lemmatized_text.append(lemmatizer.lemmatize(word,'a')) # adjectives
        elif tag.startswith('V'):
            lemmatized_text.append(lemmatizer.lemmatize(word,'v')) # verbs
        elif tag.startswith('N'):
            lemmatized_text.append(lemmatizer.lemmatize(word,'n')) # nouns
        elif tag.startswith('R'):
            lemmatized_text.append(lemmatizer.lemmatize(word,'r')) # adverbs
        else:
            lemmatized_text.append(lemmatizer.lemmatize(word)) # If no tags found.
    return " ".join(lemmatized_text) # Return the text untokenize

In [42]:
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/samuel.adone@cdbdx.biz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/samuel.adone@cdbdx.biz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [43]:
df_questions['Title'] = df_questions['Title'].apply(lambda x:lemmatize_text(x))
df_questions['Body'] = df_questions['Body'].apply(lambda x: lemmatize_text(x))

In [44]:
df_questions['Text'] = df_questions['Title'] + ' ' + df_questions['Body']

## Export to CSV