# Preprocessing for the questions

In [19]:
import time as time
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tag.util import untag
import contractions
# import pycontractions # Alternative better package for removing contractions
from autocorrect import Speller



In [20]:
# Explicit column dtypes passed to the pd.read_csv calls in this notebook.
dtypes_questions = dict(Id='int32', Score='int16', Title='str', Body='str')

In [21]:
# Read only the four columns used downstream; bytes that cannot be decoded are
# replaced instead of raising (encoding_errors='replace').
df_questions = pd.read_csv(
    '../Maj2NLP/data/Questions.csv',
    usecols=['Id', 'Score', 'Title', 'Body'],
    dtype=dtypes_questions,
    encoding_errors='replace',
)
df_questions.shape

(607282, 4)

In [22]:
def _utf8_bytes_as_latin1(value):
    """Return ``str(value)`` encoded as UTF-8 and decoded back as ISO-8859-1."""
    return str(value) \
        .encode("utf-8", errors='surrogatepass') \
        .decode("ISO-8859-1", errors='surrogatepass')


# Re-encode every Title and Body cell. Series.map is applied per column because
# DataFrame.applymap is deprecated since pandas 2.1, whereas Series.map exists
# in both old and new pandas versions.
# NOTE(review): encoding to UTF-8 and decoding as ISO-8859-1 turns each non-ASCII
# character into several Latin-1 characters; confirm this direction is intended.
df_questions[['Title', 'Body']] = df_questions[['Title', 'Body']] \
    .apply(lambda column: column.map(_utf8_bytes_as_latin1))

In [23]:
# Keep only questions with a strictly positive score
# (questions scored zero are dropped as well as negative ones).
has_positive_score = df_questions['Score'] > 0
df_questions = df_questions[has_positive_score]

In [24]:
# Constants referenced by the cleaning helpers.
charac = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~0123456789'  # punctuation and digits
stop_words = set(stopwords.words("english"))
# POS tags treated as adjectives.
# NOTE(review): RBR/RBS are adverb tags in the Penn Treebank set - confirm they belong here.
adjective_tag_list = {'JJ', 'JJR', 'JJS', 'RBR', 'RBS'}

# Shared NLP tool instances, each built with its default settings.
spell = Speller()
token = ToktokTokenizer()
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [25]:
# Print a summary of the filtered frame: row count, non-null counts, dtypes and memory use.
df_questions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 327688 entries, 0 to 607280
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Id      327688 non-null  int32 
 1   Score   327688 non-null  int16 
 2   Title   327688 non-null  object
 3   Body    327688 non-null  object
dtypes: int16(1), int32(1), object(2)
memory usage: 9.4+ MB


### Now let's remove HTML tags from the questions

In [26]:
# Strip HTML markup: parse each Body and Title with BeautifulSoup's built-in
# 'html.parser' and keep only the extracted text.
for html_column in ('Body', 'Title'):
    df_questions[html_column] = df_questions[html_column].apply(
        lambda markup: BeautifulSoup(markup, 'html.parser').get_text()
    )



In [27]:
# Spot-check the body stored under index label 768 after HTML stripping.
df_questions.loc[768, 'Body']

"Ruby can add methods to the Number class and other core types to get effects like:\n\n1.should_equal(1)\n\nBut it seems like python cannot do this. Is this true? And if so, why? Does it have something to do with the fact that type can't be modified?\nUpdate: Rather than talking about different definitions of monkey patching, I would like to just focus on the example above. I have already concluded that it cannot be done as a few of you have answered. But I would like a more detailed explanation of why it cannot be done, and maybe what feature, if available in python, would allow this.\nTo answer some of you: The reason I might want to do this is simply aesthetics/readability. \n\nitem.price.should_equal(19.99)\n\nreads more like English and clearly indicates which is the tested value and which is the expected value, as supposed to:\n\nshould_equal(item.price, 19.99)\n\nThis concept is what Rspec and some other Ruby frameworks are based on.\n"

### Remove remaining noise such as newlines (`\n`) and non-breaking spaces

In [28]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with newlines, non-breaking spaces and any other whitespace
        runs replaced by a single space, and leading/trailing spaces removed.
        Apostrophes and all other characters are left untouched.
    """
    # For str patterns `\s` matches Unicode whitespace, which already covers
    # "\n" and "\xa0", so a single raw-string pattern replaces the separate
    # passes. The former apostrophe substitution replaced "'" with "'" and was
    # therefore a no-op; it has been dropped.
    return re.sub(r"\s+", " ", text).strip(" ")

In [29]:
# Normalise whitespace in both text columns (Title first, then Body).
for text_column in ('Title', 'Body'):
    df_questions[text_column] = df_questions[text_column].apply(clean_text)

In [30]:
# Spot-check the body stored under index label 43 after whitespace clean-up.
df_questions.loc[43, 'Body']

'I\'m creating an ZIP file with ZipFile in Python 2.5, it works ok so far: import zipfile, os locfile = "test.txt" loczip = os.path.splitext (locfile)[0] + ".zip" zip = zipfile.ZipFile (loczip, "w") zip.write (locfile) zip.close() but I couldn\'t find how to encrypt the files in the ZIP file. I could use system and call PKZIP -s, but I suppose there must be a more "Pythonic" way. I\'m looking for an open source solution.'

In [31]:
def expand_contractions(text):
    """Expand English contractions in ``text`` (e.g. "wasn't", "don't", "isn't", "I've").

    Delegates to ``contractions.fix`` and returns its result.
    """
    return contractions.fix(text)

In [32]:
# Expand contractions in both text columns (Title first, then Body).
for text_column in ('Title', 'Body'):
    df_questions[text_column] = df_questions[text_column].apply(expand_contractions)

In [33]:
# Spot-check the body stored under index label 42 after contraction expansion.
df_questions.loc[42, 'Body']

'What is the best way to sanitise user input for a Python-based web application? Is there a single function to remove HTML characters and any other necessary characters combinations to prevent an XSS or SQL injection attack?'

## Spelling Correction

In [34]:
def autocorrect(text):
    """Spell-correct ``text`` token by token.

    The text is split with the module-level ``token`` tokenizer, each token is
    passed through the module-level ``spell`` corrector, and the corrected
    tokens are joined back with single spaces.
    """
    raw_tokens = token.tokenize(text)
    corrected_tokens = []
    for raw_token in raw_tokens:
        corrected_tokens.append(spell(raw_token))
    # Tokens are re-joined with one space each, so the original spacing
    # around punctuation is not restored.
    return ' '.join(str(corrected) for corrected in corrected_tokens)

In [35]:
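# Spell-correction pass, currently disabled.
# NOTE(review): presumably switched off because it is slow on the full corpus - confirm before re-enabling.
#df_questions['Title'] = df_questions['Title'].apply(lambda x: autocorrect(x))
#df_questions['Body'] = df_questions['Body'].apply(lambda x: autocorrect(x))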

In [36]:
# Spot-check the body stored under index label 0 before the checkpoint is written.
df_questions.loc[0, 'Body']

"I am using the Photoshop's javascript API to find the fonts in a given PSD. Given a font name returned by the API, I want to find the actual physical font file that that font name corresponds to on the disc. This is all happening in a python program running on OSX so I guess I am looking for one of: Some Photoshop javascript A Python function An OSX API that I can call from python"

In [37]:
# Checkpoint written before lowercasing and character filtering: HTML is removed,
# whitespace is normalised and contractions are expanded, but case and
# punctuation are still present. The index is written as the first CSV column.
df_questions.to_csv(
    '../Maj2NLP/data/questions_preprocessed.csv',
    encoding='utf-8',
    errors='surrogatepass',
)

## Lowering

In [38]:
# Lowercase both text columns; the later a-z filter keeps only lowercase letters.
for text_column in ('Title', 'Body'):
    df_questions[text_column] = df_questions[text_column].str.lower()

In [39]:
# Spot-check the body stored under index label 0 after lowercasing.
df_questions.loc[0, 'Body']

"i am using the photoshop's javascript api to find the fonts in a given psd. given a font name returned by the api, i want to find the actual physical font file that that font name corresponds to on the disc. this is all happening in a python program running on osx so i guess i am looking for one of: some photoshop javascript a python function an osx api that i can call from python"

### Remove all non-alphabetical characters

In [40]:
def remove_punctuation_and_number(text):
    """Delete every character listed in the module-level ``charac`` from ``text``.

    Matching characters are removed outright (not replaced by a space); all
    other characters are returned unchanged.
    """
    # The first two arguments map a space onto itself (an identity mapping);
    # the third argument lists the characters to delete.
    deletion_table = str.maketrans(" ", " ", charac)
    return text.translate(deletion_table)



def remove_non_alphabetical_character(text):
    """Replace every run of characters outside ``a``-``z`` with a single space.

    Only lowercase ASCII letters are kept, so ``text`` should already be
    lowercased: uppercase letters, digits, punctuation and whitespace are all
    treated as separators.

    Args:
        text: The string to filter.

    Returns:
        The filtered string. A separator run at the start or end of ``text``
        leaves one space at that end; no stripping is performed.
    """
    # A run of non-letters already swallows any adjacent whitespace, so the
    # result never contains two consecutive spaces. The former second pass
    # (written as the invalid-escape literal "\s+") was a no-op and is dropped.
    return re.sub(r"[^a-z]+", " ", text)

In [41]:
# Keep only a-z letters (separated by single spaces) in both text columns.
for text_column in ('Title', 'Body'):
    df_questions[text_column] = df_questions[text_column].apply(remove_non_alphabetical_character)

In [42]:
# Spot-check the body stored under index label 0 after non-letter removal.
df_questions.loc[0, 'Body']

'i am using the photoshop s javascript api to find the fonts in a given psd given a font name returned by the api i want to find the actual physical font file that that font name corresponds to on the disc this is all happening in a python program running on osx so i guess i am looking for one of some photoshop javascript a python function an osx api that i can call from python'

In [43]:
# Final questions output: lowercased text containing only a-z letters and spaces.
# The index is written as the first CSV column.
df_questions.to_csv(
    '../Maj2NLP/data/questions_preprocessed_long.csv',
    encoding='utf-8',
    errors='surrogatepass',
)


# Answers

In [45]:
# Load the answers, keeping only the columns used downstream. The questions'
# dtype map is reused; it has no entry for ParentId, so pandas infers that dtype.
df_answers = pd.read_csv(
    '../Maj2NLP/data/Answers.csv',
    usecols=['Id', 'Score', 'Body', 'ParentId'],
    dtype=dtypes_questions,
    encoding_errors='replace',
)

# Same re-encoding as for the questions: UTF-8 bytes read back as ISO-8859-1.
# Series.map replaces DataFrame.applymap, which is deprecated since pandas 2.1.
df_answers['Body'] = df_answers['Body'].map(
    lambda x: str(x)
    .encode("utf-8", errors='surrogatepass')
    .decode("ISO-8859-1", errors='surrogatepass')
)

# Keep only answers with a strictly positive score
# (answers scored zero are dropped as well as negative ones).
df_answers = df_answers[df_answers['Score'] > 0]

# spell, token, lemmatizer, stemmer, charac, stop_words and adjective_tag_list
# are already created in the questions section and are not rebuilt here.
df_answers.info()

# HTML -> plain text.
df_answers['Body'] = df_answers['Body'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

# Whitespace normalisation, then contraction expansion.
df_answers['Body'] = df_answers['Body'].apply(clean_text)
df_answers['Body'] = df_answers['Body'].apply(expand_contractions)

# Spell-correction pass, currently disabled (as for the questions).
#df_answers['Body'] = df_answers['Body'].apply(lambda x: autocorrect(x))

# Lowercase before the a-z filter, which keeps only lowercase letters.
df_answers['Body'] = df_answers['Body'].str.lower()
df_answers['Body'] = df_answers['Body'].apply(remove_non_alphabetical_character)

# Persist the cleaned answers; the index is written as the first CSV column.
df_answers.to_csv('../Maj2NLP/data/answers_preprocessed.csv', encoding='utf-8', errors='surrogatepass')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 653168 entries, 0 to 987118
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Id        653168 non-null  int32 
 1   ParentId  653168 non-null  int64 
 2   Score     653168 non-null  int16 
 3   Body      653168 non-null  object
dtypes: int16(1), int32(1), int64(1), object(1)
memory usage: 18.7+ MB
