## Importing Libraries

In [1]:
import os
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to C:\Users\SAI
[nltk_data]     DEEPTHI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Reading the Dataset

In [2]:
total_df = pd.read_pickle("QuestionsDataSet.pkl")

In [3]:
total_df.head(5)

Unnamed: 0,Id,Text,Topic
0,0,"<p>Besides being ""one of the 7 meta questions ...",/AImeta
1,1,"<p>I've clicked on <em>chat</em> link, but the...",/AImeta
2,2,<p>I think this will be a crucial thing to fig...,/AImeta
3,3,<p>Are all questions asked on stats and data s...,/AImeta
4,4,<p>I've seen several questions that use the <a...,/AImeta


In [None]:
for i in range(10):
    print(total_df.iloc[i]['Text'])
    print("="*40)

In [4]:
print(total_df['Topic'].unique())

['/AImeta' '/AI' '/ComputerGraphicsMeta' '/ComputerGraphics' '/CSMeta'
 '/CS' '/DataScienceMeta' '/DataScience']


In [5]:
posts_text = total_df['Text'].values
print("Posts_text shape", posts_text.shape)

Posts_text shape (161423,)


In [6]:
posts_text[18]

'<p><strong>Yes</strong></p>\r\n\r\n<p>I am sorry to be the one who posts Yes, but as we are in the beta, I want to be straight forward.</p>\r\n\r\n<p>In addition to that, AI is also on-topic in the CS site. <a href="https://area51.meta.stackexchange.com/q/22939/142759">I was the one who raised this in the definition phase</a>.</p>\r\n\r\n<p>So, a lot of topic which this site aims to cover are already covered in the existing sites.</p>\r\n'

In [7]:
import re
#https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
def cleanhtml(raw_html):
    """Remove HTML TAG and convert text to lower case"""
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext.lower()

In [9]:
preprocessed_post_text = []
for i in range(posts_text.shape[0]):
    preprocessed_post_text.append(cleanhtml(posts_text[i]))
preprocessed_post_text = np.array(preprocessed_post_text)

In [10]:
preprocessed_post_text[18]

'yes\r\n\r\ni am sorry to be the one who posts yes, but as we are in the beta, i want to be straight forward.\r\n\r\nin addition to that, ai is also on-topic in the cs site. i was the one who raised this in the definition phase.\r\n\r\nso, a lot of topic which this site aims to cover are already covered in the existing sites.\r\n'

In [None]:

url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+';
for i in range(preprocessed_post_text.shape[0]):
    preprocessed_post_text[i] = re.sub(url_regex, '', preprocessed_post_text[i]);

In [11]:
preprocessed_post_text[18]

'yes\r\n\r\ni am sorry to be the one who posts yes, but as we are in the beta, i want to be straight forward.\r\n\r\nin addition to that, ai is also on-topic in the cs site. i was the one who raised this in the definition phase.\r\n\r\nso, a lot of topic which this site aims to cover are already covered in the existing sites.\r\n'

In [12]:
def cleanpunc(sentence): 
    """function to clean the word of any punctuation or special characters"""
    cleaned = re.sub(r'[?|!|"|#|:|=|+|_|{|}|[|]|-|$|%|^|&|]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/|-|~|`|>|<|*|$|@|;|â†’]',r'',cleaned)
    return  cleaned

In [None]:

for i in range(preprocessed_post_text.shape[0]):
    preprocessed_post_text[i] = cleanpunc(preprocessed_post_text[i])

In [14]:
preprocessed_post_text[18]

'yes\r\n\r\ni am sorry to be the one who posts yes but as we are in the beta i want to be straight forward\r\n\r\nin addition to that ai is also ontopic in the cs site i was the one who raised this in the definition phase\r\n\r\nso a lot of topic which this site aims to cover are already covered in the existing sites\r\n'

In [15]:
import re

def decontracted(phrase):
    # https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python same code snippet from assignment as well
    """ expanding and creating common English contractions in text"""
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"\n", "", phrase)
    return phrase

In [16]:
for i in range(preprocessed_post_text.shape[0]):
    preprocessed_post_text[i] = decontracted(preprocessed_post_text[i])

In [17]:

non_stop_word_removed_posts = preprocessed_post_text

In [18]:
non_stop_word_removed_posts[18]

'yes\r\ri am sorry to be the one who posts yes but as we are in the beta i want to be straight forward\r\rin addition to that ai is also ontopic in the cs site i was the one who raised this in the definition phase\r\rso a lot of topic which this site aims to cover are already covered in the existing sites\r'

In [19]:

non_stop_word_removed_posts_df = pd.DataFrame(non_stop_word_removed_posts, columns=['non_stopword_removed_preprocessed_text'])
non_stop_word_removed_posts_df.index = total_df.index
non_stop_word_removed_posts_df['Id'] = total_df['Id']
total_df = total_df.merge(non_stop_word_removed_posts_df, on='Id',how='left')
total_df.to_pickle('non_stop_word_removed_posts.pkl')

In [20]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print ('list of stop words:', stop_words)

def nlp_preprocessing(total_text):
    """Removes stop words and alpha numeric values"""
    if type(total_text) is not int:# Numbers doesn't make any sense in searching them
        string = ""
        for words in total_text.split():
            # remove the special chars in review like '"#$@!%^&*()_+-~?>< etc.
            word = ("".join(e for e in words if e.isalnum()))
            # stop-word removal
            if not word in stop_words:
                string += word + " "
        return string

list of stop words: {'shouldn', 'of', 'their', 'is', 'between', 'by', 'against', 'than', 'has', 'all', 'just', "doesn't", "won't", 'those', 'needn', 'why', 'been', 'isn', 'i', 'at', 'such', 'above', 'down', 'on', 'hers', 'o', 'hasn', 'which', 'mightn', 'other', 'few', 'won', 'to', 't', "isn't", 'once', "wouldn't", 'who', "she's", 'more', 'couldn', 'her', 'd', 'your', 'nor', 'doing', 'too', 'they', 'did', 'doesn', 'weren', 'himself', 'as', "you'd", 'there', 're', 'ourselves', 'herself', 'don', 'about', 'these', 'y', 'through', 'only', 'how', 'can', 'mustn', 'wouldn', "aren't", 'some', 'the', 'aren', 'from', 'ours', 'again', 'further', 'does', 'own', "that'll", 'myself', 'themselves', 'when', "should've", 'or', 'were', 'was', 'had', 'wasn', 'shan', "couldn't", "wasn't", 'didn', 'being', 'that', 'yours', 'you', 'he', 'before', 'his', "haven't", 'him', 'each', 'most', "shouldn't", "weren't", 'and', 'an', 'no', 'm', 'ain', "you're", "mightn't", 'its', "didn't", 'me', 'ma', 'them', 'then', '

In [23]:

# we take each title and we text-preprocess it.
for i in range(preprocessed_post_text.shape[0]):
    preprocessed_post_text[i] = nlp_preprocessing(preprocessed_post_text[i])


In [24]:
preprocessed_post_text[18]

'yes sorry one posts yes beta want straight forward addition ai also ontopic cs site one raised definition phase lot topic site aims cover already covered existing sites '

In [25]:
preprocesses_text_df = pd.DataFrame(preprocessed_post_text, columns=['preprocessed_text'])
preprocesses_text_df.index = total_df.index
preprocesses_text_df['Id'] = total_df['Id']
total_df = total_df.merge(preprocesses_text_df, on='Id',how='left')

In [26]:
print("Shape", total_df.shape)
total_df.head(5)

Shape (161423, 5)


Unnamed: 0,Id,Text,Topic,non_stopword_removed_preprocessed_text,preprocessed_text
0,0,"<p>Besides being ""one of the 7 meta questions ...",/AImeta,besides being one of the 7 meta questions ever...,besides one 7 meta questions every site ask pl...
1,1,"<p>I've clicked on <em>chat</em> link, but the...",/AImeta,i have clicked on chat link but the list is em...,clicked chat link list empty also tried create...
2,2,<p>I think this will be a crucial thing to fig...,/AImeta,i think this will be a crucial thing to figure...,think crucial thing figure one hand think impo...
3,3,<p>Are all questions asked on stats and data s...,/AImeta,are all questions asked on stats and data scie...,questions asked stats data science se also top...
4,4,<p>I've seen several questions that use the <a...,/AImeta,i have seen several questions that use the art...,seen several questions use artificialintellige...


In [27]:
total_df = total_df[total_df['preprocessed_text'] != '']
total_df = total_df[total_df['preprocessed_text'] != ' ']
total_df = total_df.reset_index(drop=True)
total_df.to_pickle('Preprocessed_questions_text.pkl')