# EPAM

In [1]:
import re
import warnings
import pandas as pd

from tqdm import tqdm
from lazy import lazy
from bs4 import BeautifulSoup
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Location of the posts CSV, given relative to the notebook's working directory.
posts_file = 'NLP_dataset_final/Posts.csv'

In [3]:
# Read the posts table into a DataFrame; the file is decoded as ISO-8859-1.
# `%time` reports how long the read takes.
%time posts_df = pd.read_csv(posts_file, encoding='ISO-8859-1')

CPU times: user 12.3 s, sys: 1.08 s, total: 13.4 s
Wall time: 13.7 s




In [4]:
class Extractor:
    """Turn the HTML body of a post into plain text.

    The markup is parsed with BeautifulSoup using the ``lxml`` parser.
    Reading :attr:`text` collects the content of every ``<code>`` element
    into ``self._codes`` and, when ``delete_code`` is true, replaces each of
    those elements in the parsed tree with ``code_replacer``.

    Parameters
    ----------
    raw_text : str
        HTML source of the post.
    sep : str
        String used to join the text of the selected elements.
    delete_code : bool
        Whether ``<code>`` elements are replaced by ``code_replacer``.
    code_replacer : str
        Placeholder written in place of each ``<code>`` element.
    tags : iterable of str
        Names of the elements whose text makes up the result.  The default
        set is shared between instances; it is only read, never modified.
    """

    def __init__(self, raw_text, sep='\n', delete_code=True, 
                 code_replacer='CODE', tags={'p', 'pre'}):
        self.raw_text = raw_text
        self.sep = sep
        self.delete_code = delete_code
        self.code_replacer = code_replacer
        self.tags = tags

        self._soup = BeautifulSoup(self.raw_text, 'lxml')

    @lazy
    def text(self):
        """Cleaned text of the post.

        Wrapped in ``lazy`` (presumably evaluated on first access and then
        cached on the instance -- confirm against the ``lazy`` package).
        Side effects: fills ``self._codes`` with the unescaped text of each
        ``<code>`` element in document order and, when ``delete_code`` is
        true, rewrites ``self._soup`` so those elements become placeholders.
        """
        # `CODE` parts.
        self._codes = []
        for c_tag in self._soup.find_all('code'):
            self._codes.append(self._replace_unescape(c_tag.text))
            if self.delete_code:
                # `replace_with` is the supported name; `replaceWith` is a
                # deprecated alias kept by bs4 for backwards compatibility.
                c_tag.replace_with(self.code_replacer)

        # Main part.
        return self.sep.join(
            self._delete_tag(self._replace_unescape(p_tag.text))
            for p_tag in self._soup.find_all(self.tags)
        )

    def find_pos(self, pos, len_):
        """Map a span of :attr:`text` back to an offset in the raw post.

        Parameters
        ----------
        pos : int
            Start offset of the span inside :attr:`text`.
        len_ : int
            Length of the span (not its end offset).

        Returns
        -------
        int
            Offset inside the *unescaped* raw text, i.e.
            ``_replace_unescape(raw_text)``, or ``-1`` when nothing matches.
            If the span contains a code placeholder, the offset is that of
            the code snippet the placeholder at or after ``pos`` stands for;
            otherwise it is the offset of the span itself.

        Notes
        -----
        Placeholders are recognised only as whitespace-separated tokens equal
        to ``code_replacer``, so a placeholder glued to punctuation is not
        counted, and the literal word in prose is.
        """
        clean = self.text  # reading `text` also guarantees `_codes` exists
        answer = clean[pos: pos + len_]
        raw_nice_text = self._replace_unescape(self.raw_text)

        # Placeholders exist only when code was actually replaced; with
        # `delete_code=False` a matching token is ordinary text.
        cnum = answer.split().count(self.code_replacer) if self.delete_code else 0
        if cnum == 0:
            return raw_nice_text.find(answer)

        before_cnum = clean[:pos].split().count(self.code_replacer)
        if before_cnum >= len(self._codes):
            # More placeholder-looking tokens than collected snippets (the
            # prose contained the literal word): report "not found" the same
            # way `str.find` does instead of raising IndexError.
            return -1
        return raw_nice_text.find(self._codes[before_cnum])

    def _delete_tag(self, text):
        """Strip literal ``<tag>`` / ``</tag>`` strings for every name in ``self.tags``."""
        for tag in self.tags:
            text = text.replace('<{}>'.format(tag), '')
            text = text.replace('</{}>'.format(tag), '')
        return text

    @staticmethod
    def _replace_unescape(text, unescape_dict={'&lt;': '<', '&gt;': '>', '&amp;': '&', '\r': ''}):
        """Apply the ``unescape_dict`` replacements to ``text``, twice.

        With the default mapping this decodes the listed HTML entities and
        drops carriage returns.  The default dict is shared but never
        modified.
        """
        # Two passes, as in the original: the second one resolves entities
        # that only appear after the first (e.g. '&amp;lt;' -> '&lt;' -> '<').
        for _ in range(2):
            for k, v in unescape_dict.items():
                text = text.replace(k, v)
        return text


class Preprocessor(TransformerMixin, BaseEstimator):
    """Stateless scikit-learn transformer that converts HTML posts to text.

    Each input item is passed through ``Extractor`` and replaced by its
    ``text``.  There is nothing to learn, so ``fit`` is a no-op; it exists
    (together with ``TransformerMixin``) so the object can be fitted as a
    step of a scikit-learn pipeline.
    """

    def __init__(self):
        # Switch warning off for `bs4`.
        # Note: this changes the process-wide warning filters.
        warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return a list with the extracted text of every item in ``X``."""
        return [Extractor(x).text for x in X]

In [5]:
# Pick one post body to try the extractor on, and display it.
sample = posts_df['Body'][310581]
sample

'<p>You\'re using the shorter form of lambda expressions, which only allow a single expressions.<br>\r\r\r\nYou need to the long form, which allows multiple statements.</p>\r\r\r\n\r\r\r\n<p>For example:</p>\r\r\r\n\r\r\r\n<pre><code>items.ForEach(item =&gt; {\r\r\r\n    if (item.Contains("I Care About")) \r\r\r\n        whatICareAbout += item + ", ";\r\r\r\n});\r\r\r\n</code></pre>\r\r\r\n'

In [6]:
# Cleaned text of the sample post, with code replaced by the default 'CODE' placeholder.
Extractor(sample).text

"You're using the shorter form of lambda expressions, which only allow a single expressions.\nYou need to the long form, which allows multiple statements.\nFor example:\nCODE"

In [7]:
# The second argument of `find_pos` is the span *length*, not its end offset:
# the 4-character span starting at offset 166 of the cleaned text.
Extractor(sample).find_pos(166, 4)  # answer='CODE'

197

In [8]:
# Extract the cleaned text of every post body (progress shown by tqdm)
# and store the result as a new column.
clean_bodies = [Extractor(body).text for body in tqdm(posts_df.Body)]

posts_df['CleanBody'] = clean_bodies

100%|██████████| 740551/740551 [10:49<00:00, 1140.53it/s]
