# EPAM

In [1]:
import re
import copy
import warnings
import pandas as pd

from tqdm import tqdm
from lazy import lazy
from bs4 import BeautifulSoup
from difflib import SequenceMatcher
from IPython.core.display import HTML
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Relative path to the posts CSV that the next cell loads.
posts_file = 'NLP_dataset_final/Posts.csv'

In [3]:
# Load the posts CSV, decoding it as ISO-8859-1; `%time` reports how long the read takes.
%time posts_df = pd.read_csv(posts_file, encoding='ISO-8859-1')

CPU times: user 12.4 s, sys: 1.07 s, total: 13.5 s
Wall time: 13.9 s




In [None]:
#         before_cnum = self.text[:pos].split().count(self.code_replacer)
#         answer = self.text[pos: pos + len_]
#        
#         parts = []
#         for pans in answer.split():
#             if pans == self.code_replacer:
#                 parts.append(self._codes[before_cnum])
#                 before_cnum += 1
#             else:
#                 parts.append(pans)
#         answer = ' '.join(parts)
#        
#         node = self._find_node(self._soup, answer)
#         return str(node if node is not None else self._soup)

#     def _find_node(self, node, text):
#         score = self._cmp(node.get_text(), text)
#         if score > self.treshold:
#             return node
#         else:
#             for child in node.findChildren():
#                 ret = self._find_node(child, text)
#                 if ret is not None: return ret
#             return None

#     @staticmethod
#     def _cmp(s1, s2):
#         return SequenceMatcher(None, s1, s2).ratio()

# def _hightlight(self, node, texts):
#         pattern = re.compile(r'\b(' + '|'.join(texts) + r')\b')
#         text_to_rep = re.sub(pattern, r"<span class=“highlight”>\1</span>", node.text)
#         if node.text != text_to_rep:
#             print(text_to_rep)
#             node.replaceWith(text_to_rep)
#             return -1
#         for child in node.findChildren():
#             if self._hightlight(child, texts) == -1:
#                 return -1

In [165]:
class Extractor:
    """Convert one raw HTML post body to plain text and highlight spans of it.

    Parameters
    ----------
    raw_text : str
        Raw (possibly multiply-escaped) HTML of a post body.
    delete_code : bool, default True
        If true, every ``<code>`` element is replaced by ``code_replacer``
        in the extracted text.
    code_replacer : str, default 'CODE'
        Placeholder that stands in for a removed ``<code>`` element.
    delete_r : bool, default True
        If true, carriage returns (``'\\r'``) are stripped from the text.
    treshold : float, default 0.9
        Stored on the instance but not read by any method of this class;
        kept (including its spelling) so existing callers keep working.
    """

    def __init__(self, raw_text, delete_code=True, code_replacer='CODE',
                 delete_r=True, treshold=0.9):
        self.raw_text = raw_text
        self.delete_code = delete_code
        self.code_replacer = code_replacer
        self.delete_r = delete_r
        self.treshold = treshold

    @lazy
    def text(self):
        """Plain text of the post.

        Side effects: sets ``self._soup`` (the parsed markup) and
        ``self._codes`` (contents of every ``<code>`` element found).
        ``@lazy`` comes from the ``lazy`` package and is expected to compute
        this once per instance and cache the result.
        """
        text = self._replace_unescape(self.raw_text)
        self._soup = BeautifulSoup(text, 'lxml')

        self._codes = []
        # Placeholders are substituted in a copy so that `self._soup` is meant
        # to keep the original markup for `highlight`.
        new_soup = copy.copy(self._soup)
        self._change_code(new_soup)

        text = str(new_soup.get_text())
        return text.replace('\r', '') if self.delete_r else text

    def highlight(self, pos, len_):
        """Return the post's HTML with the span ``text[pos:pos + len_]`` highlighted.

        The span is split on whitespace; runs of words between standalone
        ``code_replacer`` tokens become phrases. Every occurrence of a phrase
        in the serialized ``self._soup`` is wrapped in
        ``<span class="highlight">…</span>``. Phrases are matched literally
        against the markup, so a phrase only matches where it appears there
        verbatim (same spacing, no intervening tags).

        Parameters
        ----------
        pos : int
            Start offset of the span in ``self.text``.
        len_ : int
            Length of the span in characters.

        Returns
        -------
        str
            Serialized HTML; returned unchanged when the span contains no
            words to highlight.
        """
        # Accessing `self.text` first guarantees `self._soup` has been built.
        answer = self.text[pos: pos + len_]

        phrases, words = [], []
        for token in answer.split():
            if token != self.code_replacer:
                words.append(token)
            elif words:
                phrases.append(' '.join(words))
                words = []
        if words:
            phrases.append(' '.join(words))

        html = str(self._soup)
        if not phrases:
            # An empty alternation would match at every position and litter
            # the markup with empty spans, so there is nothing to do here.
            return html

        # Longest phrase first: regex alternation takes the first alternative
        # that matches, so a short phrase must not pre-empt a longer one that
        # starts with the same words.
        alternatives = sorted(set(phrases), key=len, reverse=True)
        # `re.escape` makes punctuation from the post literal. The lookarounds
        # act like `\b` for phrases that start/end with a word character but,
        # unlike `\b`, still allow phrases that start/end with punctuation.
        pattern = re.compile(
            r'(?<!\w)(' + '|'.join(re.escape(p) for p in alternatives) + r')(?!\w)'
        )
        return pattern.sub(r'<span class="highlight">\1</span>', html)

    def _change_code(self, tag):
        """Record the text of every ``<code>`` descendant of ``tag`` in ``self._codes``.

        When ``self.delete_code`` is true, each such element is also replaced
        in the tree by the ``self.code_replacer`` string.
        """
        # `find_all` returns a materialized list, so replacing nodes while
        # looping over it is safe.
        for child in tag.find_all('code'):
            self._codes.append(child.text)
            if self.delete_code:
                child.replace_with(self.code_replacer)

    @staticmethod
    def _replace_unescape(text, unescape_dict={'&lt;': '<', '&gt;': '>', '&amp;': '&'}):
        """Replace the entities in ``unescape_dict`` until the text stops changing.

        Repeating the pass unwinds multiply-escaped input, e.g.
        ``'&amp;lt;'`` -> ``'&lt;'`` -> ``'<'``. The default mapping is only
        read, never mutated.
        """
        def round_(text):
            # One pass: apply every entity -> character substitution once.
            for k, v in unescape_dict.items():
                text = text.replace(k, v)
            return text

        old_text, text = text, round_(text)
        while old_text != text:
            old_text, text = text, round_(text)

        return text


class Preprocessor(TransformerMixin, BaseEstimator):
    """Stateless scikit-learn transformer that maps raw HTML bodies to plain text.

    Each input item is passed to ``Extractor`` with its default settings and
    replaced by the resulting ``.text``.
    """

    def __init__(self):
        # Switch warning off for `bs4`.
        # NOTE: this changes the process-wide warning filters, not just this instance.
        warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

    def fit(self, X, y=None):
        """No-op: there is nothing to learn. Present so the class works in a
        ``Pipeline`` and with ``fit_transform``.

        Returns
        -------
        self
        """
        return self

    def transform(self, X):
        """Return a list with the extracted plain text of every item in ``X``."""
        return [Extractor(x).text for x in X]

In [169]:
# Pull out a single raw post body to use as a worked example, then display it.
sample = posts_df.Body[1488]
sample

'<p>Consider the following classes:</p>\r\r\r\r\n\r\r\r\r\n<pre><code>@Entity\r\r\r\r\npublic class MyDomain{\r\r\r\r\n\r\r\r\r\n    @Id\r\r\r\r\n    @GeneratedValue(strategy = GenerationType.AUTO)\r\r\r\r\n    private Long id;\r\r\r\r\n\r\r\r\r\n    @OneToOne\r\r\r\r\n    private AnotherDomain anotherDomain;\r\r\r\r\n\r\r\r\r\n    //getters and setters here\r\r\r\r\n}\r\r\r\r\n</code></pre>\r\r\r\r\n\r\r\r\r\n<hr>\r\r\r\r\n\r\r\r\r\n<pre><code>@Repository\r\r\r\r\npublic MyDomainDao extends DaoBase&lt;MyDomain&gt;{\r\r\r\r\n\r\r\r\r\n    public List&lt;MyDomain&gt; doSomething(AnotherDomain parameter){\r\r\r\r\n        //code does something here\r\r\r\r\n    }\r\r\r\r\n\r\r\r\r\n}\r\r\r\r\n</code></pre>\r\r\r\r\n\r\r\r\r\n<hr>\r\r\r\r\n\r\r\r\r\n<pre><code>public class DaoBase&lt;I&gt;{\r\r\r\r\n\r\r\r\r\n    @Autowired\r\r\r\r\n    private SessionFactory sessionFactory;\r\r\r\r\n\r\r\r\r\n    public void save(I object){\r\r\r\r\n        sessionFactory.getCurrentSession().saveOrUpdate

In [170]:
# Plain-text view of the sample: markup stripped, each <code> element replaced by the default 'CODE' placeholder.
Extractor(sample).text

'Consider the following classes:\nCODE\n\nCODE\n\nCODE\n\nCODE\nTo summarize, I have a simple entity class (CODE) and a domain dao (CODE) that extends from the super class CODE.  It is in this super class where the session for the persistence is called and it is also the responsibility of this super class to save/update/delete the entity class.  By virtue of inheritance, subclasses only has to define child-specific methods.\nThe problem begins when I run the unit/integration test CODE.  I wanted to test that the method CODE defined in CODE.  In order to do that, I need to five test rows in the database (I am using HSQLDB in-memory), thus the loop in the method CODE.  What is strange about the loop is that I get this error on the second iteration:\n\nERROR JDBCExceptionReporter - integrity constraint violation: unique constraint or index violation; SYS_CT_10231 table: MyDomain \n\nIt cannot get any cryptic than that.  I know that an ID is generated on the first iteration. Why am I getti

In [168]:
# Slice the cleaned text at an answer span given as (start offset, length) = (152, 13).
Extractor(sample).text[152: 152 + 13]  # answer='double quotes'

'double quotes'

In [148]:
# Return the sample's HTML with the span (start offset 152, length 13) wrapped in highlight markup.
Extractor(sample).highlight(152, 13)

'<html><body><p>The below code:</p>\n<pre><code>public void insertNewStudent(int id, String pass, String fname, String lname, String   street, String city, String state, int Zip, String Email, double GPA) {\r\r\r\r\n    try {\r\r\r\r\n    Class.forName("sun.jdbc.odbc.JdbcOdbcDriver");\r\r\r\r\n    conn = DriverManager.getConnection("jdbc:odbc:RegistrationDB", "", "");\r\r\r\r\n    String query = "INSERT INTO Students (ID, Password, FirstName, LastName, Street, City, State, Zip, EMail, GPA)" + "VALUES (?,?,?,?,?,?,?,?,?,?)";\r\r\r\r\n    PreparedStatement ps = conn.prepareStatement(query);\r\r\r\r\n    ps.setInt(1, id);\r\r\r\r\n    ps.setString(2, pass);\r\r\r\r\n    ps.setString(3, fname);\r\r\r\r\n    ps.setString(4, lname);\r\r\r\r\n    ps.setString(5, street);\r\r\r\r\n    ps.setString(6, city);\r\r\r\r\n    ps.setString(7, state);\r\r\r\r\n    ps.setInt(8, Zip);\r\r\r\r\n    ps.setString(9, Email);\r\r\r\r\n    ps.setDouble(10, GPA);\r\r\r\r\n    ps.executeUpdate();\r\r\r\r\n    }

In [10]:
# Extract plain text from every post body (tqdm reports progress),
# then attach the result to the frame as a new column.
clean_bodies = [Extractor(raw_body).text for raw_body in tqdm(posts_df.Body)]
posts_df['CleanBody'] = clean_bodies

100%|██████████| 740551/740551 [18:06<00:00, 681.49it/s]
