## Imports

In [1]:
import math
from tqdm import tqdm_notebook
import re
import pickle
import spacy
# Load spaCy's 'en_core_web_sm' pipeline once at module level; the Post
# properties defined below call nlp(...) on titles and tool entries.
nlp = spacy.load('en_core_web_sm')

## Classes

In [80]:
class Post(object):
    """One article parsed from wiki-style markup into its named sections.

    The section headings searched for ('steps', 'tips', 'warnings',
    "you'll need", 'related wikihows') suggest the input is a wikiHow dump
    -- confirm against the data source.

    Attributes set by ``__init__``:
        title: ``entry[0]``, unchanged.
        text: ``entry[1]``, the raw markup.
        category: text following the first ``[[Category:`` tag, or None.
        intro: everything before the last ``[[category`` tag, or None.
        steps, tips, warnings, tools, related_posts: list of stripped,
            non-empty items of the matching section, or None when the
            section is absent.
    """

    # Flags shared by every section search; re.I lets '== Steps ==' match
    # the lower-case heading names used in the patterns.
    _FLAGS = re.M | re.I

    @staticmethod
    def decode(regex, sep=None):
        """Turn a regex match into a string or a list of cleaned items.

        Args:
            regex: a match object as returned by ``re.search``, or None.
            sep: optional separator; when given, the matched text is split
                on it.

        Returns:
            None when ``regex`` is None; the full matched text when ``sep``
            is None; otherwise the list of stripped, non-empty fragments.
        """
        if regex is None:
            return None
        matched = regex.group(0)
        if sep is None:
            return matched
        # Strip before filtering so whitespace-only fragments (e.g. a blank
        # line right after the heading) are dropped instead of being kept
        # as empty strings.
        stripped = (fragment.strip() for fragment in matched.split(sep))
        return [fragment for fragment in stripped if fragment]

    def _section(self, heading, sep):
        """Return the items of the ``== heading ==`` section, or None."""
        # Lazily capture everything after ' <heading> ==' up to the next
        # '== ' heading marker or the end of the text.
        pattern = r'(?<= {} \=\=)[\s\S]+?(?=\=\= |\Z)'.format(heading)
        return Post.decode(re.search(pattern, self.text, Post._FLAGS), sep=sep)

    def __init__(self, entry):
        """Parse a post from ``entry``.

        Args:
            entry: indexable whose item 0 is the title and item 1 is the
                raw markup text.
        """
        self.title = entry[0]
        self.text = entry[1]

        # Search the entry's own text (previously this read an undefined
        # name, which raised NameError or picked up a stale global).
        cat = re.search(r'(?<=\[\[category:)[^\s\\\]]*', self.text, flags=re.I)
        self.category = cat.group(0) if cat is not None else None

        self.intro = Post.decode(
            re.search(r'[\s\S]+(?=\[\[category)', self.text, Post._FLAGS))
        # Steps are '#'-prefixed lines; the other sections use '*' bullets.
        self.steps = self._section('steps', '\n#')
        self.tips = self._section('tips', '\n*')
        self.warnings = self._section('warnings', '\n*')
        self.tools = self._section('you\'ll need', '\n*')
        self.related_posts = self._section('related wikihows', '\n*')

    @property
    def get_task(self):
        """Split the lower-cased title into a quoted action and its nouns.

        Returns:
            ``(None, lemmas)`` with the lemma of every noun chunk when the
            title has no quoted span; otherwise ``(action, lemmas)`` where
            ``action`` is the quoted span with all quote characters removed
            and ``lemmas`` are the lemmas of NOUN/PROPN tokens whose text
            does not occur inside ``action``.
        """
        lowered = self.title.lower()
        # NOTE(review): apostrophes count as quotes here, so contractions
        # in a title may be picked up as a quoted span -- confirm intent.
        subject_match = re.search(r'["\'](.+)["\']', lowered)
        doc = nlp(lowered)

        if subject_match is None:
            return (None, [chunk.lemma_ for chunk in doc.noun_chunks])

        action = re.sub(r'["\']', '', subject_match.group(0))
        # `not in action` is a substring test against the action string.
        nouns = [token.lemma_ for token in doc
                 if token.pos_ in ('NOUN', 'PROPN') and token.text not in action]
        return (action, nouns)

    @property
    def get_tool_nouns(self):
        """Return the set of noun-chunk lemmas across all tool entries.

        Returns None when the post has no tools section.
        """
        if self.tools is None:
            return None
        return {chunk.lemma_
                for tool in self.tools
                for chunk in nlp(tool).noun_chunks}

## Read Posts

In [81]:
# NOTE: unpickling can execute arbitrary code; only load a posts file you trust.
# The raw string keeps the Windows backslashes literal without relying on
# invalid escape sequences, and `with` closes the file handle after loading.
with open(r"C:\Programming\Python\Programs_NLP\posts.p", 'rb') as posts_file:
    post_list = pickle.load(posts_file)

## Begin Parsing Post Language

In [82]:
# Keep only the posts whose `tools` attribute is not None.
tool_posts = list(filter(lambda post: post.tools is not None, post_list))