## Imports

In [80]:
import math
from tqdm import tqdm_notebook
import re
import pickle
import spacy
import numpy as np
nlp = spacy.load('en_core_web_sm')

## Classes

In [2]:
class Post(object):
    """One wiki-style how-to article, split into its named sections.

    Attributes set by ``__init__``:
        title: ``entry[0]``.
        text: ``entry[1]``, the raw markup the sections are extracted from.
        category: text following the first ``[[Category:`` tag, or None.
        intro: everything before the last ``[[category`` tag, or None.
        steps, tips, warnings, tools, related_posts: list of stripped,
            non-empty items of the matching ``== <section> ==`` block, or
            None when the section is absent.
    """

    @staticmethod
    def decode(regex, sep=None):
        """Turn a regex match into its text, or into a list of items.

        Args:
            regex: an ``re`` match object, or None when nothing matched.
            sep: optional separator; when given, the matched text is split
                on it and each piece is stripped.

        Returns:
            None if ``regex`` is None; the matched string if ``sep`` is None;
            otherwise the list of stripped, non-empty pieces.
        """
        if regex is None:
            return None
        matched = regex.group(0)
        if sep is None:
            return matched
        # Strip first, then drop empties, so whitespace-only items
        # (e.g. a bare "* " bullet) do not survive as '' entries.
        stripped = (piece.strip() for piece in matched.split(sep))
        return [piece for piece in stripped if piece]

    def __init__(self, entry):
        """Parse ``entry`` (an indexable ``(title, text)`` pair) into sections."""

        def section_regex(section):
            # Lazily capture everything after " <section> ==" up to the next
            # "== " heading or the end of the text.
            return r'(?<= {} \=\=)[\s\S]+?(?=\=\= |\Z)'.format(section)

        flags = re.M | re.I

        self.title = entry[0]
        self.text = entry[1]
        # Search this post's own text; the previous code read an undefined
        # global (`val`) and only worked with leftover kernel state.
        cat = re.search(r'(?<=\[\[category:)[^\s\\\]]*', self.text, flags=re.I)
        self.category = cat.group(0) if cat is not None else None
        self.intro = Post.decode(re.search(r'[\s\S]+(?=\[\[category)', self.text, flags))
        self.steps = Post.decode(re.search(section_regex('steps'), self.text, flags), sep='\n#')
        self.tips = Post.decode(re.search(section_regex('tips'), self.text, flags), sep='\n*')
        self.warnings = Post.decode(re.search(section_regex('warnings'), self.text, flags), sep='\n*')
        self.tools = Post.decode(re.search(section_regex('you\'ll need'), self.text, flags), sep='\n*')
        self.related_posts = Post.decode(re.search(section_regex('related wikihows'), self.text, flags), sep='\n*')

    @property
    def get_task(self):
        """Return ``(actions, subjects)`` extracted from the lower-cased title.

        If the title contains text enclosed by quote characters (``"`` or
        ``'``), that span with the quote characters removed is the single
        action, and the subjects are the lemmas of NOUN/PROPN tokens whose
        text is not a substring of the action. Otherwise the actions are the
        lemmas of all VERB tokens and the subjects are the lemmas of the
        noun chunks.
        """
        lowered = self.title.lower()
        subject_match = re.search(r'["\'](.+)["\']', lowered)
        doc = nlp(lowered)

        if subject_match is None:
            return ([w.lemma_ for w in doc if w.pos_ == 'VERB'],
                    [t.lemma_ for t in doc.noun_chunks])
        action = re.sub(r'["\']', '', subject_match.group(0))
        return ([action],
                [t.lemma_ for t in doc
                 if (t.pos_ == 'NOUN' or t.pos_ == 'PROPN') and t.text not in action])

    @property
    def get_tool_nouns(self):
        """Return the set of noun-chunk lemmas over all tools, or None if the post has no tools section."""
        if self.tools is None:
            return None
        # `chunk` rather than `np`, so the numpy alias is not shadowed.
        return {chunk.lemma_ for tool in self.tools for chunk in nlp(tool).noun_chunks}

## Read Posts

In [3]:
# Load the pickled posts. The path is a raw string (same value as before, but
# without relying on invalid escape sequences such as "\P"), and the file is
# closed as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): presumably the pickle holds Post instances, in which case the
# Post class must already be defined when this cell runs - confirm.
with open(r"C:\Programming\Python\Programs_NLP\posts.p", 'rb') as posts_file:
    post_list = pickle.load(posts_file)

## Begin Parsing Post Language

In [4]:
# Keep only the posts whose `tools` attribute is set.
tool_posts = list(filter(lambda post: post.tools is not None, post_list))

### Start with exact matching

In [5]:
# Inspect the (actions, subjects) pair parsed from one sample post's title.
tool_posts[4000].get_task

(['become'], ['an exchange student', 'germany'])

In [6]:
# Parse the same sample title with spaCy for ad-hoc inspection.
# NOTE(review): `t` is not used by any later cell visible here.
t = nlp(tool_posts[4000].title)

In [13]:
# Inspect the noun-chunk lemmas extracted from the sample post's tools list.
tool_posts[4000].get_tool_nouns

{'camera',
 'fundraiser',
 'german language class',
 'good grade',
 'google hangout or skype account',
 'journal',
 'laptop',
 'money',
 'parental approval',
 'part - time job',
 'scholarship'}

In [99]:
class QuestionMatcher(object):
    """Answer "what do I need to ...?" questions from a list of posts.

    Each post's task (actions + subjects from its title) becomes a key.
    ``answer_dict`` maps that key to the union of tool nouns of all posts
    sharing it, and ``task_title_dict`` maps it to those posts' titles.

    NOTE(review): similarity scores come from the module-level ``nlp``
    pipeline; with a small spaCy model that lacks word vectors the scores
    may be unreliable - consider a model that ships vectors.
    """

    def __init__(self,post_list, tools_only=True):
        """Index ``post_list`` by task key.

        Args:
            post_list: iterable of objects exposing ``title``, ``tools``,
                ``get_task`` and ``get_tool_nouns`` (as ``Post`` does).
            tools_only: when True, posts whose ``tools`` is None are skipped.
        """
        if tools_only:
            post_list = [p for p in post_list if p.tools is not None]
        self.answer_dict = {}
        print("Parsing and matching questions and answers...")
        self.task_title_dict = {}

        for p in tqdm_notebook(post_list):
            task = QuestionMatcher._task_key(p.get_task)
            # A post without a tools section yields None; treat it as "no
            # tools" so merging never fails and stored values are always sets.
            tool_nouns = p.get_tool_nouns or set()
            if task in self.answer_dict:
                self.answer_dict[task].update(tool_nouns)
                self.task_title_dict[task].append(p.title)
            else:
                self.answer_dict[task] = set(tool_nouns)
                self.task_title_dict[task] = [p.title]

    @staticmethod
    def _task_key(task):
        """Flatten an ``(actions, subjects)`` pair into one hashable key.

        Each group is de-duplicated and sorted, so the same words always
        produce the same tuple. Iterating a bare ``set`` has no guaranteed
        order, which made keys and word alignment unreliable.
        """
        actions, subjects = task
        return tuple(sorted(set(actions))) + tuple(sorted(set(subjects)))

    @staticmethod
    def get_task(sentence):
        """Parse ``sentence`` into a task key, using the same rules as ``Post.get_task``.

        Returns:
            A tuple of strings: sorted unique actions followed by sorted
            unique subjects. It is empty when nothing was extracted.
        """
        lowered = sentence.lower()
        subject_match = re.search(r'["\'](.+)["\']', lowered)
        doc = nlp(lowered)

        if subject_match is None:
            parts = ([w.lemma_ for w in doc if w.pos_ == 'VERB'],
                     [t.lemma_ for t in doc.noun_chunks])
        else:
            action = re.sub(r'["\']', '', subject_match.group(0))
            parts = ([action],
                     [t.lemma_ for t in doc
                      if (t.pos_ == 'NOUN' or t.pos_ == 'PROPN') and t.text not in action])
        return QuestionMatcher._task_key(parts)

    def ask(self,question):
        """Return the tool nouns for the known task best matching ``question``.

        An exact key match is returned directly. Otherwise every stored key
        with the same number of words is scored by the mean pairwise
        similarity of its words to the question's words, and the
        best-scoring key wins.

        Returns:
            A set of tool nouns, or None (after printing "No idea!") when the
            question yields no words, no stored key has the same length, or
            no candidate scores above 0.
        """
        parsed_question = QuestionMatcher.get_task(question)
        if not parsed_question:
            print("No idea!")
            return None

        exact = self.answer_dict.get(parsed_question)
        if exact is not None:
            return exact

        words_q_doc = [nlp(w) for w in parsed_question]
        candidates = [key for key in self.answer_dict if len(key) == len(parsed_question)]
        if not candidates:
            print("No idea!")
            return None

        # Many keys share words; parse each distinct word once per query.
        doc_cache = {}

        def doc_for(word):
            if word not in doc_cache:
                doc_cache[word] = nlp(word)
            return doc_cache[word]

        best_sim = 0
        best_match = None
        for tup in candidates:
            sim = np.mean([q.similarity(doc_for(a)) for q, a in zip(words_q_doc, tup)])
            if sim > best_sim:
                best_sim = sim
                best_match = tup

        # No candidate scored above 0: report it instead of failing on a
        # lookup with a None key.
        if best_match is None:
            print("No idea!")
            return None

        print('Total similarity: {}'.format(best_sim))
        print("Matched on {}".format(' and '.join(self.task_title_dict[best_match])))
        return self.answer_dict[best_match]
            
        
        

In [100]:
# Build the matcher from the first 100 tool posts only.
robot = QuestionMatcher(tool_posts[:100])

Parsing and matching questions and answers...





In [103]:
# Ask a question and show the tool nouns of the best-matching post.
robot.ask('build a house')

Total similarity: 0.8315824374943797
Matched on Accessorize a Deck


{'basket',
 'candle',
 'chair',
 'chimera',
 'hook',
 'light',
 'plant',
 'speaker',
 'table'}