## Imports

In [15]:
import math
import pickle
import re

import numpy as np
import pandas as pd
import spacy
# This module creates progress bars that display nicely in notebooks, but is not necessary.
# from tqdm import tqdm_notebook

# It's important to use the large (lg) model to get
# good word similarities.
# `nlp` is the shared pipeline object that the Post and QuestionMatcher
# classes below call to parse titles, tool lists and questions.
nlp = spacy.load('en_core_web_lg')

## Classes

In [16]:
class Post(object):
    """One how-to article split into its wiki-markup sections.

    Built from an ``entry`` whose first item is the title and whose second
    item is the raw article text. Section bodies are located by their
    ``== Name ==`` headers (case-insensitive) and stored as lists of
    stripped bullet strings, or ``None`` when the section is absent.

    Attributes:
        title: entry[0].
        text: entry[1], the raw markup.
        category: text following ``[[category:`` up to the first
            whitespace, backslash or ``]``; ``None`` if there is no tag.
        intro: everything before the last ``[[category`` tag, or ``None``.
        steps, tips, warnings, tools, related_posts: list of str or ``None``.
    """

    @staticmethod
    def decode(regex, sep=None):
        """Turn a regex match into a string or a list of cleaned items.

        Args:
            regex: a match object, or ``None`` when the search failed.
            sep: if given, the matched text is split on this separator.

        Returns:
            ``None`` if ``regex`` is ``None``; the full matched text if
            ``sep`` is ``None``; otherwise the non-empty, stripped pieces.
        """
        if regex is None:
            return None
        matched = regex.group(0)
        if sep is None:
            return matched
        # Strip first, then filter, so whitespace-only fragments (e.g. an
        # empty bullet) are dropped instead of surviving as ''.
        pieces = (piece.strip() for piece in matched.split(sep))
        return [piece for piece in pieces if piece]

    def __init__(self, entry):
        """Parse ``entry`` (title, text) into the section attributes."""

        def section_regex(section):
            # Lazily match everything after "<section> ==" up to the next
            # "== " header or the end of the text.
            return r'(?<= {} \=\=)[\s\S]+?(?=\=\= |\Z)'.format(section)

        def section_items(section, sep):
            found = re.search(section_regex(section), self.text, re.M | re.I)
            return Post.decode(found, sep=sep)

        self.title = entry[0]
        self.text = entry[1]
        # Search this post's own text; the previous code read an undefined
        # name here and only worked if a stray global happened to exist.
        cat = re.search(r'(?<=\[\[category:)[^\s\\\]]*', self.text, flags=re.I)
        self.category = cat.group(0) if cat is not None else None
        self.intro = Post.decode(re.search(r'[\s\S]+(?=\[\[category)', self.text, re.M | re.I))
        self.steps = section_items('steps', '\n#')
        self.tips = section_items('tips', '\n*')
        self.warnings = section_items('warnings', '\n*')
        self.tools = section_items('you\'ll need', '\n*')
        self.related_posts = section_items('related wikihows', '\n*')

    @property
    def get_task(self):
        """Describe the title as a ``(actions, objects)`` pair of lists.

        Without a quoted phrase in the title: the lemmas of all VERB tokens
        and the lemmas of all noun chunks. With a quoted phrase: the phrase
        itself (quotes removed) is the single action, and the objects are
        the lemmas of NOUN/PROPN tokens whose text does not occur inside
        that phrase.
        """
        lowered = self.title.lower()
        subject_match = re.search(r'["\'](.+)["\']', lowered)
        doc = nlp(lowered)

        if subject_match is None:
            verbs = [w.lemma_ for w in doc if w.pos_ == 'VERB']
            chunks = [t.lemma_ for t in doc.noun_chunks]
            return (verbs, chunks)

        action = re.sub(r'["\']', '', subject_match.group(0))
        nouns = [t.lemma_ for t in doc
                 if (t.pos_ == 'NOUN' or t.pos_ == 'PROPN') and t.text not in action]
        return ([action], nouns)

    @property
    def get_tool_nouns(self):
        """Set of noun-chunk lemmas found in the tools list, or ``None``.

        Returns ``None`` when the post has no tools section.
        """
        if self.tools is None:
            return None
        # Loop variable deliberately not called `np`, to avoid shadowing
        # the numpy alias used elsewhere in the notebook.
        return set(chunk.lemma_ for tool in self.tools for chunk in nlp(tool).noun_chunks)

In [25]:
class QuestionMatcher(object):
    """Answer "what do I need to ...?" questions from a list of posts.

    Every post title is reduced to a task key (a flat tuple of action
    words followed by object words). A question is reduced the same way
    and answered with the tool nouns of the post(s) sharing that key, or,
    failing an exact hit, of the key with the highest mean word similarity.

    Attributes:
        answer_dict: task key -> set of tool noun lemmas.
        task_title_dict: task key -> list of post titles with that key.
        phrase_to_doc_dict: word/phrase -> its parsed ``nlp`` document.
    """

    def __init__(self, post_list, tools_only=True):
        """Index ``post_list`` by task key.

        Args:
            post_list: objects exposing ``title``, ``tools``, ``get_task``
                and ``get_tool_nouns``.
            tools_only: when true, posts whose ``tools`` is ``None`` are
                ignored.
        """
        if tools_only:
            post_list = [p for p in post_list if p.tools is not None]
        self.answer_dict = {}
        print("Parsing and matching questions and answers...")
        self.task_title_dict = {}
        self.phrase_to_doc_dict = {}

        for p in post_list:
            task = QuestionMatcher._flatten_task(p.get_task)
            # get_tool_nouns is None for a post without a tools section
            # (reachable with tools_only=False); store an empty set so a
            # later post with the same key can still be merged in.
            tool_nouns = set(p.get_tool_nouns or ())

            if task in self.answer_dict:
                self.answer_dict[task].update(tool_nouns)
                self.task_title_dict[task].append(p.title)
            else:
                self.answer_dict[task] = tool_nouns
                self.task_title_dict[task] = [p.title]

            # Fill out dictionary of parsed words and phrases
            for w in task:
                if w not in self.phrase_to_doc_dict:
                    self.phrase_to_doc_dict[w] = nlp(w)

    @staticmethod
    def _flatten_task(parts):
        """Collapse an ``(actions, objects)`` pair into one hashable key.

        Duplicates are removed within each group, and each group is sorted
        so that the same words always yield the same tuple (plain set
        iteration order is not guaranteed to be stable).
        """
        actions, objects = parts
        return tuple(sorted(set(actions))) + tuple(sorted(set(objects)))

    @staticmethod
    def get_task(sentence):
        """Reduce a free-text sentence to a task key.

        Without a quoted phrase: VERB lemmas followed by noun-chunk lemmas.
        With a quoted phrase: the phrase (quotes removed) followed by the
        lemmas of NOUN/PROPN tokens whose text is not inside the phrase.
        """
        lowered = sentence.lower()
        subject_match = re.search(r'["\'](.+)["\']', lowered)
        doc = nlp(lowered)

        if subject_match is None:
            parts = ([w.lemma_ for w in doc if w.pos_ == 'VERB'],
                     [t.lemma_ for t in doc.noun_chunks])
        else:
            action = re.sub(r'["\']', '', subject_match.group(0))
            parts = ([action],
                     [t.lemma_ for t in doc
                      if (t.pos_ == 'NOUN' or t.pos_ == 'PROPN') and t.text not in action])
        return QuestionMatcher._flatten_task(parts)

    def ask(self, question):
        """Return the tool nouns for the post best matching ``question``.

        Returns:
            The stored set for an exact key match; otherwise the set of
            the most similar key of equal length. ``None`` (after printing
            "No idea!") when the question yields no words, no key has the
            same length, or no candidate scores above zero.
        """
        parsed_question = QuestionMatcher.get_task(question)
        if len(parsed_question) == 0:
            print("No idea!")
            return None

        exact = self.answer_dict.get(parsed_question)
        if exact is not None:
            return exact

        words_q_doc = [nlp(w) for w in parsed_question]
        # Only keys of equal length can be compared word-by-word.
        candidates = [key for key in self.answer_dict if len(key) == len(parsed_question)]
        if len(candidates) == 0:
            print("No idea!")
            return None

        best_sim = 0
        best_match = None
        for tup in candidates:
            sim = np.mean([q.similarity(self.phrase_to_doc_dict[a])
                           for q, a in zip(words_q_doc, tup)])
            if sim > best_sim:
                best_sim = sim
                best_match = tup

        # Every candidate can score <= 0; without this guard the title
        # lookup below would be attempted with a None key.
        if best_match is None:
            print("No idea!")
            return None

        print('Total similarity: {}'.format(best_sim))
        print("Matched on {}".format(' and '.join(self.task_title_dict[best_match])))
        return self.answer_dict[best_match]
            
        
        

## Read Posts

In [26]:
# NOTE(review): pickle.load can execute arbitrary code, so only load a file
# you produced yourself. The path is machine-specific; adjust it to wherever
# posts.p lives. The elements are presumably Post instances (later cells read
# .tools and .category), in which case the Post class must already be defined.
# Raw string: keeps the Windows backslashes literal instead of relying on
# them not forming valid escape sequences.
POSTS_PATH = r"C:\Programming\Python\Programs_NLP\posts.p"
# Context manager closes the file handle once the posts are loaded.
with open(POSTS_PATH, 'rb') as posts_file:
    post_list = pickle.load(posts_file)

## Begin Parsing Post Language

In [27]:
# Keep only the posts for which a tools section was found.
tool_posts = [post for post in post_list if post.tools is not None]

In [32]:
# Index the tool-bearing posts by task. Construction runs the nlp pipeline
# on every title and tool entry. (tools_only defaults to True, so the
# matcher would apply the same "has tools" filter on its own.)
robot = QuestionMatcher(tool_posts)

Parsing and matching questions and answers...





In [33]:
# Paraphrased query: there is no exact key, so the answer comes from the
# most similar stored task.
robot.ask('clean my clothes')


Total similarity: 0.8173856195757754
Matched on Wash Your Clothes


{'a bucket / sink',
 'a dryer',
 'a washing machine',
 'bleach',
 'clothe',
 'color - safe bleach',
 'detergent',
 'dryer sheet',
 'fabric softener',
 'the damp clothe'}

In [34]:
# Near-identical wording to an existing title; still resolved through the
# similarity search rather than an exact key hit.
robot.ask('paint your room')


Total similarity: 0.9327035024469195
Matched on Paint a Room


{"' ' ' bristle paint brush",
 "' ' ' cloth rag",
 "' ' ' dropcloth",
 "' ' ' indoor paint",
 "' ' ' indoor primer",
 "' ' ' masking tape",
 "' ' ' painting clothe",
 "' ' ' paper cup",
 "' ' ' roller handle",
 "' ' ' screwdriver(s",
 "' ' ' spackle",
 "' ' ' stepladder",
 "' ' ' work light",
 "* ' ' ' edge roller",
 "* ' ' ' edging pad",
 "* ' ' ' spray paint",
 "* a ' ' ' paint roller",
 "* a ' ' ' paint sprayer",
 '* durable plastic tarp',
 '* traditional',
 '-PRON-',
 '-PRON- 5 in 1 tool',
 '-PRON- clothe',
 '-PRON- floor',
 '-PRON- hair',
 '-PRON- hand',
 '-PRON- ladder',
 '-PRON- nice new paint',
 '-PRON- paintbrush',
 '-PRON- plastic',
 '-PRON- tarp',
 '-PRON- time',
 '-PRON- tool',
 '-PRON- wall',
 '2 " tape',
 '2 ladder',
 '3 to 6 inch',
 '= = = optional tools===',
 'a " ceiling edge pad',
 "a ' ' ' brush spinner",
 "a ' ' ' paint tray",
 "a ' ' ' painter 's 5-in-1 tool",
 "a ' ' ' sink",
 "a ' ' ' sponge mop",
 'a 2"x12 " plank',
 'a 4 foot section',
 'a 5 gallon bucket',
 'a

In [35]:
# Differs from the stored title only in its articles ("the" vs "a").
robot.ask('fix a hole in the wall')


Total similarity: 0.9619860490105135
Matched on Fix a Hole in a Wall


{'120 grit sandpaper',
 'a good six or four inch putty knife',
 'standard wallboard joint compound',
 'the block or sanding device'}

In [36]:
# No similarity line is printed for this query: ask() returns directly when
# the parsed question is already a key in answer_dict.
robot.ask('play video games')

{'-PRON- local video game / electronic store',
 'a game',
 'a game system(handheld',
 'computers count',
 'home system',
 'one or more controller'}

In [37]:
# Limitation demo: ask() has no minimum-similarity cutoff, so a query with
# no relevant post is still answered from the nearest key.
robot.ask('find a girlfriend')


Total similarity: 0.8844088023403976
Matched on Find a Skatespot


{'creativity',
 'imagination',
 'ledge',
 'safety gear',
 'skateboard',
 'skateboard wax'}

## Isolate Industry-Related Posts

The category selection below was made by hand, after investigating which categories apply best to our target audience.

In [48]:
# Collect every distinct category once, in first-seen order. The order
# matters: the drop() calls that follow refer to rows by their position in
# this list. A companion set makes the "already seen?" check O(1) instead of
# re-scanning the growing list for every post.
seen_categories = set()
cat_list = []
for post in post_list:
    if post.category not in seen_categories:
        seen_categories.add(post.category)
        cat_list.append(post.category)
cat_df = pd.DataFrame(cat_list)
# First pass: remove hand-picked categories. drop() is given row labels, and
# cat_df has the default integer index, so each number is a position in
# cat_list. NOTE(review): these labels are only valid for the exact ordering
# of post_list they were chosen against; a different or re-ordered posts file
# would silently drop the wrong categories.
cat_df_2 = cat_df.drop([0,1,2,4,5,6,7,8,11,21,26,12,14,16,18,23,28,25,30,33,35,38,40,43,46,47,48,49,50,55,58,59,60,61,62,100,102,111,110,106,104,105,108,112,118,119,120,127,128,141,142,143,144,145,146,147,148,164,166,167,168,177,178,179,180,181,188,189,190,139,158,155,130,133,374,375,376,377,378,379,381,383,384,389,393,397,398,403,408,409,411,420,460,462,466,469,472,473,474,475,476,478,482,484,486,487,495,496,504,506,509,818,841,853,816,817,823,824,826,820,819,827,831,834,835,837,838,839,840,848,851,852,856,857,858,859,860,861,862,868,863,864,907,908,869,906,911,917,918,919,923,927,928,930,932,934,935,937,1365,1372,1374,1376,1377,1380,1381,1384,1387,1389,1390,1391,1394,1395,1400,1401,1402,1403,1407,1416,1420,1421,1422,1423,1424,1425,1426,1429,1430,1431,1433,1434,1435,1692,1693,1695,1696,1699,1701,1702,1705,1707,1708,1714,1716,1717,1718,1720,1721,1723,1724,1725,1726,1727,1728,1729,1730,1731,1732,1733,1734,1735,1736,1737,1739,1740,1741,1742,1743,1744,1745,1746,1747,1748,1749,1751,1760,1762,1764,1768,1769,1771,1773,1774,1775,1777,1781,1785,1787,1788,1791,1792,1795,1797,1801,1803,1805,1807,1808,1809,1813,1815,1817,1821,1822,1823,1824,1828,1832,1831,1833,1835,1851,1855,1852,1853,1854,1845,1856,1857,1858,1859,1860,1861,1862,1863,1864,1865,1866,1869,1841,1991,1993,1999,2000,2001,2002,2003,2006,2007,2009,2012,2011,2013,2014,2015,2016,2017,2018,2020,2024,2026,2028,2030,2033,2036,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050,2051,2054,2055,2056,2058,2059,2060,2061,2063,2064])
# Second pass: further hand-picked labels removed from the first result.
# Labels are not renumbered between passes, so these still refer to the
# original positions in cat_list.
cat_df_3 = cat_df_2.drop([13,79,80,71,72,68,56,73,74,85,87,88,89,90,91,92,93,94,95,98,124,154,160,161,201,211,212,213,215,219,222,223,224,227,229,231,235,237,241,242,243,251,254,257,260,262,263,264,266,269,273,274,275,277,280,283,286,287,288,289,291,292,295,300,301,305,306,307,309,316,317,318,323,324,325,326,328,329,330,331,332,333,334,335,336,337,339,341,345,346,347,348,349,350,352,353,354,355,357,361,362,363,364,366,367,368,369,371,372,373,387,392,396,400,404,405,422,423,427,432,433,434,436,438,430,440,441,442,444,457,458,459,488,512,514,515,517,518,521,522,523,526,529,530,532,534,537,538,542,543,544,546,547,550,553,554,555,556,560,564,565,566,568,570,577,579,580,582,588,593,594,596,599,1804,1819,1829,1836,1839,1840,1844,1846,1872,1875,1881,1882,1884,1885,1886,1887,1888,1889,1890,1894,1897,1899,1900,1904,1906,1907,1908,1910,1911,1912,1914,1915,1916,1918,1921,1922,1923,1924,1929,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941,1942,1943,1945,1946,1948,1951,1952,1953,1954,1955,1956,1957,1958,1960,1961,1962,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1979,1980,1981,1983,1984,1988,1990])
# Slice used while reviewing which labels to drop next (left disabled):
#cat_df_3[280:340]
# Bare expression in the middle of a cell: evaluated but not displayed.
len(cat_df_3)
# Author's note on the row count at this point: "puts us at 1471".
# Final pass: ten more hand-picked labels removed to give the category
# table used for filtering posts.
cat_df_final = cat_df_3.drop([605,614,617,616,619,618,626,632,670,676])


In [49]:
# Build the category lookup once. Testing membership against the raw column
# values inside the comprehension re-scanned the whole array for every post;
# a set gives the same answer with a constant-time check.
industry_categories = set(cat_df_final[0])
industry_posts = [p for p in tool_posts if p.category in industry_categories]

In [None]:
# Same matcher as `robot`, but indexed only over posts whose category
# survived the manual filtering above.
industry_qa = QuestionMatcher(industry_posts)

Parsing and matching questions and answers...
