In [151]:
import re

import numpy as np
import pandas as pd
import spacy

from spacy.lookups import Lookups
from spacy.lemmatizer import Lemmatizer
# Stand-alone lemmatizer built from a lookup table holding a single noun rule
# (["s", ""] -- presumably "strip a trailing s"; confirm against the spaCy docs).
# NOTE(review): neither `lookups` nor `lemmatizer` is referenced in the cells
# visible here; the lemmatize() helper further down relies on `nlp` instead.
lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
lemmatizer = Lemmatizer(lookups)

# spaCy pipeline loaded under the 'en' name; it supplies token.lemma_ for lemmatize().
# NOTE(review): `spacy.lemmatizer` and the 'en' shortcut look like spaCy 2.x APIs -- confirm the pinned version.
nlp = spacy.load('en')

# Raise the pandas display row limit to 999 so displayed frames show up to that many rows.
pd.options.display.max_rows = 999

In [152]:
# Read each category sheet of the companion-plants workbook into its own frame.
WORKBOOK_PATH = "./companion-plants.xlsx"
SHEET_NAMES = ["Vegetables", "Fruits", "Herbs", "Flowers", "Others"]

veggies_df, fruits_df, herbs_df, flowers_df, others_df = [
    pd.read_excel(WORKBOOK_PATH, sheet_name=sheet) for sheet in SHEET_NAMES
]

# Stack the sheets vertically; ignore_index is not set, so each sheet keeps
# its own original row labels in the combined frame.
combined_df = pd.concat([veggies_df, fruits_df, herbs_df, flowers_df, others_df])

In [153]:
def remove_references(x):
    """Replace bracketed numeric reference markers (e.g. ``[12]``) with a comma.

    Parameters
    ----------
    x : str
        Cell text to clean. Non-string values (NaN from empty cells, numbers,
        None) are accepted and treated as empty.

    Returns
    -------
    str
        ``x`` with every ``[digits]`` marker (an empty ``[]`` also matches)
        replaced by ``','``, or ``""`` when ``x`` is not a string.
    """
    try:
        return re.sub(r'\[[0-9]*\]', ',', x)
    except TypeError:
        # re.sub raises TypeError for non-string input; return the empty
        # string explicitly (the original fell through and returned None).
        return ""
        
def replace_paren(x):
    """Turn every parenthesis into a comma and lower-case the text.

    Parameters
    ----------
    x : str
        Cell text to clean. Non-string values (NaN from empty cells, numbers,
        None) are accepted and treated as empty.

    Returns
    -------
    str
        Lower-cased ``x`` with each ``(`` and ``)`` replaced by ``','``, or
        ``""`` when ``x`` is not a string.
    """
    try:
        x = re.sub(r'[()]', ",", x)
        return x.lower()
    except TypeError:
        # re.sub raises TypeError for non-string input; return the empty
        # string explicitly (the original fell through and returned None).
        return ""

def replace_and(x):
    """Replace the word ``and`` (surrounded by single spaces) with a comma and lower-case the text.

    The match is case-sensitive and happens before lower-casing, so only a
    literal ``" and "`` is replaced; ``" And "`` is left in place.

    Parameters
    ----------
    x : str
        Cell text to clean. Non-string values (NaN from empty cells, numbers,
        None) are accepted and treated as empty.

    Returns
    -------
    str
        Lower-cased ``x`` with each ``" and "`` replaced by ``','``, or ``""``
        when ``x`` is not a string.
    """
    try:
        x = re.sub(r' and ', ",", x)
        return x.lower()
    except TypeError:
        # re.sub raises TypeError for non-string input; return the empty
        # string explicitly (the original fell through and returned None).
        return ""


# Run the three text cleaners over every cell, in order, then turn any
# missing values that remain into empty strings.
combined_df = (
    combined_df
    .applymap(remove_references)
    .applymap(replace_paren)
    .applymap(replace_and)
    .fillna('')
)

In [154]:
def lemmatize(y):
    """Lemmatize each comma-separated entry of ``y`` with the spaCy pipeline ``nlp``.

    The text is split on commas; every piece is parsed by ``nlp`` and rebuilt
    from its tokens' lemmas joined by single spaces. Pieces whose rebuilt text
    is the empty string are dropped, and the rest are re-joined with commas.
    """
    lemmatized_entries = (
        ' '.join(token.lemma_ for token in nlp(entry))
        for entry in y.split(',')
    )
    return ','.join(entry for entry in lemmatized_entries if entry != "")

# Lemmatize every column except the scientific names. The column label is
# 'Scientific name' (lower-case "n"); the previous check compared against
# 'Scientific Name', which never matched, so that column was lemmatized too.
combined_df = combined_df.apply(
    lambda col: col if col.name == 'Scientific name' else col.apply(lemmatize)
)

In [155]:
# Build a "<scientific name> | <common name>" label for every row.
combined_df["combined_name"] = combined_df["Scientific name"] + " | " + combined_df["Common name"]

In [156]:
# Index the rows by common name so a plant can be looked up with .loc[<name>].
# (Indexing by 'combined_name' instead was considered as an alternative.)
combined_df = combined_df.set_index('Common name')

In [157]:
# Display the cleaned, indexed frame for a visual check.
combined_df

Unnamed: 0_level_0,Category,Scientific name,Helps,Helped by,Attracts,-Repels/+distracts,Avoid,Comments,combined_name
Common name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
allium,vegetable,allium,"fruit tree, nightshade,tomato, capsicum pepp...","carrots, tomato, carrots,african spider plan...",thrips,"rabbit, slug,see garlic, aphid, carrot fly,...","bean, pea",allium be a family of plant which include onio...,allium | allium
asparagus,vegetable,asparagus officinalis,"tomato, parsley","aster family flower, dill, coriander, tomat...",couple with basil seem to encourage lady bug,,"onion, garlic, potato, gladiolus",,asparagus officinalis | asparagus
"bean, bush",vegetable,phaseolus vulgaris,"cucumber, soybean, strawberry","celery, strawberry, grain",,,"fennel, soybean, dry bean, alfalfa","lettuce, potato, tomato, other legume, c...","phaseolus vulgaris | bean, bush"
"bean, pole",vegetable,phaseolus vulgaris,,"radish, corn",,,"sunflower, beet, brassicas, kohlrabi",the stalk of the corn provide a pole for the b...,"phaseolus vulgaris | bean, pole"
"bean, fava",vegetable,vicia faba,,"strawberry, celery",,,,see the entry for legume for more info,"vicia faba | bean, fava"
beet,vegetable,beta vulgaris,"broccoli, bush bean, cabbage, lettuce, koh...","bush beans, onion, kohlrabi, catnip, garli...",,,runner or pole bean,good for add mineral to the soil through compo...,beta vulgaris | beet
brassicas,vegetable,brassica,"beet, onion, potato, cereal,e,g, corn, wheat","beet, spinach, chard, aromatic plant or pla...",,wireworm,"mustards, nightshade,tomato, pepper, etc, ...",brassicas be a family of plant which include b...,brassica | brassicas
broccoli,vegetable,brassica oleracea,lettuce,"mixture of mustard, pac choi,rape, beet, di...",,,,broccoli as a main crop intercrop with lettuce...,brassica oleracea | broccoli
brussels sprout,vegetable,brassica oleracea,,"sage, thyme, clover, malt barley",,,,,brassica oleracea | brussels sprout
cabbage,vegetable,brassica oleracea / brassica chinensis,"bean, celery","bean, clover, calendula / pot marigold, cha...","snail,slug",,grape,"see brassicas entry for more info, if use clo...",brassica oleracea / brassica chinensis | cabbage


In [158]:
# Save the cleaned, indexed frame to a pickle file in the working directory.
combined_df.to_pickle('01_base_data.pkl')

In [160]:
# Spot-check a single entry by its common-name index label.
combined_df.loc['cucumber']

Category                                                      vegetable
Scientific name                                         cucumis sativus
Helps                                         bean,  kohlrabi,  lettuce
Helped by             kohlrabi,  nasturtium,  radish,  marigold,  su...
Attracts                                   beneficial for ground beetle
-Repels/+distracts                                        raccoon,  ant
Avoid                                            potato,  aromatic herb
Comments              sow 2 or 3 radish seed in with cucumber to rep...
combined_name                                cucumis sativus | cucumber
Name: cucumber, dtype: object