In [150]:
import re

import numpy as np
import pandas as pd
import spacy

from spacy.lookups import Lookups
from spacy.lemmatizer import Lemmatizer
# Stand-alone rule lemmatizer: its only rule rewrites a trailing "s" on nouns
# to the empty string.
lookups = Lookups()
lookups.add_table(name="lemma_rules", data={"noun": [["s", ""]]})
lemmatizer = Lemmatizer(lookups)

# spaCy pipeline registered under the 'en' shortcut; used for tokenising and
# lemmatising cell text.
nlp = spacy.load('en')

# Let pandas render up to 999 rows before truncating displayed frames.
pd.set_option("display.max_rows", 999)

In [90]:
# Parse the workbook once: passing a list of sheet names makes read_excel
# return a {sheet name: DataFrame} dict instead of re-opening the file for
# every sheet.
category_sheets = pd.read_excel(
    "./companion-plants.xlsx",
    sheet_name=["Vegetables", "Fruits", "Herbs", "Flowers", "Others"],
)

veggies_df = category_sheets["Vegetables"]
fruits_df = category_sheets["Fruits"]
herbs_df = category_sheets["Herbs"]
flowers_df = category_sheets["Flowers"]
others_df = category_sheets["Others"]

# Stack the per-category frames; each frame's original row index is kept.
combined_df = pd.concat([veggies_df, fruits_df, herbs_df, flowers_df, others_df])

In [91]:
def remove_references(x):
    """Replace bracketed numeric reference markers such as ``[12]`` with commas.

    Parameters
    ----------
    x : object
        A single cell value (this function is applied element-wise).

    Returns
    -------
    str
        ``x`` with every ``[<digits>]`` marker (an empty ``[]`` included)
        replaced by ``","``. Non-string values such as NaN, None or numbers
        yield ``""``.
    """
    # Only strings can be cleaned; say so explicitly instead of relying on a
    # bare ``except`` that fell through and returned None.
    if not isinstance(x, str):
        return ""
    return re.sub(r'\[[0-9]*\]', ',', x)
        
def replace_paren(x):
    """Turn every parenthesis into a comma and lower-case the text.

    Parameters
    ----------
    x : object
        A single cell value (this function is applied element-wise).

    Returns
    -------
    str
        ``x`` with each ``(`` and ``)`` replaced by ``","``, lower-cased.
        Non-string values such as NaN, None or numbers yield ``""``.
    """
    # Only strings can be cleaned; say so explicitly instead of relying on a
    # bare ``except`` that fell through and returned None.
    if not isinstance(x, str):
        return ""
    return re.sub(r'(\(|\))', ",", x).lower()

def replace_and(x):
    """Replace the space-delimited word " and " with a comma and lower-case.

    Parameters
    ----------
    x : object
        A single cell value (this function is applied element-wise).

    Returns
    -------
    str
        ``x`` with each occurrence of ``" and "`` (spaces included, matched
        case-sensitively before lower-casing) replaced by ``","``, then
        lower-cased. Non-string values such as NaN, None or numbers yield
        ``""``.
    """
    # Only strings can be cleaned; say so explicitly instead of relying on a
    # bare ``except`` that fell through and returned None.
    if not isinstance(x, str):
        return ""
    return re.sub(r'\ and\ ', ",", x).lower()


# Run the three element-wise cleaners in order, then blank out whatever is
# still missing so every cell ends up holding a string.
combined_df = (
    combined_df
    .applymap(remove_references)
    .applymap(replace_paren)
    .applymap(replace_and)
    .fillna('')
)

In [149]:
def lemmatize(y):
    """Lemmatize each comma-separated fragment of ``y`` with the ``nlp`` pipeline.

    The string is split on commas, every fragment is parsed on its own, and
    the lemmas of its tokens are joined with single spaces. Fragments whose
    lemmatized form is the empty string are dropped; the rest are re-joined
    with commas.
    """
    lemmatized_fragments = [
        ' '.join(tok.lemma_ for tok in nlp(fragment))
        for fragment in y.split(',')
    ]
    kept = filter(lambda fragment: fragment != "", lemmatized_fragments)
    return ','.join(kept)

# NOTE: the repels / attracts data is still messy because a "+" or "-" is
# sometimes prepended to the entries; that is probably easier to clean by hand.
#
# Lemmatize every column except the scientific names. The guard has to match
# the column label exactly -- 'Scientific name' with a lowercase "n" --
# otherwise the Latin names are lemmatized along with everything else.
combined_df = combined_df.apply(
    lambda col: col if col.name == 'Scientific name' else col.apply(lemmatize)
)

KeyboardInterrupt: 

In [92]:
# Add a "<scientific name> | <common name>" label column.
combined_df = combined_df.assign(
    combined_name=lambda frame: frame["Scientific name"] + " | " + frame["Common name"]
)

In [93]:
# Index the frame by common name (the 'combined_name' column would be the
# alternative key).
combined_df = combined_df.set_index(keys='Common name')

In [94]:
# Render the frame as this cell's output for a visual check.
combined_df

Unnamed: 0_level_0,Category,Scientific name,Helps,Helped by,Attracts,-Repels/+distracts,Avoid,Comments,combined_name
Common name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
alliums,vegetables,allium,"fruit trees,, nightshades ,tomatoes,, capsicum...","carrots,, tomatoes,, carrots,african spider pl...",thrips,"-rabbits,, slugs ,see garlic,, -aphids,, carro...","beans, peas,,",alliums are a family of plants which include o...,allium | alliums
asparagus,vegetables,asparagus officinalis,"tomatoes,, parsley,,","aster family flowers, dill, coriander, tomatoe...",coupled with basil seems to encourage lady bugs,,"onion, garlic, potatoes, gladiolus",,asparagus officinalis | asparagus
"beans, bush",vegetables,phaseolus vulgaris,"cucumber,, soybeans, strawberries,","celery,,, strawberries,,, grains,",,,"fennel,, soybeans,, dry beans,, alfalfa,","lettuce, potato, tomato, other legumes, cruci...","phaseolus vulgaris | beans, bush"
"beans, pole",vegetables,phaseolus vulgaris,,"radishes, corn,",,,"sunflowers,, beets,, brassicas,, kohlrabi,",the stalk of the corn provides a pole for the ...,"phaseolus vulgaris | beans, pole"
"beans, fava",vegetables,vicia faba,,"strawberries, celery,",,,,see the entry for legumes for more info,"vicia faba | beans, fava"
beets,vegetables,beta vulgaris,"broccoli,, bush beans,,, cabbage,, lettuce,, k...","bush beans,,, onions,, kohlrabi,,, catnip, gar...",,,"runner or pole beans,",good for adding minerals to the soil through c...,beta vulgaris | beets
brassicas,vegetables,brassica,"beets,, onions,, potatoes, cereals ,e,g, corn,...","beets,,, spinach,, chard,, aromatic plants or ...",,wireworms,"mustards, nightshades ,tomatoes,,, peppers, et...",brassicas are a family of plants which include...,brassica | brassicas
broccoli,vegetables,brassica oleracea,"lettuce,","mixture of mustard, pac choi,,rape, beets, dil...",,,,broccoli as a main crop intercropped with lett...,brassica oleracea | broccoli
brussels sprouts,vegetables,brassica oleracea,,"sage,, thyme,, clover,, malting barley",,,,,brassica oleracea | brussels sprouts
cabbage,vegetables,brassica oleracea / brassica chinensis,"beans,, celery,","beans,, clover,,, calendula/pot marigold,, cha...","snails,slugs,",,"grapes,","see brassicas entry for more info, if using cl...",brassica oleracea / brassica chinensis | cabbage


In [158]:
# Save the frame as a pickle file, at a path relative to the working directory.
combined_df.to_pickle('01_base_data.pkl')