## House Plant Recommender System 

This notebook covers building a content-based recommender system (step 4). 

A content-based system is used because there is no user information available.

**Summary of This Notebook:**

- Recommender system is built and can be used with a single or multiple plant selections.
- Saving the cosine similarity matrix to SQL and reloading it was tested and works, so the calculation does not need to be re-run each time. 

#### Setup

In [1]:
from typing import Union, Tuple

import pandas as pd
import numpy as np
import sqlite3
import json

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Absolute path to the SQLite database opened by the sqlite3.connect calls in this notebook.
# NOTE(review): this is a machine-specific Windows path - change it before running elsewhere.
DATABASE_LOC = r"C:\Users\Rory Crean\Dropbox (lkgroup)\Backup_HardDrive\Postdoc\PyForFun\House_Plant_Recommender\Database\house_plants.db"

In [2]:
# Load the plant feature table. pd.read_sql_query works directly on the
# connection, so no cursor is needed; the connection itself is closed once
# the query has finished (even if it fails) so the database file is not
# left open by the kernel.
conn = sqlite3.connect(DATABASE_LOC)
try:
    df_features = pd.read_sql_query("SELECT * FROM plant_features", conn)
finally:
    conn.close()
df_features.head()

Unnamed: 0,Plant_Name,Min_Temp_Degrees_C,Min_Height,Max_Height_Capped,Min_Spread,Max_Spread_Capped,Sunlight_Ordinal,Watering_Ordinal,Maintenance_Ordinal,Flowers_Ordinal,Type_Bulb,Type_Fern,Type_Herbaceous_perennial,Type_Other,Type_Vine,Color_Not_Colorful,Fruit_Yes
0,Aechmea,-1.1,1.0,3.0,1.0,2.0,1,3,2,3,0,0,0,1,0,1,0
1,Ardisia crenata,-12.2,4.0,5.0,4.0,5.0,2,3,2,3,0,0,0,0,0,1,1
2,Euphorbia milii,-6.7,3.0,6.0,1.5,3.0,4,2,2,3,0,0,0,0,0,1,0
3,Ficus elastica,-1.1,50.0,20.0,50.0,20.0,1,3,1,2,0,0,0,0,0,1,0
4,Woodsia obtusa,-34.4,1.0,1.5,2.0,2.5,2,3,2,1,0,1,0,0,0,1,0


#### Step 4: Recommender System

In [3]:
def calc_cosine_sim(feature_array: np.ndarray) -> np.ndarray:
    """
    Build the cosine similarity matrix of a feature array against itself.
    The features are passed through a freshly fitted MinMaxScaler first.

    Parameters
    ----------
    feature_array: np.ndarray
        Feature values to scale and compare.

    Returns
    ----------
    np.ndarray
        Cosine similarity matrix of the scaled features.
    """
    scaled = MinMaxScaler().fit_transform(feature_array)
    similarity_matrix = cosine_similarity(scaled, scaled)
    return similarity_matrix

In [4]:
# Skip the first column (the plant name, per the preview above) so that only
# the feature columns are fed into the similarity calculation.
cosine_sim = calc_cosine_sim(df_features.values[:, 1:])
cosine_sim

array([[1.        , 0.73493514, 0.74033414, ..., 0.74067915, 0.55439626,
        0.53306672],
       [0.73493514, 1.        , 0.8085923 , ..., 0.82325575, 0.78572096,
        0.79582047],
       [0.74033414, 0.8085923 , 1.        , ..., 0.7097595 , 0.55735369,
        0.72998216],
       ...,
       [0.74067915, 0.82325575, 0.7097595 , ..., 1.        , 0.70831206,
        0.84166606],
       [0.55439626, 0.78572096, 0.55735369, ..., 0.70831206, 1.        ,
        0.71846984],
       [0.53306672, 0.79582047, 0.72998216, ..., 0.84166606, 0.71846984,
        1.        ]])

In [5]:
def _plant_recommend_scores(plant_name: str, cosine_sim: np.ndarray) -> Tuple[int, dict]:
    """
    Determine the recommendation scores for a single plant.
    All scores are returned, not only the top scores.

    The plant is looked up in the module-level ``df_features`` frame, whose
    row order is expected to match the row/column order of ``cosine_sim``.

    Parameters
    ----------
    plant_name: str
        Name of the plant to search for similar plants.
        Must match an entry in the "Plant_Name" column exactly.

    cosine_sim: np.ndarray
        Cosine similarity matrix.

    Returns
    ----------
    int
        Row position of the plant that is being searched.

    dict
        Keys are plant row positions and values are their score (rounded to
        4 decimal places), ordered from the best to the worst match. The
        searched plant itself is included.

    Raises
    ----------
    IndexError
        If ``plant_name`` is not present in ``df_features["Plant_Name"]``.
    """
    # Work with row *positions* rather than index labels: the result is used
    # to index the numpy matrix (and with .iloc by the caller), which is only
    # correct for labels when the frame has a default RangeIndex.
    name_matches = np.flatnonzero((df_features["Plant_Name"] == plant_name).values)
    if name_matches.size == 0:
        # Same exception type as the original failing `.index[0]` lookup,
        # but with a message that says what actually went wrong.
        raise IndexError(f"Plant '{plant_name}' was not found in df_features['Plant_Name'].")
    search_idx = int(name_matches[0])  # first match wins if the name is duplicated

    similarity_scores = pd.Series(cosine_sim[search_idx]).sort_values(ascending=False)
    idxs = list(similarity_scores.index)
    scores = list(np.round(similarity_scores.values, 4))

    # dicts preserve insertion order, so matches stay ordered from best to worst.
    matches = dict(zip(idxs, scores))
    return search_idx, matches

In [6]:
def recommend_plant(plants_selected: Union[str, list], cosine_sim: np.ndarray, top_n: int = 10) -> list:
    """
    Recommend the most similar plants to a single or multiple plants.
    Similarity determined by the cosine_similarity (pre-determined).

    In the case of multiple plants to search against, each plant is weighted
    equally: the per-plant scores are summed. A plant that is listed more than
    once is only counted once.

    Parameters
    ----------
    plants_selected: Union[str, list]
        String (for single plant) or list (for multiple plants) of plant name(s) to make
        recommendations on.

    cosine_sim: np.ndarray
        Cosine similarity matrix.

    top_n: int, default 10
        Maximum number of recommendations to return.

    Returns
    ----------
    list
        Names of up to ``top_n`` most similar plants, ordered from the highest
        to the lowest (summed) score. The selected plants themselves are never
        part of the result. Empty if nothing was selected or ``top_n`` <= 0.
    """
    # A bare string is a selection of one; wrapping it lets both cases share
    # a single code path (summing one score set leaves the scores unchanged).
    if isinstance(plants_selected, str):
        plants_selected = [plants_selected]

    # Drop repeated names while keeping the order in which they were given.
    # A repeated name previously raised a KeyError when it was removed from
    # the results for the second time.
    unique_plants = list(dict.fromkeys(plants_selected))
    if not unique_plants or top_n <= 0:
        return []

    search_idxs, per_plant_scores = [], []
    for plant in unique_plants:
        search_idx, scores = _plant_recommend_scores(plant_name=plant, cosine_sim=cosine_sim)
        search_idxs.append(search_idx)
        per_plant_scores.append(scores)

    # Sum each candidate's score over every plant searched. Each score dict
    # has the same set of keys, so the first one is used to enumerate them.
    total_scores = {
        candidate_idx: round(sum(scores[candidate_idx] for scores in per_plant_scores), 4)
        for candidate_idx in per_plant_scores[0]
    }

    # sorted() is stable, so tied candidates keep the order of the first search.
    ranked = sorted(total_scores.items(), key=lambda item: item[1], reverse=True)

    # Convert from index and score to Plant Name, skipping the plants that
    # were searched for, until enough recommendations are collected.
    searched = set(search_idxs)
    top_plants = []
    for candidate_idx, _score in ranked:
        if candidate_idx in searched:
            continue
        top_plants.append(df_features.iloc[candidate_idx]["Plant_Name"])
        if len(top_plants) == top_n:
            break

    return top_plants

In [7]:
# Example queries: one plant passed as a string, and two plants passed as a list.
results_1plant = recommend_plant("Peperomia", cosine_sim)
results_2plants = recommend_plant(["Ficus elastica", "Ginkgo biloba"], cosine_sim)

In [8]:
# Recommendations returned for the single-plant ("Peperomia") query.
results_1plant

['Peperomia obtusifolia',
 'Peperomia caperata',
 'Aglaonema commutatum',
 'Aspidistra elatior',
 'Anthurium andraeanum',
 'Saintpaulia ionantha',
 'Soleirolia soleirolii',
 'Pelargonium scented-leaves',
 'Pelargonium × hortorum',
 'Kalanchoe blossfeldiana']

In [9]:
# Recommendations returned for the combined "Ficus elastica" + "Ginkgo biloba" query.
results_2plants

['Radermachera sinica',
 'Dypsis lutescens',
 'Rhapis excelsa',
 'Ficus religiosa',
 'Ficus benghalensis',
 'Pinus pinea',
 'Araucaria heterophylla',
 'Ficus benjamina',
 'Euphorbia tirucalli',
 'Schefflera actinophylla']

#### Save the cosine similarity matrix to the database and check I can load it back in and use it in the function.

This avoids having to install sklearn on Heroku/PythonAnywhere, where disk space is normally a concern.

In [10]:
# Check what kind of object the matrix is before serialising it
# (the save step below calls .tolist() on it).
type(cosine_sim)

numpy.ndarray

In [11]:

# Store the matrix as a JSON string in a one-row table, then read it straight
# back to confirm the round trip works.
conn = sqlite3.connect(DATABASE_LOC)
try:
    # `with conn` commits the writes on success and rolls them back if any
    # statement fails; it does not close the connection, hence the finally.
    with conn:
        conn.execute("DROP TABLE IF EXISTS cosine_sim")
        conn.execute("CREATE TABLE cosine_sim (id INTEGER PRIMARY KEY, array BLOB)")
        conn.execute("INSERT INTO cosine_sim VALUES (?,?)", (None, json.dumps(cosine_sim.tolist())))
    data = conn.execute("SELECT * FROM cosine_sim").fetchall()
finally:
    conn.close()

# Row 0, column 1 holds the JSON text written above.
cosine_sim_sql = np.asarray(json.loads(data[0][1]))


In [12]:
# Element-wise comparison: True wherever the matrix reloaded from SQL equals the in-memory one.
cosine_sim_sql == cosine_sim

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [13]:
# Repeat both example queries using the matrix that was reloaded from SQL.
results_1plant_sql = recommend_plant("Peperomia", cosine_sim_sql)
results_2plants_sql = recommend_plant(["Ficus elastica", "Ginkgo biloba"], cosine_sim_sql)

# Shown in order: the two equality checks against the earlier results,
# followed by the recommendation lists themselves.
display(
    results_1plant_sql == results_1plant,
    results_2plants_sql == results_2plants,
    results_1plant_sql,
    results_2plants_sql,
)

True

True

['Peperomia obtusifolia',
 'Peperomia caperata',
 'Aglaonema commutatum',
 'Aspidistra elatior',
 'Anthurium andraeanum',
 'Saintpaulia ionantha',
 'Soleirolia soleirolii',
 'Pelargonium scented-leaves',
 'Pelargonium × hortorum',
 'Kalanchoe blossfeldiana']

['Radermachera sinica',
 'Dypsis lutescens',
 'Rhapis excelsa',
 'Ficus religiosa',
 'Ficus benghalensis',
 'Pinus pinea',
 'Araucaria heterophylla',
 'Ficus benjamina',
 'Euphorbia tirucalli',
 'Schefflera actinophylla']