## House Plant Recommender System 

This notebook covers building a content-based recommender system (step 4). 

A content-based system is used because there is no user information available.

#### Setup

In [None]:
from typing import Union, Tuple

import pandas as pd
import numpy as np
import sqlite3

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Path of the SQLite database file that is opened with sqlite3.connect below.
# NOTE(review): this is an absolute, machine-specific Windows path - update it before running elsewhere.
DATABASE_LOC = r"C:\Users\Rory Crean\Dropbox (lkgroup)\Backup_HardDrive\Postdoc\PyForFun\House_Plant_Recommender\Database\house_plants.db"

In [None]:
# Load the whole plant_features table into a DataFrame, then release the database.
conn = sqlite3.connect(DATABASE_LOC)
try:
    df_features = pd.read_sql_query("SELECT * FROM plant_features", conn)
finally:
    # Closing a cursor does not release the connection; close the connection
    # itself so the database file handle is freed even if the query fails.
    conn.close()
df_features.head()

#### Step 4: Recommender System

In [8]:
def calc_cosine_sim(feature_array: np.ndarray) -> np.ndarray:
    """
    Build the pairwise cosine similarity matrix for a set of feature rows.

    Every feature column is first rescaled with a MinMaxScaler fitted on the
    supplied array, and the similarity is then computed between all pairs of
    rows of the rescaled array.

    Parameters
    ----------
    feature_array: np.ndarray
        Array of features, one row per item to compare.

    Returns
    ----------
    np.ndarray
        Cosine similarity matrix (row i, column j compares item i with item j).
    """
    rescaled = MinMaxScaler().fit_transform(feature_array)
    similarity_matrix = cosine_similarity(rescaled, rescaled)
    return similarity_matrix

In [9]:
# Build the pairwise similarity matrix from every column except the first one.
# NOTE(review): presumably column 0 holds the (non-numeric) plant name - confirm against the table schema.
cosine_sim = calc_cosine_sim(feature_array=df_features.values[:, 1:])
cosine_sim

array([[1.        , 0.73490828, 0.74035088, ..., 0.73894473, 0.55428125,
        0.50024286],
       [0.73490828, 1.        , 0.80863867, ..., 0.82229848, 0.7857269 ,
        0.75056835],
       [0.74035088, 0.80863867, 1.        , ..., 0.70871133, 0.55756408,
        0.68664677],
       ...,
       [0.73894473, 0.82229848, 0.70871133, ..., 1.        , 0.70842529,
        0.81142489],
       [0.55428125, 0.7857269 , 0.55756408, ..., 0.70842529, 1.        ,
        0.68281625],
       [0.50024286, 0.75056835, 0.68664677, ..., 0.81142489, 0.68281625,
        1.        ]])

In [10]:
def _plant_recommend_scores(plant_name: str, cosine_sim: np.ndarray) -> Tuple[int, dict]:
    """
    Determine the recommendation scores for a single plant.
    All scores are returned, not only the top scores. 

    Parameters
    ----------
    plant_name: str
        Name of the plant to search for similar plants. 
        
    cosine_sim: np.ndarray
        Cosine similarity matrix. 

    Returns
    ----------
    int
        Index of the plant that is being searched.

    dict
        Keys are the plant index and values are their score.
    """
    search_idx = df_features.loc[df_features["Plant_Name"] == plant_name].index[0]
   
    similarity_scores = pd.Series(cosine_sim[search_idx]).sort_values(ascending=False)
    idxs = list(similarity_scores.index)
    scores = list(np.round(similarity_scores.values, 4))

    # matches are already ordered from best to worst. 
    matches = {idxs[i]: scores[i] for i in range(len(idxs))}
    return search_idx, matches

In [11]:
def recommend_plant(
    plants_selected: Union[str, list], cosine_sim: np.ndarray, top_n: int = 10
) -> dict:
    """
    Recommend the most similar plants to a single or multiple plants.
    Similarity determined by the cosine_similarity (pre-determined).

    In the case of multiple plants to search against, each plant is weighted
    equally: the per-plant scores are summed. A name listed more than once is
    only counted once.

    Parameters
    ----------
    plants_selected: Union[str, list]
        String (for single plant) or list (for multiple plants) of plant name(s) to make
        recommendations on.

    cosine_sim: np.ndarray
        Cosine similarity matrix.

    top_n: int
        Maximum number of recommendations to return (default 10).

    Returns
    ----------
    dict
        Up to ``top_n`` most similar plants as the keys and values are scores,
        ordered from best to worst. Empty if no plants were selected.

    Raises
    ----------
    ValueError
        If ``top_n`` is negative.

    IndexError
        If a selected plant name cannot be found (raised by the score lookup).
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}.")

    # Treat a bare string as a one-element selection so both cases share one path.
    if isinstance(plants_selected, str):
        plant_names = [plants_selected]
    else:
        # Drop repeated names (keeping first-seen order) so no plant is
        # double-weighted and each searched plant is excluded exactly once.
        plant_names = list(dict.fromkeys(plants_selected))

    # Nothing selected means nothing to compare against.
    if not plant_names:
        return {}

    search_idxs, results = [], []
    for plant in plant_names:
        search_idx, result = _plant_recommend_scores(plant_name=plant, cosine_sim=cosine_sim)
        search_idxs.append(search_idx)
        results.append(result)

    # Sum each candidate's score over every searched plant.
    total_scores = {
        key: round(sum(result[key] for result in results), 4)
        for key in results[0]
    }
    ranked = sorted(total_scores.items(), key=lambda item: item[1], reverse=True)

    # The searched plants themselves must not be recommended back.
    searched = set(search_idxs)
    candidates = [(idx, score) for idx, score in ranked if idx not in searched]

    # Convert from row position and score to Plant Name.
    plant_names_col = df_features["Plant_Name"]
    return {plant_names_col.iloc[idx]: score for idx, score in candidates[:top_n]}

# Multi-plant query: the scores for each selected plant are summed before ranking.
results = recommend_plant(plants_selected=["Ficus elastica", "Ficus lyrata"], cosine_sim=cosine_sim) # alternative query to try: ["Ficus elastica", "Ginkgo biloba"]
results

{'Ficus': 1.8988,
 'Ficus benghalensis': 1.7863,
 'Ficus religiosa': 1.7863,
 'Ficus benjamina': 1.7721,
 'Schefflera actinophylla': 1.7334,
 'Pachira aquatica': 1.6788,
 'Plerandra elegantissima': 1.6782,
 'Radermachera sinica': 1.6743,
 'Dracaena fragrans': 1.6298,
 'Fatsia japonica': 1.6223}

In [13]:
# Single-plant query: a bare string is accepted in place of a list of names.
recommend_plant(plants_selected="Peperomia", cosine_sim=cosine_sim)

{'Peperomia obtusifolia': 1.0,
 'Peperomia caperata': 0.9989,
 'Aglaonema commutatum': 0.9859,
 'Aspidistra elatior': 0.9787,
 'Anthurium andraeanum': 0.9695,
 'Saintpaulia ionantha': 0.9573,
 'Pelargonium (scented-leaved group)': 0.9461,
 'Soleirolia soleirolii': 0.9459,
 'Pelargonium × hortorum': 0.9452,
 'Kalanchoe blossfeldiana': 0.9373}