# Risk Inference 
## Hybrid Content-Based Filtering

To predict invasiveness, we use a vector space model:
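* **Similarity Metric:** We use Cosine Similarity between a plant's scaled trait vector $\mathbf{A}$ and the target region's profile vector $\mathbf{B}$:
$$\text{similarity} = \frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \|\mathbf{B}\|}$$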

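* **Aggregation:** Final Score = $(\text{Environmental} \times 0.8) + (\text{Aggression} \times 0.2)$, where Environmental is the cosine similarity above and Aggression is 1 for plants with a rapid growth rate and 0 otherwise. The result is clipped to the range $[0, 1]$.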

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Target region: San Diego.
# Growth envelope of a hypothetical "ideal invasive" for the local climate.
# The keys double as the plant-table column names used for the comparison.
target_region_profile = dict(
    # Precipitation tolerance in mm (San Diego is dry).
    growth_minimum_precipitation_mm=250,
    growth_maximum_precipitation_mm=400,
    # Temperature tolerance in deg C (San Diego is warm).
    growth_minimum_temperature_deg_c=10,
    growth_maximum_temperature_deg_c=30,
    # pH tolerance.
    growth_ph_minimum=6.0,
    growth_ph_maximum=8.5,
)

In [2]:
def calculate_invasive_risk(plant_df, target_profile, climate_weight=0.8, growth_weight=0.2):
    """Score each plant's invasive risk for a target region on a 0-1 scale.

    The score combines two parts:

    * the cosine similarity between the plant's min-max scaled trait vector
      and the scaled target profile, multiplied by ``climate_weight``;
    * a flat bonus of ``growth_weight`` for plants whose
      ``specifications_growth_rate`` is ``'rapid'`` (case-insensitive).

    Parameters
    ----------
    plant_df : pandas.DataFrame
        One row per plant. Must contain every key of ``target_profile`` as a
        numeric column, plus a ``specifications_growth_rate`` column.
    target_profile : dict
        Maps trait column name -> desired numeric value for the region.
    climate_weight : float, default 0.8
        Multiplier applied to the cosine similarity.
    growth_weight : float, default 0.2
        Bonus added for rapid growers.

    Returns
    -------
    numpy.ndarray
        One score per row of ``plant_df``, clipped to ``[0, 1]``.

    Raises
    ------
    KeyError
        If a required column is missing from ``plant_df``.
    """
    cols = list(target_profile.keys())
    plant_vectors = plant_df[cols]

    # Nothing to score; the scaler and similarity steps reject empty input.
    if len(plant_vectors) == 0:
        return np.empty(0, dtype=float)

    # Keep the target as a one-row DataFrame with the same column labels as
    # the plants, so the scaler is fitted and applied with consistent feature
    # names (transforming a bare ndarray after fitting on a DataFrame makes
    # scikit-learn warn about missing feature names).
    target_frame = pd.DataFrame([list(target_profile.values())], columns=cols, dtype=float)

    # Impute gaps with the column median. A trait with no data at all has a
    # NaN median, so fall back to the target value: that column then becomes
    # constant, scales to 0 and is neutral in the similarity.
    plant_vectors = plant_vectors.fillna(plant_vectors.median())
    plant_vectors = plant_vectors.fillna(target_frame.iloc[0])

    # Fit on plants + target together so the target lies inside the scaled range.
    scaler = MinMaxScaler()
    scaler.fit(pd.concat([plant_vectors, target_frame]))

    scaled_plants = scaler.transform(plant_vectors)
    scaled_target = scaler.transform(target_frame)

    climate_scores = cosine_similarity(scaled_plants, scaled_target).ravel()

    # Vectorised, case-insensitive check for rapid growers.
    is_rapid = (
        plant_df['specifications_growth_rate']
        .astype(str)
        .str.lower()
        .eq('rapid')
        .to_numpy(dtype=bool)
    )
    growth_boost = np.where(is_rapid, growth_weight, 0.0)

    final_risk = (climate_scores * climate_weight) + growth_boost

    return np.clip(final_risk, 0, 1)  # Ensure the score stays within [0, 1]

In [3]:
# Small hand-built sample (one tuple per plant) used to exercise the scorer.
mock_plants = pd.DataFrame(
    [
        ('Ice Plant', 200, 500, 8, 35, 7.0, 8.5, 'Rapid'),
        ('California Poppy', 300, 600, 5, 30, 6.5, 8.0, 'Moderate'),
        ('Tropical Fern', 2000, 3000, 20, 40, 5.0, 6.0, 'Slow'),
    ],
    columns=[
        'common_name',
        'growth_minimum_precipitation_mm',
        'growth_maximum_precipitation_mm',
        'growth_minimum_temperature_deg_c',
        'growth_maximum_temperature_deg_c',
        'growth_ph_minimum',
        'growth_ph_maximum',
        'specifications_growth_rate',
    ],
)

# Attach the score for the target region as a new column.
mock_plants['risk_score'] = calculate_invasive_risk(
    plant_df=mock_plants,
    target_profile=target_region_profile,
)

# Highest risk first.
print(
    mock_plants
    .sort_values('risk_score', ascending=False)
    .loc[:, ['common_name', 'risk_score']]
)

        common_name  risk_score
0         Ice Plant    0.909478
1  California Poppy    0.732767
2     Tropical Fern    0.123774




In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the species feature table from disk. The code below expects it to
# contain 'scientific_name' and 'is_invasive' columns alongside the features.
df = pd.read_csv('vectorized_species_master.csv')

# Sparse query vector: only the features listed here are non-zero when the
# profile is compared against the table.
# NOTE(review): values are presumably on the same scale as the CSV columns,
# and habit_* look like one-hot indicator columns -- confirm against the
# vectorization step.
target_profile = dict(
    native_region_count=1.0,
    growth_ph_minimum=0.5,
    growth_ph_maximum=0.6,
    habit_Shrub=1,
    habit_Graminoid=1,
)

def calculate_invasive_risk_optimized(
    plant_df,
    profile,
    exclude_cols=('scientific_name', 'is_invasive', 'risk_score'),
):
    """Cosine similarity between every row of ``plant_df`` and a sparse profile.

    Every column of ``plant_df`` that is not listed in ``exclude_cols`` is
    treated as a feature. The target vector is zero everywhere except for the
    features named in ``profile``.

    Parameters
    ----------
    plant_df : pandas.DataFrame
        One row per species; all non-excluded columns must be numeric.
    profile : dict
        Maps feature column name -> target value. Keys that are not feature
        columns of ``plant_df`` are ignored.
    exclude_cols : iterable of str, optional
        Columns that are identifiers, labels or outputs rather than features.
        ``'risk_score'`` is excluded by default so that re-running the scoring
        cell after the scores were written back to the same frame does not
        feed the previous scores in as a feature (which would change every
        row norm and therefore every score).

    Returns
    -------
    numpy.ndarray
        One similarity score per row of ``plant_df``.
    """
    excluded = set(exclude_cols)
    feature_cols = [c for c in plant_df.columns if c not in excluded]

    # Name -> position lookup; setdefault keeps the first occurrence, matching
    # list.index() semantics should a column label ever be duplicated.
    column_position = {}
    for position, name in enumerate(feature_cols):
        column_position.setdefault(name, position)

    target_vec = np.zeros((1, len(feature_cols)))
    for feature, value in profile.items():
        position = column_position.get(feature)
        if position is not None:
            target_vec[0, position] = value

    return cosine_similarity(plant_df[feature_cols], target_vec).flatten()

# Score every row of the species table against the target profile.
df['risk_score'] = calculate_invasive_risk_optimized(plant_df=df, profile=target_profile)

# Ten highest-scoring species, shown with their is_invasive value.
print(
    df
    .sort_values(by='risk_score', ascending=False)
    .loc[:, ['scientific_name', 'is_invasive', 'risk_score']]
    .head(10)
)

            scientific_name  is_invasive  risk_score
371  Echinochloa crus-galli            1    0.727056
231         Juncus bufonius            0    0.712845
48            Poa pratensis            1    0.693576
14           Juncus effusus            0    0.689689
66     Phalaris arundinacea            1    0.679438
9             Festuca rubra            0    0.676985
39    Deschampsia cespitosa            0    0.676187
136      Juncus articulatus            0    0.668961
266    Eleocharis palustris            0    0.667329
195           Poa nemoralis            0    0.662515
