## AI model using cleaned RESOURCES dataset

In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

class HybridResourceRecommender:
    def __init__(self, df):
        """
        Initialize the hybrid recommender system
        
        Args:
        df (pandas.DataFrame): DataFrame containing resource information
        """
        self.df = df
        self.prepare_data()
    
    def prepare_data(self):
        """
        Prepare data for recommendation
        """
        # Preprocess text features for content-based filtering
        # Combine relevant text columns
        self.df['content_features'] = self.df.apply(
            lambda row: f"{row['Resource Name']} {row['Description']} {row['Keywords']} "
            f"{row['Category 1']} {row['Category 2']} {row['Category 3']} "
            f"{row['SIC Description']} {row['Supply Type']}", 
            axis=1
        )
        
        # Create TF-IDF vectorizer for content-based similarity
        self.tfidf = TfidfVectorizer(stop_words='english')
        self.content_matrix = self.tfidf.fit_transform(self.df['content_features'])
        
        # Prepare numerical features for content-based filtering
        numerical_features = [
            'Total Quantity', 
            'Available Quantity', 
            'Latitude', 
            'Longitude'
        ]
        
        # Scale numerical features
        scaler = MinMaxScaler()
        self.numerical_features_scaled = scaler.fit_transform(
            self.df[numerical_features].fillna(0)
        )
    
    def content_based_recommendation(self, resource_id, top_n=5):
        """
        Generate content-based recommendations
        
        Args:
        resource_id (int): Index of the resource to find similar resources for
        top_n (int): Number of recommendations to return
        
        Returns:
        list: Top N similar resources
        """
        # Compute content similarity using TF-IDF and numerical features
        content_sim = cosine_similarity(
            self.content_matrix[resource_id], 
            self.content_matrix
        )[0]

        print(content_sim)
        
        # Compute numerical feature similarity
        numerical_sim = cosine_similarity(
            self.numerical_features_scaled[resource_id].reshape(1, -1), 
            self.numerical_features_scaled
        )[0]
        
        # Combine similarities (you can adjust weights)
        combined_sim = 0.7 * content_sim + 0.3 * numerical_sim
        
        # Remove the resource itself and get top N similar resources
        similar_indices = combined_sim.argsort()[::-1][1:top_n+1]
        
        return self.df.iloc[similar_indices]
    
    def collaborative_filtering_recommendation(self, resource_type='Want', top_n=5):
        """
        Generate collaborative filtering recommendations
        
        Args:
        resource_type (str): Type of resource to recommend ('Want' or 'Have')
        top_n (int): Number of recommendations to return
        
        Returns:
        pandas.DataFrame: Top N recommended resources
        """
        # Group resources by similar characteristics
        grouped = self.df.groupby([
            'Resource Type', 
            'Category 1', 
            'Category 2', 
            'Region'
        ]).size().reset_index(name='group_count')
        
        # Find most similar groups for the given resource type
        similar_groups = grouped[
            (grouped['Resource Type'] == resource_type)
        ].sort_values('group_count', ascending=False).head(top_n)
        
        # Recommend resources from these groups
        recommendations = self.df.merge(
            similar_groups, 
            on=['Resource Type', 'Category 1', 'Category 2', 'Region']
        ).drop_duplicates()
        
        return recommendations.head(top_n)
    
    
    def hybrid_recommendation(self, resource_id=None, resource_type=None, top_n=5):
        """
        Generate hybrid recommendations
        
        Args:
        resource_id (int, optional): Index of the resource to find similar resources for
        resource_type (str, optional): Type of resource to recommend
        top_n (int): Number of recommendations to return
        
        Returns:
        pandas.DataFrame: Hybrid recommendations
        """
        # If a specific resource is provided, use content-based recommendations
        if resource_id is not None:
            content_recs = self.content_based_recommendation(resource_id, top_n)
        
        # If a resource type is provided, use collaborative filtering
        elif resource_type is not None:
            content_recs = self.collaborative_filtering_recommendation(resource_type, top_n)
        
        # If neither is provided, return most common resources
        else:
            content_recs = self.df.groupby('Resource Type').size().reset_index(name='count')
            content_recs = content_recs.sort_values('count', ascending=False)
            content_recs = self.df[self.df['Resource Type'].isin(content_recs['Resource Type'].head(top_n))]
        
        return content_recs
    
    def recall_at_n(self, relevant_items, recommended_items, n):
        """
        Calculate Recall@N
        
        Args:
        relevant_items (list): List of ground truth relevant item IDs
        recommended_items (list): List of recommended item IDs
        n (int): Number of top recommendations to consider

        Returns:
        float: Recall@N score
        """
        # Take the top N recommended items
        top_n_recommended = recommended_items[:n]
        
        # Count relevant items in the top N recommendations
        relevant_and_recommended = len(set(top_n_recommended) & set(relevant_items))
        
        # Recall@N: Proportion of relevant items retrieved
        recall = relevant_and_recommended / len(relevant_items) if relevant_items else 0
        
        return recall
    
    def recommend_for_multiple_rows(self, row_indices, top_n=20):
        """
        Generate recommendations for multiple rows in the dataset
        
        Args:
        row_indices (list): List of row indices to generate recommendations for
        top_n (int): Number of recommendations for each row
        
        Returns:
        dict: A dictionary with row indices as keys and recommendation DataFrames as values
        """
        recommendations = {}
        for idx in row_indices:
            recommendations[idx] = self.content_based_recommendation(idx, top_n)
        return recommendations

df = pd.read_excel('RESOURCES_CLEANED.xlsx')
recommender = HybridResourceRecommender(df)


In [21]:
relevant_items = [1, 1403, 225]  # Ground truth relevant item IDs
recommended_items = [1464, 1403, 1468, 225, 226, 754, 753, 1404, 1407, 1235, 1410, 1409, 880, 1383, 749, 1390, 1388, 76]  # Top 18 recommended item IDs

# Calculate Recall@N with N=20
def recall_at_n(relevant_items, recommended_items, n):
    recommended_at_n = recommended_items[:n]
    relevant_retrieved = set(recommended_at_n).intersection(set(relevant_items))
    recall = len(relevant_retrieved) / len(relevant_items) if relevant_items else 0
    return recall

recall_score = recall_at_n(relevant_items, recommended_items, n=18)
print(f"Recall@20: {recall_score * 100:.2f}%")

# Example row indices to recommend
row_indices_to_recommend = [0, 1, 2] 

# Generate recommendations for the selected rows
recommendations = recommender.recommend_for_multiple_rows(row_indices_to_recommend, top_n=20)

# Print recommendations
for idx, recs in recommendations.items():
    print(f"Recommendations for Row {idx}:")
    print(recs[['Resource Name', 'Description', 'Category 1', 'Category 2']])
    print("\n")


Recall@20: 66.67%
[1.         0.96175489 0.01104741 ... 0.00920052 0.0076407  0.00173889]
[0.96175489 1.         0.00213811 ... 0.0093621  0.00777489 0.00176943]
[0.01104741 0.00213811 1.         ... 0.0103447  0.00559023 0.01142547]
Recommendations for Row 0:
                                       Resource Name  \
1                 1x vrije vrachtwagen met chauffeur   
1464                                     Vrachtwagen   
1403                               transport pallets   
1468                           Vrachtwagen met kraan   
225                           Chauffeur vrachtwagens   
226                           Chauffeur vrachtwagens   
754                       Laadcapaciteit vrachtwagen   
753                       Laadcapaciteit vrachtwagen   
1404                               transport pallets   
1407                         Transport palletvervoer   
1235                                    retourvracht   
1410                              Transport Tsjechië   
1409       