# Load the libraries 

In [1]:
import numpy as np 
import pandas as pd 
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Clean the users data

In [2]:
users_database = pd.read_csv('/kaggle/input/recommendation-system-dataset/users.tsv', sep='\t')
users_database.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               45 non-null     int64  
 1   first_name       45 non-null     object 
 2   last_name        45 non-null     object 
 3   email            45 non-null     object 
 4   role_id          45 non-null     int64  
 5   organisation_id  44 non-null     float64
 6   picture_name     39 non-null     object 
 7   position         44 non-null     object 
 8   gender           45 non-null     object 
 9   city             0 non-null      float64
 10  country          0 non-null      float64
 11  state            0 non-null      float64
 12  created          45 non-null     int64  
 13  phone_number     0 non-null      float64
 14  linkedin_url     0 non-null      float64
 15  description      0 non-null      float64
dtypes: float64(7), int64(3), object(6)
memory usage: 5.8+ KB


In [3]:
# Drop the ['city', 'country', 'state', 'phone_number', 'linkedin_url', 'description'] columns as it's no longer needed (Empty)
users_database = users_database.drop(columns=['city', 'country', 'state', 'phone_number', 'linkedin_url', 'description'])

users_database.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               45 non-null     int64  
 1   first_name       45 non-null     object 
 2   last_name        45 non-null     object 
 3   email            45 non-null     object 
 4   role_id          45 non-null     int64  
 5   organisation_id  44 non-null     float64
 6   picture_name     39 non-null     object 
 7   position         44 non-null     object 
 8   gender           45 non-null     object 
 9   created          45 non-null     int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 3.6+ KB


# Clean the content data

In [4]:
content_database = pd.read_csv('/kaggle/input/recommendation-system-dataset/content.tsv', sep='\t')
content_database.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                141 non-null    float64
 1   title             142 non-null    object 
 2    description      141 non-null    object 
 3   content_type      141 non-null    object 
 4   organisation_id   1 non-null      float64
 5   url               141 non-null    object 
 6   creator_id        7 non-null      float64
 7   created           140 non-null    float64
 8   publication_date  133 non-null    object 
dtypes: float64(4), object(5)
memory usage: 10.1+ KB


In [5]:
# Rename the column to remove leading whitespace (description column has leading whitespace)
content_database = content_database.rename(columns=lambda x: x.strip())

# Drop the 'organisation_id' and 'creator_id' columns as it's no longer needed (Empty)
# 'content_type' same value for all not important
content_database = content_database.drop(columns=['organisation_id', 'creator_id', 'content_type'])

# Remove row that doesn't has id
content_database = content_database.dropna(subset=['id'])

# Convert the id type to integer
content_database['id'] = content_database['id'].astype(int)

content_database.info()

<class 'pandas.core.frame.DataFrame'>
Index: 141 entries, 0 to 141
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                141 non-null    int64  
 1   title             141 non-null    object 
 2   description       141 non-null    object 
 3   url               140 non-null    object 
 4   created           140 non-null    float64
 5   publication_date  133 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 7.7+ KB


In [6]:
content_database.head()

Unnamed: 0,id,title,description,url,created,publication_date
0,1,Justice Department Researcher Questions 0.3% T...,The 0.3 percent THC threshold in the federal d...,https://www.marijuanamoment.net/justice-depart...,1714286000.0,
1,2,Cannabis operator Canopy raising $35 million i...,Canadian cannabis company Canopy Growth Corp. ...,https://mjbizdaily.com/cannabis-operator-canop...,1714286000.0,
2,3,Global Cannabis Legalization: A Comprehensive ...,This article provides an overview of the evolv...,https://thecannabiswatcher.com/pages/articles/...,1714286000.0,
3,4,Top Biden Health Official In Touch With DEA Ab...,The U.S. Department of Health and Human Servic...,https://www.marijuanamoment.net/top-biden-heal...,1714286000.0,
4,5,Regulated marijuana offers new chance at equit...,"Martin Luther King III, co-founder of the Drum...",https://mjbizdaily.com/marijuana-social-equity...,1714286000.0,


# Clean the events data

In [7]:
events_database = pd.read_csv('/kaggle/input/recommendation-system-dataset/events.tsv', sep='\t')
events_database.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               266 non-null    int64  
 1   title            266 non-null    object 
 2   event_type       266 non-null    object 
 3   description      262 non-null    object 
 4   start            259 non-null    object 
 5   end              259 non-null    object 
 6   location         266 non-null    object 
 7   url              265 non-null    object 
 8   price            183 non-null    object 
 9   organisation_id  1 non-null      float64
 10  created          266 non-null    float64
 11  last_modified    187 non-null    float64
 12  lon              185 non-null    float64
 13  lat              185 non-null    float64
dtypes: float64(5), int64(1), object(8)
memory usage: 29.2+ KB


In [8]:
# Drop the 'organisation_id' column as it's no longer needed (Empty)
events_database = events_database.drop(columns=['organisation_id'])

# Remove row that doesn't has id
events_database = events_database.dropna(subset=['id'])

# Function to convert "From $number" to number
def handle_price_format(value):
    if value is np.nan:
        return np.nan
    elif value == 'Free':
        return 0

    return value.replace('From $', '')

# Apply the conversion function to the 'price' column
events_database['price'] = events_database['price'].apply(handle_price_format)

# Apply the conversion function to the 'description' column, if not exist then put empty string
events_database['description'] = events_database['description'].apply(lambda x: '' if x is np.nan else x)

events_database.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             266 non-null    int64  
 1   title          266 non-null    object 
 2   event_type     266 non-null    object 
 3   description    266 non-null    object 
 4   start          259 non-null    object 
 5   end            259 non-null    object 
 6   location       266 non-null    object 
 7   url            265 non-null    object 
 8   price          183 non-null    object 
 9   created        266 non-null    float64
 10  last_modified  187 non-null    float64
 11  lon            185 non-null    float64
 12  lat            185 non-null    float64
dtypes: float64(4), int64(1), object(8)
memory usage: 27.1+ KB


In [9]:
events_database.head()

Unnamed: 0,id,title,event_type,description,start,end,location,url,price,created,last_modified,lon,lat
0,1,2024 Cannabis Research Conference,conferences,The 8th annual Cannabis Research Conference (C...,1723017600,1723219200,"Fort Collins, CO",https://www.eventbrite.com/e/2024-cannabis-res...,,1711441000.0,1714920000.0,-105.077011,40.587178
1,2,2024 HBS Cannabis Business Conference,conferences,Join us for a scholarly discussion of the cann...,1713639600,1713715200,"Boston, MA",https://www.eventbrite.com.au/e/2024-hbs-canna...,25.0,1711441000.0,1714920000.0,-71.060511,42.355433
2,3,"Ohio Cannabis Session, March 30, 2024",conferences,Training Coursework:,1711792800,1711803600,"Toledo, OH",https://www.eventbrite.com/e/ohio-cannabis-ses...,0.0,1711441000.0,1714920000.0,-83.537817,41.652914
3,4,3rd Annual Higher Learning: Cannabis Conferenc...,conferences,,1713610800,1713625200,"New York, NY",https://www.eventbrite.com/e/3rd-annual-higher...,,1711441000.0,1714920000.0,-74.006015,40.712728
4,5,Accelerate Cannabis: The Mid-Atlantic Mixer,conferences,Hard Rock Hotel & Casino Atlantic City,1704299400,1704299400,"Atlantic City, NJ",https://www.eventbrite.com/e/accelerate-cannab...,,1711441000.0,1714920000.0,-74.422935,39.364285


# Recommendation System Preprocessing

In [10]:
# Read the recommendations file
recommendations_file_path = '/kaggle/input/recommendation-system-dataset/recommendations.tsv'
df = pd.read_csv(recommendations_file_path, sep='\t')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658 entries, 0 to 657
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               658 non-null    int64  
 1   title            658 non-null    object 
 2   asset_type       658 non-null    object 
 3   asset_id         658 non-null    int64  
 4   organisation_id  658 non-null    int64  
 5   user_id          658 non-null    int64  
 6   creator_id       658 non-null    int64  
 7   system_score     658 non-null    float64
 8   user_score       0 non-null      float64
 9   created          658 non-null    int64  
 10  opened           625 non-null    float64
 11  deleted          403 non-null    float64
 12  saved            338 non-null    float64
 13  clicked_out      414 non-null    float64
dtypes: float64(6), int64(6), object(2)
memory usage: 72.1+ KB


**just require these fields ...**

In [12]:
# Assuming df is your DataFrame after loading your data
# Filter the required columns
filtered_df = df[['asset_type', 'asset_id', 'user_id', 'system_score']]

# Divide the dataset into two groups based on 'asset_type'
content_df = filtered_df[filtered_df['asset_type'] == 'content']
event_df = filtered_df[filtered_df['asset_type'] == 'event']

# Drop the 'asset_type' column as it's no longer needed
content_df = content_df.drop(columns=['asset_type'])
event_df = event_df.drop(columns=['asset_type'])

# Perform basic statistics for each group
content_stats = content_df['system_score'].describe()
event_stats = event_df['system_score'].describe()


# Additional analysis if needed (mean, median, count)
content_mean = round(content_df['system_score'].mean(), 3)
content_median = content_df['system_score'].median()
content_count = content_df['system_score'].count()

event_mean = round(event_df['system_score'].mean(), 3)
event_median = event_df['system_score'].median()
event_count = event_df['system_score'].count()

print("Content Statistics:")
print("Mean:", content_mean, "Median:", content_median, "Number of samples:", content_count)

print("\nEvent Statistics:")
print("Mean:", event_mean, "Median:", event_median, "Number of samples:", event_count)


Content Statistics:
Mean: 0.308 Median: 0.3 Number of samples: 218

Event Statistics:
Mean: 0.398 Median: 0.4 Number of samples: 220


# Content Recommendation System 

In [13]:
# Extract unique IDs
unique_user_ids = users_database['id'].unique()
unique_content_ids = content_database['id'].unique()


# Create all combinations of user_id and content_id
all_combinations = pd.DataFrame(product(unique_user_ids, unique_content_ids), columns=['user_id', 'content_id'])

# Merge all_combinations with content_df on the appropriate columns
# Since 'content_id' in all_combinations corresponds to 'asset_id' in content_df
merged_df = all_combinations.merge(content_df[['user_id', 'asset_id', 'system_score']], 
                                   left_on=['user_id', 'content_id'], 
                                   right_on=['user_id', 'asset_id'], 
                                   how='left')

# Drop the 'asset_id' column as it is redundant with 'content_id'
merged_df.drop(columns=['asset_id'], inplace=True)


merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6345 entries, 0 to 6344
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   user_id       6345 non-null   int64  
 1   content_id    6345 non-null   int64  
 2   system_score  218 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 148.8 KB


## Evaluating sparsity
Here, we calculate sparsity by dividing the number of stored elements by total number of elements. The number of stored (non-empty) elements in our matrix (nnz) is equivalent to the number of ratings in our dataset.
- If sparsity more than 0.1 then you can make the recommendation system 
- Every user must have rated at least one piece of content rated
- Every piece of content must have been rated by at least one user

In [14]:
n_total = len(merged_df)
n_ratings = merged_df['system_score'].notna().sum()
sparsity = n_ratings/n_total
print(f"Matrix sparsity: {round(sparsity*100,2)}%")

Matrix sparsity: 3.44%


In [15]:
# Group by user_id and count non-null system scores for each user
user_ratings_count = merged_df.groupby('user_id')['system_score'].count()

# Get the minimum number of ratings among all users
min_ratings = user_ratings_count.min()

print("Minimum number of ratings by any user:", min_ratings)


Minimum number of ratings by any user: 0


In [16]:
# Group by content_id and count non-null system scores for each content
content_ratings_count = merged_df.groupby('content_id')['system_score'].count()

# Get the minimum number of ratings among all contents
min_ratings = content_ratings_count.min()

print("Minimum number of ratings by any content:", min_ratings)


Minimum number of ratings by any content: 0


## Building the Recommendation System

In [17]:
# Fill missing values with 0 (can change based on strategy)
# it just will take the users and contents that has at least one rated value
utility_matrix = merged_df.pivot_table(index='user_id', columns='content_id', values='system_score', fill_value=0)

# Convert to sparse matrix for efficient calculations
sparse_matrix = csr_matrix(utility_matrix.values)

# Compute cosine similarity between users
user_similarity = cosine_similarity(sparse_matrix)

# Compute cosine similarity between items
item_similarity = cosine_similarity(sparse_matrix.T)


In [18]:
def recommend_content_for_user(user_id, top_n=5):
    if user_id not in utility_matrix.index:
        raise Exception("User ID not found in the dataset")
    
    user_index = utility_matrix.index.tolist().index(user_id)
    similar_scores = user_similarity[user_index]
    similar_users = np.argsort(-similar_scores)
    
    # Get the content items already rated by the user
    rated_content = set(utility_matrix.loc[user_id][utility_matrix.loc[user_id] > 0].index)
    
    recommended_content = {}
    for i in similar_users:
        for content_id in utility_matrix.columns[np.argsort(-utility_matrix.iloc[i].values)]:
            if content_id not in rated_content and content_id not in recommended_content:
                recommended_content[content_id] = utility_matrix.iloc[i][content_id]
            if len(recommended_content) == top_n:
                return list(recommended_content.keys())
    return list(recommended_content.keys())

# Example usage:
user_recommendations = recommend_content_for_user(user_id=96)
print("Top 5 content recommendations for user:", user_recommendations)


Top 5 content recommendations for user: [21, 131, 127, 124, 123]


In [19]:
def recommend_similar_content(content_id, top_n=5):
    if content_id not in utility_matrix.columns:
        raise Exception("Content ID not found in the dataset")
    
    content_index = utility_matrix.columns.tolist().index(content_id)
    similar_scores = item_similarity[content_index]
    similar_content_indices = np.argsort(-similar_scores)[1:top_n+1]  # Exclude the content itself
    
    similar_content_ids = utility_matrix.columns[similar_content_indices]
    return list(similar_content_ids)

# Example usage:
content_id = 86
content_recommendations = recommend_similar_content(content_id=content_id)
print("Top 5 content recommendations for content:", content_recommendations)


Top 5 content recommendations for content: [70, 76, 137, 82, 124]


In [20]:
# Get details for the specific content_id
content_details = content_database[content_database['id'] == content_id][['id', 'title', 'description']].iloc[0]
    
# Get details for the other content IDs in the provided list
other_content_details = content_database[content_database['id'].isin(content_recommendations)][['id', 'title', 'description']]


print(f"Details for content_id {content_id}:")
print(f"Title: {content_details['title']}")
print(f"Description: {content_details['description']}\n")


print("\nDetails for other content IDs:")

for _, content in other_content_details.iterrows():
    print(f"Id: {content['id']}")
    print(f"Title: {content['title']}")
    print(f"Description: {content['description']}\n")
    

Details for content_id 86:
Title: Jushi looks to boost efficiency as it awaits new adult-use cannabis markets
Description: Multistate operator Jushi Holdings is trying to improve its efficiency and margins as it hopes for new adult-use cannabis markets in 2024.


Details for other content IDs:
Id: 70
Title: Canadian cannabis producer Organigram enters US, plans CA$25 million raise
Description: Organigram Holdings has dipped into its strategic investment pool to pick up a minority stake in Roxboro, North Carolina-based Open Book Extracts as part of its plan to target emerging cannabis markets in the United States and elsewhere.

Id: 76
Title: 280E, new markets and wholesale revenue highlight cannabis earnings season
Description: Tax strategies took the spotlight in fourth-quarter financial reports from publicly traded U.S. cannabis companies.

Id: 82
Title: How R&D cannabis lines help cultivators conduct market research
Description: R&D cannabis lines  or limited-edition, small-batch fl