<a href="https://colab.research.google.com/github/RedietNegash/Machine-Learning/blob/main/Hybrid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt

In [3]:
df_articles = pd.read_csv("/content/drive/MyDrive/Machine-Learning/Recommendation-Systems/articles_metadata.csv", low_memory=False)


In [4]:
print(df_articles.head())

   articleId                           title  \
0          1     The Impact of AI on Society   
1          2  Understanding Machine Learning   
2          3         Deep Learning Explained   
3          4          The Future of Robotics   
4          5                       AI Ethics   

                                             content  
0              AI is transforming various sectors...  
1              Machine learning is a subset of AI...  
2  Deep learning is a technique in machine learni...  
3  Robots are increasingly becoming part of our d...  
4  Ethical considerations in AI development are c...  


In [6]:
df_ratings=pd.read_csv("/content/drive/MyDrive/Machine-Learning/Recommendation-Systems/article_ratings_small.csv")
df_ratings.head(10)

Unnamed: 0,userId,articleId,rating,timestamp
0,1,1,4.0,1622555555
1,1,2,5.0,1622555556
2,1,3,4.0,1622555557
3,1,4,3.0,1622555558
4,1,5,5.0,1622555559
5,2,1,3.0,1622555560
6,2,3,4.0,1622555561
7,2,6,2.0,1622555562
8,2,7,5.0,1622555563
9,2,10,4.0,1622555564


In [7]:
df_merged = pd.merge(df_ratings, df_articles, on="articleId", how="left")


In [8]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.0


In [9]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [10]:
model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
df_articles['content'] = df_articles['content'].fillna('')
all_content = df_articles['content'].tolist()
all_embeddings = model.encode(all_content)

df_articles = df_articles.copy()
df_articles['embedding'] = list(all_embeddings)


print(df_articles[['articleId', 'content', 'embedding']].head())

   articleId                                            content  \
0          1              AI is transforming various sectors...   
1          2              Machine learning is a subset of AI...   
2          3  Deep learning is a technique in machine learni...   
3          4  Robots are increasingly becoming part of our d...   
4          5  Ethical considerations in AI development are c...   

                                           embedding  
0  [0.0065397914, -0.02900096, -0.007339295, -0.0...  
1  [-0.015036831, -0.03257998, 0.07369401, 0.0105...  
2  [-0.036377545, -0.03372445, 0.013367745, -0.01...  
3  [-0.0032222318, -0.05688273, 0.08174227, -0.00...  
4  [-0.026047785, 0.041954458, -0.0559622, -0.046...  


In [12]:
all_embeddings.shape

(15, 384)

In [16]:
def compute_weighted_average(df_ratings):
    article_ids = df_ratings['articleId'].tolist()


    filtered_articles = df_articles[df_articles['articleId'].isin(article_ids)]

    embeddings = np.array(filtered_articles['embedding'].tolist())
    ratings = df_ratings['rating'].values
    weighted_avg_embedding = np.average(embeddings, axis=0, weights=ratings)

    return weighted_avg_embedding

In [17]:
user_profiles = df_merged.groupby('userId').apply(compute_weighted_average).reset_index()
user_profiles.columns = ['userId', 'profile_embedding']
user_profiles.head()


  user_profiles = df_merged.groupby('userId').apply(compute_weighted_average).reset_index()


Unnamed: 0,userId,profile_embedding
0,1,"[-0.015925751865974495, -0.01784178330784752, ..."
1,2,"[-0.002341887059931954, -0.012313246313068602,..."
2,3,"[0.007804625471563716, -0.009231027926465398, ..."
3,4,"[-0.023701064055785537, -0.0268607372418046, 0..."
4,5,"[0.0015445607196953562, -0.03116066836648517, ..."


In [18]:
def get_relevant_articles(user_id, threshold=3.5):
    relevant_articles = df_merged[(df_merged['userId'] == user_id) & (df_merged['rating'] >= threshold)]['articleId']
    return set(relevant_articles)

In [19]:
def compute_similarity(profile_embedding, movie_embeddings):
    return cosine_similarity([profile_embedding], all_embeddings )

In [20]:
def generate_recommendations_for_user_con(user_id, k=10):
    if user_id in user_profiles['userId'].values:
        profile_embedding = user_profiles[user_profiles['userId'] == user_id]['profile_embedding'].values[0]


        all_article_embeddings = np.array(df_articles['embedding'].tolist())


        similarities = compute_similarity(profile_embedding, all_article_embeddings)[0]


        all_articles_with_scores = df_articles[['content', 'articleId']].copy()
        all_articles_with_scores['similarity'] = similarities


        top_recommendations = all_articles_with_scores.sort_values(by='similarity', ascending=False).head(k)

        top_recommendations.reset_index(drop=True, inplace=True)

        print(f"Top {k} Recommendations for User ID {user_id}:")
        print(top_recommendations[['content', 'articleId', 'similarity']])

        return top_recommendations
    else:
        print(f"No user profile found for User ID {user_id}.")
        return None


In [21]:
def extract_new_articles(user_id, recommendations):
    seen_article_ids = get_relevant_articles(user_id)
    new_articles = recommendations[~recommendations['articleId'].isin(seen_article_ids)]
    print('\n\n-----------------------------------------------------------------')
    print(f"Top 10  New Articles Recommendations for User ID {user_id}:")
    print(new_articles[['content','articleId', 'similarity']])

    return new_articles

In [23]:
user_id = 3
all_recommendations = generate_recommendations_for_user_con(user_id, k=10)
new_articles_recommendations=extract_new_articles(user_id, all_recommendations)

Top 10 Recommendations for User ID 3:
                                             content  articleId  similarity
0  Computer vision is a field that enables machin...          8    0.745435
1  Robots are increasingly becoming part of our d...          4    0.709408
2              Machine learning is a subset of AI...          2    0.684573
3              AI is transforming various sectors...          1    0.609886
4  AI applications in healthcare are revolutioniz...         11    0.592066
5  Data science combines statistics, computer sci...          6    0.588825
6  Deep learning is a technique in machine learni...          3    0.581179
7  Ethical considerations in AI development are c...          5    0.547763
8  Reinforcement learning is a type of machine le...         10    0.528841
9  AI can play a significant role in addressing c...         12    0.507852


-----------------------------------------------------------------
Top 10  New Articles Recommendations for User ID 3:
      

In [24]:
def evaluate_recommendations(k=10, relevance_threshold=3.5):
    precision_sum = 0
    recall_sum = 0
    user_count = 0

    for user_id in user_profiles['userId']:
        user_count += 1
        all_recommendations = generate_recommendations_for_user_con(user_id, k=k)
        if all_recommendations is None:
            continue

        relevant_movies = get_relevant_articles(user_id, threshold=relevance_threshold)
        recommended_ids = set(all_recommendations['articleId'])
        relevant_and_recommended = recommended_ids & relevant_movies

        precision = len(relevant_and_recommended) / k
        recall = len(relevant_and_recommended) / len(relevant_movies) if len(relevant_movies) > 0 else 0

        precision_sum += precision
        recall_sum += recall

    precision_at_k = precision_sum / user_count
    recall_at_k = recall_sum / user_count
    f1_at_k = (2 * precision_at_k * recall_at_k) / (precision_at_k + recall_at_k) if (precision_at_k + recall_at_k) > 0 else 0

    print(f"Precision@{k}: {precision_at_k}")
    print(f"Recall@{k}: {recall_at_k}")
    print(f"F1 Score@{k}: {f1_at_k}")


evaluate_recommendations(k=10)

Top 10 Recommendations for User ID 1:
                                             content  articleId  similarity
0              Machine learning is a subset of AI...          2    0.834161
1              AI is transforming various sectors...          1    0.772954
2  Ethical considerations in AI development are c...          5    0.734210
3  Deep learning is a technique in machine learni...          3    0.707602
4  Robots are increasingly becoming part of our d...          4    0.618873
5  Reinforcement learning is a type of machine le...         10    0.593728
6  AI can play a significant role in addressing c...         12    0.589508
7  AI applications in healthcare are revolutioniz...         11    0.565175
8  Computer vision is a field that enables machin...          8    0.540962
9  GANs are a class of machine learning framework...          9    0.515122
Top 10 Recommendations for User ID 2:
                                             content  articleId  similarity
0  Deep lear

-Precision@10 means that, on average, 11.06% of the top 10 recommended movies are relevant to the user (i.e., they match the user's preferences or previous high ratings).


-Recall@K measures the proportion of all relevant items that are recommended within the top K. Recall@10 in this case indicates that the system, on average, retrieves 35.48% of all the relevant movies for the user within the top 10 recommendations.




Precision measures how much of the recommended content is relevant to the user.
Recall measures how much of the relevant content is recommended to the user.

## **collaborative**

In [25]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
df_articles.head()
df_ratings_subset = df_ratings.copy()

df_ratings_subset = df_ratings_subset.groupby('userId').head(5)
df_ratings_subset.head()

Unnamed: 0,userId,articleId,rating,timestamp
0,1,1,4.0,1622555555
1,1,2,5.0,1622555556
2,1,3,4.0,1622555557
3,1,4,3.0,1622555558
4,1,5,5.0,1622555559


In [28]:

pivoted_data =df_ratings_subset.pivot(index='userId', columns='articleId', values='rating')
pivoted_data



articleId,1,2,3,4,5,6,7,8,9,10,11
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,4.0,5.0,4.0,3.0,5.0,,,,,,
2,3.0,,4.0,,,2.0,5.0,,,4.0,
3,,,,5.0,2.0,4.0,,5.0,,,3.0
4,,4.0,5.0,,,,4.0,,3.0,,
5,4.0,,,,,5.0,,,,,


In [29]:
pivoted_data_filled = pivoted_data.fillna(0)


In [30]:
user_similarity = cosine_similarity(pivoted_data_filled)
user_similarity

array([[1.        , 0.35082321, 0.29485315, 0.51613977, 0.26194334],
       [0.35082321, 1.        , 0.10757898, 0.58848989, 0.41065937],
       [0.29485315, 0.10757898, 1.        , 0.        , 0.35141842],
       [0.51613977, 0.58848989, 0.        , 1.        , 0.        ],
       [0.26194334, 0.41065937, 0.35141842, 0.        , 1.        ]])

In [35]:
user_similarity_df = pd.DataFrame(user_similarity, index=pivoted_data_filled.index, columns=pivoted_data_filled.index)
user_similarity_df

userId,1,2,3,4,5
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,0.350823,0.294853,0.51614,0.261943
2,0.350823,1.0,0.107579,0.58849,0.410659
3,0.294853,0.107579,1.0,0.0,0.351418
4,0.51614,0.58849,0.0,1.0,0.0
5,0.261943,0.410659,0.351418,0.0,1.0


In [31]:
def get_k_nearest_neighbors(similarity_matrix, target_user, k):
    similarities = similarity_matrix.loc[target_user]
    similar_users = similarities.sort_values(ascending=False).index[1:k+1]
    print("simliar users")
    print(similar_users)
    return similar_users

In [39]:
def recommend_articles_coll(ratings, similarity_matrix, target_user, k):

    if target_user not in similarity_matrix.index:
        print(f"Target user {target_user} not found in similarity matrix.")
        return None


    nearest_neighbors = get_k_nearest_neighbors(similarity_matrix, target_user, k)
    neighbor_ratings = ratings.loc[nearest_neighbors]
    avg_ratings = neighbor_ratings.mean()
    user_ratings = ratings.loc[target_user]



    recommendations = avg_ratings

    print("Recommendations including the user seen articles")
    print(recommendations)

    recommendations = avg_ratings[user_ratings == 0]
    print("Filtered recommendations (excluding articles already rated by target user):")
    print(recommendations)




    return recommendations

In [40]:
k = 2
target_user = 3
recommended_articles = recommend_articles_coll(pivoted_data_filled, user_similarity_df, target_user, k)


simliar users
Index([5, 1], dtype='int64', name='userId')
Recommendations including the user seen articles
articleId
1     4.0
2     2.5
3     2.0
4     1.5
5     2.5
6     2.5
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
dtype: float64
Filtered recommendations (excluding articles already rated by target user):
articleId
1     4.0
2     2.5
3     2.0
7     0.0
9     0.0
10    0.0
dtype: float64


In [41]:
def evaluate_collaborative_filtering(k=10, relevance_threshold=3.5):
    precision_sum = 0
    recall_sum = 0
    user_count = 0

    for user_id in user_similarity_df.index:
        user_count += 1
        recommended_articles = recommend_articles_coll(pivoted_data_filled, user_similarity_df, user_id, k)
        if recommended_articles is None:
            continue

        # Relevant articles for the user
        user_ratings = df_ratings[df_ratings['userId'] == user_id]
        relevant_articles = set(user_ratings[user_ratings['rating'] >= relevance_threshold]['articleId'])

        # Extracting the list of recommended article IDs from the recommended articles
        recommended_ids = set(recommended_articles.index)
        relevant_and_recommended = recommended_ids & relevant_articles

        # Precision and Recall calculation
        precision = len(relevant_and_recommended) / k
        recall = len(relevant_and_recommended) / len(relevant_articles) if len(relevant_articles) > 0 else 0

        precision_sum += precision
        recall_sum += recall

    precision_at_k = precision_sum / user_count
    recall_at_k = recall_sum / user_count
    f1_at_k = (2 * precision_at_k * recall_at_k) / (precision_at_k + recall_at_k) if (precision_at_k + recall_at_k) > 0 else 0

    return precision_at_k, recall_at_k, f1_at_k
precision_at_k, recall_at_k, f1_at_k = evaluate_collaborative_filtering(k=10)

print(f"Precision@{k}: {precision_at_k}")
print(f"Recall@{k}: {recall_at_k}")
print(f"F1 Score@{k}: {f1_at_k}")

simliar users
Index([4, 2, 3, 5], dtype='int64', name='userId')
Recommendations including the user seen articles
articleId
1     1.75
2     1.00
3     2.25
4     1.25
5     0.50
6     2.75
7     2.25
8     1.25
9     0.75
10    1.00
11    0.75
dtype: float64
Filtered recommendations (excluding articles already rated by target user):
articleId
6     2.75
7     2.25
8     1.25
9     0.75
10    1.00
11    0.75
dtype: float64
simliar users
Index([4, 5, 1, 3], dtype='int64', name='userId')
Recommendations including the user seen articles
articleId
1     2.00
2     2.25
3     2.25
4     2.00
5     1.75
6     2.25
7     1.00
8     1.25
9     0.75
10    0.00
11    0.75
dtype: float64
Filtered recommendations (excluding articles already rated by target user):
articleId
2     2.25
4     2.00
5     1.75
8     1.25
9     0.75
11    0.75
dtype: float64
simliar users
Index([5, 1, 2, 4], dtype='int64', name='userId')
Recommendations including the user seen articles
articleId
1     2.75
2     2.25
3  

In [45]:
def hybrid_recommend_items(cb_recommender, cf_recommender, user_id, cb_weight=1.0, cf_weight=1.0, topn=10):

    cb_recs = cb_recommender(user_id, k=1000)
    cf_recs = cf_recommender(pivoted_data_filled, user_similarity_df, user_id, k=1000)

    if cb_recs is None or cf_recs is None:
        return None

    cb_recs = cb_recs.rename(columns={'similarity': 'recStrengthCB'})
    cf_recs = cf_recs.reset_index().rename(columns={0: 'recStrengthCF'})
    cf_recs.columns = ['articleId', 'recStrengthCF']

    cb_recs['articleId'] = cb_recs['articleId'].astype(str)
    cf_recs['articleId'] = cf_recs['articleId'].astype(str)

    merged_recs = cb_recs.merge(cf_recs, on='articleId', how='outer').fillna(0)

    merged_recs['recStrengthHybrid'] = (merged_recs['recStrengthCB'] * cb_weight) + (merged_recs['recStrengthCF'] * cf_weight)

    recommendations = merged_recs.sort_values(by='recStrengthHybrid', ascending=False).head(topn)

    recommendations = recommendations[['articleId', 'recStrengthHybrid']]

    recommendations = recommendations.reset_index(drop=True)

    print(recommendations)
    return recommendations

user_id = 3
recommendations = hybrid_recommend_items(
    cb_recommender=generate_recommendations_for_user_con,
    cf_recommender=recommend_articles_coll,
    user_id=user_id,
    cb_weight=1.0,
    cf_weight=1.0,
    topn=5
)


Top 1000 Recommendations for User ID 3:
                                              content  articleId  similarity
0   Computer vision is a field that enables machin...          8    0.745435
1   Robots are increasingly becoming part of our d...          4    0.709408
2               Machine learning is a subset of AI...          2    0.684573
3               AI is transforming various sectors...          1    0.609886
4   AI applications in healthcare are revolutioniz...         11    0.592066
5   Data science combines statistics, computer sci...          6    0.588825
6   Deep learning is a technique in machine learni...          3    0.581179
7   Ethical considerations in AI development are c...          5    0.547763
8   Reinforcement learning is a type of machine le...         10    0.528841
9   AI can play a significant role in addressing c...         12    0.507852
10  NLP enables computers to understand human lang...          7    0.464464
11  GANs are a class of machine lear

In [47]:
def evaluate_hybrid_recommender(k=10, cb_weight=1.0, cf_weight=1.0, relevance_threshold=3.5):
    precision_sum = 0
    recall_sum = 0
    user_count = 0

    for user_id in user_profiles['userId']:
        user_count += 1

        # Generate hybrid recommendations
        recommendations = hybrid_recommend_items(
            cb_recommender=generate_recommendations_for_user_con,
            cf_recommender=recommend_articles_coll,
            user_id=user_id,
            cb_weight=cb_weight,
            cf_weight=cf_weight,
            topn=k
        )

        if recommendations is None:
            continue

        # Get relevant movies
        relevant_movies = get_relevant_articles(user_id, threshold=relevance_threshold)
        recommended_ids = set(recommendations['articleId'])
        relevant_and_recommended = recommended_ids & relevant_movies

        # Calculate precision and recall
        precision = len(relevant_and_recommended) / k
        recall = len(relevant_and_recommended) / len(relevant_movies) if len(relevant_movies) > 0 else 0

        precision_sum += precision
        recall_sum += recall

    # Calculate average precision, recall, and F1 score
    precision_at_k = precision_sum / user_count
    recall_at_k = recall_sum / user_count
    f1_at_k = (2 * precision_at_k * recall_at_k) / (precision_at_k + recall_at_k) if (precision_at_k + recall_at_k) > 0 else 0

    return precision_at_k, recall_at_k, f1_at_k

# Call the evaluation function for hybrid recommendations
precision_at_k, recall_at_k, f1_at_k = evaluate_hybrid_recommender(k=10)

print(f"Hybrid Precision@{k}: {precision_at_k}")
print(f"Hybrid Recall@{k}: {recall_at_k}")
print(f"Hybrid F1 Score@{k}: {f1_at_k}")


Top 1000 Recommendations for User ID 1:
                                              content  articleId  similarity
0               Machine learning is a subset of AI...          2    0.834161
1               AI is transforming various sectors...          1    0.772954
2   Ethical considerations in AI development are c...          5    0.734210
3   Deep learning is a technique in machine learni...          3    0.707602
4   Robots are increasingly becoming part of our d...          4    0.618873
5   Reinforcement learning is a type of machine le...         10    0.593728
6   AI can play a significant role in addressing c...         12    0.589508
7   AI applications in healthcare are revolutioniz...         11    0.565175
8   Computer vision is a field that enables machin...          8    0.540962
9   GANs are a class of machine learning framework...          9    0.515122
10  NLP enables computers to understand human lang...          7    0.447851
11  Data science combines statistics

Precision 4 of 0.098 means that, on average, only about 9.8% of the top 4 recommended movies are relevant to the user.

Recall 4 of 0.332 indicates that the system is able to retrieve about 33.2% of all relevant movies for the user within the top 4 recommendations.

F1 Score 4 of 0.151 is relatively low, reflecting a trade-off between precision and recall. It indicates that while the system is somewhat effective in retrieving relevant items, its overall performance is not strong.