<a href="https://colab.research.google.com/github/RedietNegash/Machine-Learning/blob/main/content_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only) so the CSV files read below from
# /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt

In [None]:
# Load the article metadata from the mounted Drive folder.
df_articles = pd.read_csv("/content/drive/MyDrive/Machine-Learning/Recommendation-Systems/articles_metadata.csv", low_memory=False)


In [None]:
# Preview the first rows to check which columns were loaded.
print(df_articles.head())

   articleId                           title  \
0          1     The Impact of AI on Society   
1          2  Understanding Machine Learning   
2          3         Deep Learning Explained   
3          4          The Future of Robotics   
4          5                       AI Ethics   

                                             content  
0              AI is transforming various sectors...  
1              Machine learning is a subset of AI...  
2  Deep learning is a technique in machine learni...  
3  Robots are increasingly becoming part of our d...  
4  Ethical considerations in AI development are c...  


In [None]:
# Load the per-user article ratings and preview up to the first 100 rows.
df_ratings=pd.read_csv("/content/drive/MyDrive/Machine-Learning/Recommendation-Systems/article_ratings_small.csv")
df_ratings.head(100)

Unnamed: 0,userId,articleId,rating,timestamp
0,1,1,4.0,1622555555
1,1,2,5.0,1622555556
2,1,3,4.0,1622555557
3,1,4,3.0,1622555558
4,1,5,5.0,1622555559
5,2,1,3.0,1622555560
6,2,3,4.0,1622555561
7,2,6,2.0,1622555562
8,2,7,5.0,1622555563
9,2,10,4.0,1622555564


In [None]:
# Attach article metadata to every rating row. how="left" keeps all ratings, so a
# rating whose articleId is missing from df_articles ends up with NaN metadata.
df_merged = pd.merge(df_ratings, df_articles, on="articleId", how="left")


In [None]:
def calculate_similarity(articles):
    """Score every unordered pair of rows by content overlap and rating closeness.

    Parameters
    ----------
    articles : pandas.DataFrame
        Must provide the columns ``articleId``, ``content`` and ``rating``.
        Rows are paired positionally, so the index does not need to be unique.

    Returns
    -------
    list of dict
        One entry per pair ``(i, j)`` with ``i < j`` in row order, holding
        ``articleId``, ``sim_articleId`` and ``relevance``. ``relevance`` is the
        plain mean of:

        * content similarity - Jaccard overlap of the whitespace-separated
          tokens of the two ``content`` values (0 when both are empty);
        * rating similarity - ``1 - |r1 - r2| / max(r1, r2)``, or 0 when either
          rating is zero or missing.

    Notes
    -----
    Missing (non-string) content is treated as an empty document instead of
    raising, because a left merge with the article metadata can leave NaN
    content behind.
    """
    article_ids = articles['articleId'].tolist()
    ratings = articles['rating'].tolist()
    # Tokenise each article once up front rather than once per pair.
    token_sets = [
        set(text.split()) if isinstance(text, str) else set()
        for text in articles['content']
    ]

    similarities = []
    n_articles = len(article_ids)
    for i in range(n_articles):
        for j in range(i + 1, n_articles):
            # Content similarity: Jaccard index of the two token sets.
            union_size = len(token_sets[i] | token_sets[j])
            if union_size > 0:
                content_similarity = len(token_sets[i] & token_sets[j]) / union_size
            else:
                content_similarity = 0

            # Rating similarity: relative closeness of the two ratings. A missing
            # rating is handled like a zero rating so NaN never leaks into the score.
            rating1, rating2 = ratings[i], ratings[j]
            if pd.isna(rating1) or pd.isna(rating2) or not rating1 or not rating2:
                rating_similarity = 0
            else:
                rating_similarity = 1 - abs(rating1 - rating2) / max(rating1, rating2)

            # Combine both signals with equal weight (simple mean).
            overall_similarity = (content_similarity + rating_similarity) / 2

            similarities.append({
                'articleId': article_ids[i],
                'sim_articleId': article_ids[j],
                'relevance': overall_similarity
            })

    return similarities

In [None]:
# Install the package that provides SentenceTransformer (imported in the next cell).
!pip install sentence_transformers



In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the "all-MiniLM-L6-v2" sentence-embedding model used to encode article content.
model = SentenceTransformer("all-MiniLM-L6-v2")



In [None]:
# Replace missing content with empty strings so every article is encoded from a string.
df_articles['content'] = df_articles['content'].fillna('')
all_content = df_articles['content'].tolist()
# Encode all article texts in one call; the result is kept as `all_embeddings`
# and reused by later cells.
all_embeddings = model.encode(all_content)

# NOTE(review): this copy happens after the in-place fillna above, so it does not
# protect the originally loaded frame from that mutation.
df_articles = df_articles.copy()
# Store one embedding per row, in the same row order as `all_content`.
df_articles['embedding'] = list(all_embeddings)


print(df_articles[['articleId', 'content', 'embedding']].head())

   articleId                                            content  \
0          1              AI is transforming various sectors...   
1          2              Machine learning is a subset of AI...   
2          3  Deep learning is a technique in machine learni...   
3          4  Robots are increasingly becoming part of our d...   
4          5  Ethical considerations in AI development are c...   

                                           embedding  
0  [0.0065397914, -0.02900096, -0.007339295, -0.0...  
1  [-0.015036831, -0.03257998, 0.07369401, 0.0105...  
2  [-0.036377545, -0.03372445, 0.013367745, -0.01...  
3  [-0.0032222318, -0.05688273, 0.08174227, -0.00...  
4  [-0.026047785, 0.041954458, -0.0559622, -0.046...  


In [None]:
all_embeddings.shape

(15, 384)

In [None]:
def compute_weighted_average(df_ratings):
    """Build a profile vector as the rating-weighted mean of the rated articles' embeddings.

    Parameters
    ----------
    df_ratings : pandas.DataFrame
        The ratings of one user (e.g. one ``groupby('userId')`` group). Must have
        a ``rating`` column and either an ``embedding`` column or an ``articleId``
        column that can be looked up in the notebook-level ``df_articles``.

    Returns
    -------
    numpy.ndarray
        1-D weighted average of the embeddings of the articles in ``df_ratings``,
        using the ratings as weights.

    Raises
    ------
    ValueError
        If none of the rows has both a rating and an embedding.

    Notes
    -----
    The embeddings are matched to the rows of ``df_ratings``. Averaging the
    embeddings of *all* articles with one user's ratings as weights would pair
    weights with the wrong articles, and fails outright when the lengths differ.
    """
    ratings = df_ratings['rating'].to_numpy(dtype=float)

    if 'embedding' in df_ratings.columns:
        vectors = df_ratings['embedding']
    else:
        # Look up each rated article's embedding by id so weights and vectors line up.
        lookup = (
            df_articles.drop_duplicates('articleId')
            .set_index('articleId')['embedding']
        )
        vectors = lookup.reindex(df_ratings['articleId'])

    # Ignore rows without an embedding (article missing from the metadata) or
    # without a rating; either would turn the whole profile into NaN or fail.
    usable = vectors.notna().to_numpy() & ~np.isnan(ratings)
    if not usable.any():
        raise ValueError("No rated article with an embedding to build a profile from.")

    embeddings = np.array(vectors[usable].tolist())
    weighted_avg_embedding = np.average(embeddings, axis=0, weights=ratings[usable])
    return weighted_avg_embedding

In [None]:
# Build one profile vector per user. Only the columns the aggregation needs are
# selected, so the grouping column itself is not handed to `apply` (operating on
# the grouping column is deprecated in recent pandas and triggers a warning).
user_profiles = (
    df_merged.groupby('userId')[['articleId', 'rating']]
    .apply(compute_weighted_average)
    .reset_index()
)
user_profiles.columns = ['userId', 'profile_embedding']
user_profiles.head()


  user_profiles = df_merged.groupby('userId').apply(compute_weighted_average).reset_index()


Unnamed: 0,userId,profile_embedding
0,1,"[-0.015925751865974495, -0.01784178330784752, ..."
1,2,"[-0.002341887059931954, -0.012313246313068602,..."
2,3,"[0.007804625471563716, -0.009231027926465398, ..."
3,4,"[-0.023701064055785537, -0.0268607372418046, 0..."
4,5,"[0.0015445607196953562, -0.03116066836648517, ..."


In [None]:
def get_relevant_articles(user_id, threshold=3.5):
    """Return the set of articleIds that `user_id` rated at or above `threshold`.

    Reads the notebook-level `df_merged` frame (columns `userId`, `rating`,
    `articleId`). The result is an empty set when no row matches.
    """
    is_user = df_merged['userId'] == user_id
    is_liked = df_merged['rating'] >= threshold
    return set(df_merged.loc[is_user & is_liked, 'articleId'])

In [None]:
def compute_similarity(profile_embedding, article_embeddings):
    """Cosine similarity between one profile vector and each article embedding.

    Parameters
    ----------
    profile_embedding : array-like, 1-D
        The user's profile vector.
    article_embeddings : array-like, 2-D
        One embedding per row; the candidate articles to score.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, n_articles)``; row 0 holds the similarity of the
        profile to each row of ``article_embeddings``.
    """
    # Score against the embeddings that were passed in, not a notebook-level
    # array, so the result always lines up with the caller's article rows.
    return cosine_similarity([profile_embedding], article_embeddings)

In [None]:
def generate_recommendations_for_user_con(user_id, k=10):
    """Rank all articles by similarity to a user's profile and return the top `k`.

    Reads the notebook-level `user_profiles` and `df_articles` frames and scores
    articles with `compute_similarity`. The ranking is printed as a side effect.

    Returns a DataFrame with columns `content`, `articleId` and `similarity`,
    sorted by descending similarity with a fresh 0..k-1 index, or `None` (after
    printing a notice) when `user_id` has no profile.
    """
    # Guard clause: nothing to rank without a profile.
    if user_id not in user_profiles['userId'].values:
        print(f"No user profile found for User ID {user_id}.")
        return None

    matching_profiles = user_profiles[user_profiles['userId'] == user_id]
    profile_embedding = matching_profiles['profile_embedding'].values[0]

    # Stack the per-article embeddings into one 2-D matrix for scoring.
    article_matrix = np.array(df_articles['embedding'].tolist())
    scores = compute_similarity(profile_embedding, article_matrix)[0]

    top_recommendations = (
        df_articles[['content', 'articleId']]
        .assign(similarity=scores)
        .sort_values(by='similarity', ascending=False)
        .head(k)
        .reset_index(drop=True)
    )

    print(f"Top {k} Recommendations for User ID {user_id}:")
    print(top_recommendations[['content', 'articleId', 'similarity']])

    return top_recommendations

In [None]:


def extract_unseen_movies(user_id, recommendations):
    """Drop recommendations for articles the user has already rated.

    Parameters
    ----------
    user_id
        The user whose rating history is read from the notebook-level `df_merged`.
    recommendations : pandas.DataFrame
        Candidate recommendations with an `articleId` column.

    Returns
    -------
    pandas.DataFrame
        The rows of `recommendations` whose `articleId` the user has not rated.

    Notes
    -----
    "Seen" means *any* rating by the user, whatever its value. Using only the
    highly rated articles would let articles the user already rated poorly slip
    back into the "unseen" list.
    """
    seen_article_ids = set(
        df_merged.loc[df_merged['userId'] == user_id, 'articleId']
    )
    unseen_articles = recommendations[~recommendations['articleId'].isin(seen_article_ids)]
    return unseen_articles

In [None]:
# Example: print and keep the 10 highest-scoring articles for user 3.
user_id = 3
all_recommendations = generate_recommendations_for_user_con(user_id, k=10)

Top 10 Recommendations for User ID 3:
                                             content  articleId  similarity
0  Computer vision is a field that enables machin...          8    0.745435
1  Robots are increasingly becoming part of our d...          4    0.709408
2              Machine learning is a subset of AI...          2    0.684573
3              AI is transforming various sectors...          1    0.609886
4  AI applications in healthcare are revolutioniz...         11    0.592066
5  Data science combines statistics, computer sci...          6    0.588825
6  Deep learning is a technique in machine learni...          3    0.581179
7  Ethical considerations in AI development are c...          5    0.547763
8  Reinforcement learning is a type of machine le...         10    0.528841
9  AI can play a significant role in addressing c...         12    0.507852


In [None]:
def evaluate_recommendations(k=10, relevance_threshold=3.5):
    """Compute mean Precision@k, Recall@k and their F1 over all user profiles.

    For every user in the notebook-level `user_profiles`, the top-`k`
    recommendations from `generate_recommendations_for_user_con` are compared
    with the articles returned by `get_relevant_articles` for that user.

    Parameters
    ----------
    k : int
        Number of recommendations requested per user; also the precision denominator.
    relevance_threshold : float
        Minimum rating for an article to count as relevant.

    Returns
    -------
    dict
        ``{'precision_at_k', 'recall_at_k', 'f1_at_k'}``. The same values are
        printed. All three are 0.0 when no user could be evaluated.

    Notes
    -----
    Only users that actually received recommendations are counted, so skipped
    users do not drag the averages down.
    NOTE(review): the profiles appear to be built from the same ratings that
    define relevance here, so these scores are likely optimistic - confirm
    whether a held-out split is intended.
    """
    precision_sum = 0.0
    recall_sum = 0.0
    user_count = 0

    for user_id in user_profiles['userId']:
        recommendations = generate_recommendations_for_user_con(user_id, k=k)
        if recommendations is None:
            continue
        # Count the user only once we know they contribute to the sums.
        user_count += 1

        relevant_articles = get_relevant_articles(user_id, threshold=relevance_threshold)
        hits = set(recommendations['articleId']) & relevant_articles

        precision_sum += len(hits) / k
        # A user with no relevant articles contributes a recall of 0.
        recall_sum += len(hits) / len(relevant_articles) if relevant_articles else 0

    if user_count == 0:
        # Avoid dividing by zero when there is nobody to evaluate.
        precision_at_k = recall_at_k = f1_at_k = 0.0
    else:
        precision_at_k = precision_sum / user_count
        recall_at_k = recall_sum / user_count
        f1_at_k = (2 * precision_at_k * recall_at_k) / (precision_at_k + recall_at_k) if (precision_at_k + recall_at_k) > 0 else 0

    print(f"Precision@{k}: {precision_at_k}")
    print(f"Recall@{k}: {recall_at_k}")
    print(f"F1 Score@{k}: {f1_at_k}")

    return {
        'precision_at_k': precision_at_k,
        'recall_at_k': recall_at_k,
        'f1_at_k': f1_at_k,
    }

# Evaluate the top-10 recommendations across all user profiles (default relevance threshold).
evaluate_recommendations(k=10)

Top 10 Recommendations for User ID 1:
                                             content  articleId  similarity
0              Machine learning is a subset of AI...          2    0.834161
1              AI is transforming various sectors...          1    0.772954
2  Ethical considerations in AI development are c...          5    0.734210
3  Deep learning is a technique in machine learni...          3    0.707602
4  Robots are increasingly becoming part of our d...          4    0.618873
5  Reinforcement learning is a type of machine le...         10    0.593728
6  AI can play a significant role in addressing c...         12    0.589508
7  AI applications in healthcare are revolutioniz...         11    0.565175
8  Computer vision is a field that enables machin...          8    0.540962
9  GANs are a class of machine learning framework...          9    0.515122
Top 10 Recommendations for User ID 2:
                                             content  articleId  similarity
0  Deep lear

- **Precision@10** means that, on average, 11.06% of the top 10 recommended articles are relevant to the user (i.e., they match the user's preferences or previous high ratings).


- **Recall@K** measures the proportion of all relevant items that are recommended within the top K. Recall@10 in this case indicates that the system retrieves, on average, 35.48% of all the articles relevant to a user within the top 10 recommendations.




Precision measures how much of the recommended content is relevant to the user.
Recall measures how much of the relevant content is recommended to the user.