In [264]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import project_tests as t
import pickle

import re

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Import the tokenize function from tokenize_mod
from tokenize_mod import tokenize
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline

df = pd.read_csv('data/user-item-interactions.csv')
df_content = pd.read_csv('data/articles_community.csv')
del df['Unnamed: 0']
del df_content['Unnamed: 0']

# Show df to get an idea of the data
# This data set includes the articles with title and email per "view"
df_content['article_id'] = df_content['article_id'].astype(float)
df_content = df_content.drop_duplicates(subset="article_id")
df_content.head()

Unnamed: 0,doc_body,doc_description,doc_full_name,doc_status,article_id
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,Detect Malfunctioning IoT Sensors with Streami...,Live,0.0
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",Communicating data science: A guide to present...,Live,1.0
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,"This Week in Data Science (April 18, 2017)",Live,2.0
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,DataLayer Conference: Boost the performance of...,Live,3.0
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,Analyze NY Restaurant data using Spark in DSX,Live,4.0


In [268]:
def find_missing_articles(df, df_content):
    """
    This function checks if there are any article_ids in df that are not in df_content.
    
    Args:
    df (DataFrame): The main DataFrame with article interactions.
    df_content (DataFrame): The DataFrame containing article metadata.
    
    Returns:
    missing_article_ids (list): A list of article_ids in df that are not present in df_content.
    """
    # Convert article_id columns to string to avoid data type mismatch
    df['article_id'] = df['article_id'].astype(str)
    df_content['article_id'] = df_content['article_id'].astype(str)

    # Find article_ids in df that are not in df_content
    missing_article_ids = df[~df['article_id'].isin(df_content['article_id'])]['article_id'].unique()

    # Return the list of missing article_ids
    return missing_article_ids

In [270]:
missing_articles = find_missing_articles(df, df_content)

In [265]:
df.article_id.max()

1444.0

In [266]:
df_content.article_id.max()

1050.0

In [89]:
doc_name = df_content.doc_full_name.values.tolist()
len(doc_name)


1056

In [90]:
doc_name_tokenized = []
doc_name_tokenized = [tokenize(str(name)) for name in ibm_content]
doc_name_tokenized

[['detect', 'malfunctioning', 'iot', 'sensors', 'streaming', 'analytics'],
 ['communicating', 'data', 'science', 'a', 'guide', 'presenting', 'work'],
 ['this', 'week', 'data', 'science', 'april', '18', '2017'],
 ['datalayer',
  'conference',
  'boost',
  'performance',
  'distributed',
  'database'],
 ['analyze', 'ny', 'restaurant', 'data', 'using', 'spark', 'dsx'],
 ['browsing', 'postgresql', 'data', 'compose'],
 ['upgrading', 'postgresql', '9', '5'],
 ['data', 'wrangling', 'slack'],
 ['data', 'science', 'bowl', '2017'],
 ['using',
  'apache',
  'spark',
  'predict',
  'attack',
  'vector',
  'among',
  'billion',
  'user',
  'trillion',
  'event'],
 ['offline',
  'first',
  'ios',
  'apps',
  'swift',
  'cloudant',
  'sync',
  'part',
  '1',
  'the',
  'datastore'],
 ['warehousing', 'geojson', 'document'],
 ['timeseries',
  'data',
  'analysis',
  'iot',
  'event',
  'using',
  'jupyter',
  'notebook'],
 ['bridging', 'gap', 'between', 'python', 'scala', 'jupyter', 'notebooks'],
 ['go

In [93]:
# Count the number of 1s (non-zero elements that are 1)
ones_count = np.sum(X == 1)

# Count the number of non-zero elements (sparse matrices store only non-zero elements)
non_zero_count = X.count_nonzero()

# Total number of elements in the matrix
total_elements = X.shape[0] * X.shape[1]

# Number of 0s is the total elements minus the non-zero elements
zeros_count = total_elements - non_zero_count

# Number of 'other' elements are non-zero elements that are not 1
others_count = non_zero_count - ones_count

print(f"Number of 0s: {zeros_count}")
print(f"Number of 1s: {ones_count}")
print(f"Number of other values: {others_count}")
print(f"Total elements in the matrix: {total_elements}")

Number of 0s: 695332
Number of 1s: 39
Number of other values: 4757
Total elements in the matrix: 700128


In [157]:
def build_similarity_model(df_content):
    """
    Builds a content-based similarity model using TF-IDF and cosine similarity.
    
    The function performs the following steps:
    1. Vectorizes the document titles using TF-IDF.
    2. Computes the cosine similarity matrix between document titles.
    
    Returns:
    --------
    cosine_sim_df: DataFrame
        A DataFrame containing the cosine similarity scores between document titles.
    """

    # Extract document titles from the df
    doc_name = df_content['doc_full_name'].values.tolist()

    # Create a TF-IDF vectorizer and fit it to the document titles
    vectorizer = TfidfVectorizer(tokenizer=tokenize, min_df=2, max_df=0.95)
    X = vectorizer.fit_transform(doc_name)

    # Compute the cosine similarity matrix between all document vectors
    cosine_sim = cosine_similarity(X)

    # Convert the similarity matrix into a df
    cosine_sim_df = pd.DataFrame(cosine_sim, index=doc_name, columns=doc_name)

    # Return the cosine similarity DataFrame
    return cosine_sim_df


In [158]:
cosine_sim_df = build_similarity_model(df_content)
cosine_sim_df



Unnamed: 0,Detect Malfunctioning IoT Sensors with Streaming Analytics,Communicating data science: A guide to presenting your work,"This Week in Data Science (April 18, 2017)",DataLayer Conference: Boost the performance of your distributed database,Analyze NY Restaurant data using Spark in DSX,Browsing PostgreSQL Data with Compose,Upgrading your PostgreSQL to 9.5,Data Wrangling at Slack,Data Science Bowl 2017,Using Apache Spark to predict attack vectors among billions of users and trillions of events,...,Mapping All the Things with Python – IBM Watson Data Lab – Medium,Use IBM Data Science Experience to Read and Write Data Stored on Amazon S3,Use IoT data in Streams Designer for billing and alerts,Mapping Points with Folium,A Speed Guide To Redis Lua Scripting,A look under the covers of PouchDB-find,A comparison of logistic regression and naive Bayes,What I Learned Implementing a Classifier from Scratch in Python · Jean-Nicholas Hould,Use dashDB with Spark,"Jupyter Notebooks with Scala, Python, or R Kernels"
Detect Malfunctioning IoT Sensors with Streaming Analytics,1.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.233189,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.00000
Communicating data science: A guide to presenting your work,0.0,1.000000,0.148876,0.0,0.045340,0.087390,0.0,0.052546,0.282969,0.000000,...,0.040063,0.141384,0.043087,0.0,0.268218,0.0,0.0,0.00000,0.000000,0.00000
"This Week in Data Science (April 18, 2017)",0.0,0.148876,1.000000,0.0,0.035614,0.068645,0.0,0.041275,0.526122,0.000000,...,0.031470,0.111058,0.033845,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.00000
DataLayer Conference: Boost the performance of your distributed database,0.0,0.000000,0.000000,1.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.00000
Analyze NY Restaurant data using Spark in DSX,0.0,0.045340,0.035614,0.0,1.000000,0.072029,0.0,0.043310,0.067692,0.170562,...,0.033021,0.052428,0.035513,0.0,0.000000,0.0,0.0,0.00000,0.182155,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A look under the covers of PouchDB-find,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,1.0,0.0,0.00000,0.000000,0.00000
A comparison of logistic regression and naive Bayes,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,1.0,0.00000,0.000000,0.00000
What I Learned Implementing a Classifier from Scratch in Python · Jean-Nicholas Hould,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.141363,0.000000,0.000000,0.0,0.000000,0.0,0.0,1.00000,0.000000,0.16529
Use dashDB with Spark,0.0,0.000000,0.000000,0.0,0.182155,0.000000,0.0,0.000000,0.000000,0.142585,...,0.000000,0.169739,0.229955,0.0,0.000000,0.0,0.0,0.00000,1.000000,0.00000


In [186]:
## No need to change the code here - this will be helpful for later parts of the notebook
# Run this cell to map the user email to a user_id column and remove the email column

def email_mapper():
    coded_dict = dict()
    cter = 1
    email_encoded = []
    
    for val in df['email']:
        if val not in coded_dict:
            coded_dict[val] = cter
            cter+=1
        
        email_encoded.append(coded_dict[val])
    return email_encoded

email_encoded = email_mapper()
del df['email']
df['user_id'] = email_encoded

# show header
df.head()

Unnamed: 0,article_id,title,user_id
0,1430.0,"using pixiedust for fast, flexible, and easier...",1
1,1314.0,healthcare python streaming application demo,2
2,1429.0,use deep learning for image classification,3
3,1338.0,ml optimization using cognitive assistant,4
4,1276.0,deploy your python model as a restful api,5


In [187]:
# Create a pivot table from dfs user_id and article_id columns where 1s indicate an entry and 0 not
df_new = df[['user_id', 'article_id']].pivot_table(index='user_id', columns='article_id', aggfunc=lambda x: 1, fill_value=0)
df_new.head(3)

article_id,0.0,2.0,4.0,8.0,9.0,12.0,14.0,15.0,16.0,18.0,...,1434.0,1435.0,1436.0,1437.0,1439.0,1440.0,1441.0,1442.0,1443.0,1444.0
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [188]:
# create the user-article matrix with 1's and 0's

def create_user_item_matrix(df):
    '''
    INPUT:
    df - pandas dataframe with article_id, title, user_id columns
    
    OUTPUT:
    user_item - user item matrix 
    
    Description:
    Return a matrix with user ids as rows and article ids on the columns with 1 values where a user interacted with 
    an article and a 0 otherwise
    '''
    # Fill in the function here
    user_item = df[['user_id', 'article_id']].pivot_table(index='user_id', columns='article_id', aggfunc=lambda x: 1, fill_value=0)
    
    return user_item # return the user_item matrix 

user_item = create_user_item_matrix(df)

In [199]:
def get_article_names(article_ids, df=df):
    '''
    INPUT:
    article_ids - (list) a list of article ids
    df - (pandas dataframe) df as defined at the top of the notebook
    
    OUTPUT:
    article_names - (list) a list of article names associated with the list of article ids 
                    (this is identified by the title column)
    '''
    # Your code here
    article_names = df[df['article_id'].isin(article_ids)]['title'].drop_duplicates().tolist()
    
    return article_names # Return the article names associated with list of article ids

def get_user_articles(user_id, user_item=user_item):
    '''
    INPUT:
    user_id - (int) a user id
    user_item - (pandas dataframe) matrix of users by articles: 
                1's when a user has interacted with an article, 0 otherwise
    
    OUTPUT:
    article_ids - (list) a list of the article ids seen by the user
    article_names - (list) a list of article names associated with the list of article ids 
                    (this is identified by the doc_full_name column in df_content)
    
    Description:
    Provides a list of the article_ids and article titles that have been seen by a user
    '''
    
    # Get the article IDs for the user, where interaction is 1
    article_ids = user_item.columns[user_item.iloc[user_id-1] == 1].astype(str).tolist()
    
    # Get the article names using the article IDs
    article_names = get_article_names(article_ids)
    
    return article_ids, article_names # return the ids and names

In [212]:
# Get the articles the user has already seen
user_article_ids = user_item.loc[20][user_item.loc[20] == 1].index.tolist()
print(user_article_ids)

article_ids, article_names = get_user_articles(20, user_item=user_item)
article_ids

[232.0, 844.0, 1320.0]


['232.0', '844.0', '1320.0']

In [214]:
df['article_id'].dtype

dtype('float64')

In [252]:
article_ids_seen, article_names_seen = get_user_articles(1, user_item=user_item)
articles_seen = df[df['article_id'].astype(str).isin(article_ids_seen)]
article_interactions = articles_seen['article_id'].value_counts()
article_id_nr_1 = article_interactions.idxmax()
article_id_nr_1 = float(article_id_nr_1)
article_interactions
#df.loc[article_ids_seen]

article_id
1429.0    937
1431.0    671
1427.0    643
1293.0    572
1170.0    565
1436.0    481
43.0      460
1185.0    442
1368.0    418
1305.0    413
151.0     352
1430.0    336
1052.0    330
1400.0    279
390.0     270
732.0     239
109.0     198
1391.0    191
1183.0    168
268.0     151
981.0     130
910.0     125
525.0      85
310.0      84
329.0      65
1439.0     59
1406.0     58
494.0      48
768.0      28
585.0      26
968.0      26
346.0      25
1363.0     22
1232.0     12
668.0      12
626.0      10
Name: count, dtype: int64

In [257]:
article_title = df_content.loc[1200]["doc_full_name"]
article_title

KeyError: 1200

In [229]:
article_title = df_content.loc[1320.0]["doc_full_name"]

KeyError: 1320.0

In [236]:
article_ids_seen, _ = get_user_articles(20, user_item=user_item)
article_ids_seen

['232.0', '844.0', '1320.0']

In [282]:
def content_based_recc(user_id, m=10, missing_articles=missing_articles):
    """
    Recommend m most similar articles to the most popular article the user has interacted with,
    excluding articles that are missing from df_content. Outputs both article_id and doc_full_name.
    
    Args:
    user_id (int): The ID of the user for whom recommendations are being made.
    m (int): The number of recommendations to generate. Default is 10.
    missing_articles (list): A list of article_ids that should be excluded from the recommendations.
    
    Returns:
    list: A list of recommended articles in the form of tuples (article_id, doc_full_name).
    """
    
    # Get article ids of the articles the user has seen
    article_ids_seen, _ = get_user_articles(user_id, user_item=user_item)

    # Filter the articles seen by the user, excluding missing articles in df_content
    articles_seen = df[df['article_id'].astype(str).isin(article_ids_seen) & 
                       ~df['article_id'].astype(str).isin(missing_articles)]

    # Check if any articles are left after filtering
    if articles_seen.empty:
        print(f"User {user_id} has not seen any valid articles (excluding missing ones).")
        return []

    # Rank articles seen by the user by the number of interactions
    article_interactions = articles_seen['article_id'].value_counts()
    article_id_nr_1 = article_interactions.idxmax()

    # Get the article name of the most interacted article
    article_name_nr_1 = df_content[df_content['article_id'] == article_id_nr_1]['doc_full_name'].values[0]

    # Print user id and the article chosen for similarity comparison
    print(f"Recommendations for User ID: {user_id}")
    print(f"Article used for similarity comparison (most interacted by user):")
    print(f"  - Article ID: {article_id_nr_1}, Article Name: {article_name_nr_1}")
    
    # Get the cosine similarity DataFrame using the build_similarity_model() function
    cosine_sim_df = build_similarity_model(df_content)  # Assuming cosine_sim_df is built globally or from another source

    # Find the most similar articles to the most popular one
    sorted_similarity_scores = cosine_sim_df.loc[article_name_nr_1].sort_values(ascending=False)

    # Drop the chosen article_id article from the list to avoid recommending the same article
    sorted_similarity_scores = sorted_similarity_scores.drop(article_name_nr_1)

    # Filter articles that the user has not seen, not missing, and take the top m recommendations
    recommendations = []
    
    for similar_article_name in sorted_similarity_scores.index:
        similar_article_id = df_content[df_content['doc_full_name'] == similar_article_name]['article_id'].values[0]

        # Only recommend articles that the user has not seen and are not in missing_articles
        if similar_article_id not in article_ids_seen and similar_article_id not in missing_articles:
            recommendations.append((similar_article_id, similar_article_name))

        # Stop when we have enough recommendations
        if len(recommendations) >= m:
            break

    # Output the recommended articles
    print("\nRecommended Articles:")
    for article_id, article_name in recommendations:
        print(f"  - Article ID: {article_id}, Article Name: {article_name}")
    
    return recommendations

In [281]:
content_based_recc(20)

Recommendations for User ID: 20
Article used for similarity comparison (most interacted by user):
  - Article ID: 844.0, Article Name: Use the Cloudant-Spark connector in Python notebook





Recommended Articles:
  - Article ID: 264.0, Article Name: Introducing spark-cloudant, an open source Spark connector for Cloudant data
  - Article ID: 934.0, Article Name: Load Cloudant Data in Apache Spark Using a Python Notebook
  - Article ID: 1049.0, Article Name: Use dashDB with Spark
  - Article ID: 776.0, Article Name: Your own weather forecast in a Python notebook
  - Article ID: 953.0, Article Name: Simple Data Pipe connectors
  - Article ID: 172.0, Article Name: Simple Data Pipe Connectors
  - Article ID: 463.0, Article Name: What is Spark?
  - Article ID: 560.0, Article Name: Load Cloudant Data in Apache Spark Using a Scala Notebook
  - Article ID: 916.0, Article Name: Use the new Cloudant query
  - Article ID: 161.0, Article Name: Use the Machine Learning Library in Spark


[('264.0',
  'Introducing spark-cloudant, an open source Spark connector for Cloudant data'),
 ('934.0', 'Load Cloudant Data in Apache Spark Using a Python Notebook'),
 ('1049.0', 'Use dashDB with Spark'),
 ('776.0', 'Your own weather forecast in a Python notebook'),
 ('953.0', 'Simple Data Pipe connectors'),
 ('172.0', 'Simple Data Pipe Connectors'),
 ('463.0', 'What is Spark?'),
 ('560.0', 'Load Cloudant Data in Apache Spark Using a Scala Notebook'),
 ('916.0', 'Use the new Cloudant query'),
 ('161.0', 'Use the Machine Learning Library in Spark')]

In [161]:
article_title = df_content.loc[20]["doc_full_name"]
    # Choose the row with the article_title in X
X.loc[article_title]
    
X

Unnamed: 0,Detect Malfunctioning IoT Sensors with Streaming Analytics,Communicating data science: A guide to presenting your work,"This Week in Data Science (April 18, 2017)",DataLayer Conference: Boost the performance of your distributed database,Analyze NY Restaurant data using Spark in DSX,Browsing PostgreSQL Data with Compose,Upgrading your PostgreSQL to 9.5,Data Wrangling at Slack,Data Science Bowl 2017,Using Apache Spark to predict attack vectors among billions of users and trillions of events,...,Mapping All the Things with Python – IBM Watson Data Lab – Medium,Use IBM Data Science Experience to Read and Write Data Stored on Amazon S3,Use IoT data in Streams Designer for billing and alerts,Mapping Points with Folium,A Speed Guide To Redis Lua Scripting,A look under the covers of PouchDB-find,A comparison of logistic regression and naive Bayes,What I Learned Implementing a Classifier from Scratch in Python · Jean-Nicholas Hould,Use dashDB with Spark,"Jupyter Notebooks with Scala, Python, or R Kernels"
Detect Malfunctioning IoT Sensors with Streaming Analytics,1.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.233189,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.00000
Communicating data science: A guide to presenting your work,0.0,1.000000,0.148876,0.0,0.045340,0.087390,0.0,0.052546,0.282969,0.000000,...,0.040063,0.141384,0.043087,0.0,0.268218,0.0,0.0,0.00000,0.000000,0.00000
"This Week in Data Science (April 18, 2017)",0.0,0.148876,1.000000,0.0,0.035614,0.068645,0.0,0.041275,0.526122,0.000000,...,0.031470,0.111058,0.033845,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.00000
DataLayer Conference: Boost the performance of your distributed database,0.0,0.000000,0.000000,1.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.00000
Analyze NY Restaurant data using Spark in DSX,0.0,0.045340,0.035614,0.0,1.000000,0.072029,0.0,0.043310,0.067692,0.170562,...,0.033021,0.052428,0.035513,0.0,0.000000,0.0,0.0,0.00000,0.182155,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A look under the covers of PouchDB-find,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,1.0,0.0,0.00000,0.000000,0.00000
A comparison of logistic regression and naive Bayes,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,1.0,0.00000,0.000000,0.00000
What I Learned Implementing a Classifier from Scratch in Python · Jean-Nicholas Hould,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.141363,0.000000,0.000000,0.0,0.000000,0.0,0.0,1.00000,0.000000,0.16529
Use dashDB with Spark,0.0,0.000000,0.000000,0.0,0.182155,0.000000,0.0,0.000000,0.000000,0.142585,...,0.000000,0.169739,0.229955,0.0,0.000000,0.0,0.0,0.00000,1.000000,0.00000


In [240]:
article_title = df_content.loc[[20.0, 40]]["doc_full_name"]
article_title

20    Working interactively with RStudio and noteboo...
40    Ensemble Learning to Improve Machine Learning ...
Name: doc_full_name, dtype: object