In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import project_tests as t
import pickle

%matplotlib inline

# Load the user-article interaction log and the article catalogue, discarding
# the 'Unnamed: 0' column that each CSV carries (presumably a saved row index).
df = pd.read_csv('data/user-item-interactions.csv').drop(columns='Unnamed: 0')
df_content = pd.read_csv('data/articles_community.csv').drop(columns='Unnamed: 0')

# Preview the first interaction rows
df.head()

Unnamed: 0,article_id,title,email
0,1430.0,"using pixiedust for fast, flexible, and easier...",ef5f11f77ba020cd36e1105a00ab868bbdbf7fe7
1,1314.0,healthcare python streaming application demo,083cbdfa93c8444beaa4c5f5e0f5f9198e4f9e0b
2,1429.0,use deep learning for image classification,b96a4f2e92d8572034b1e9b28f9ac673765cd074
3,1338.0,ml optimization using cognitive assistant,06485706b34a5c9bf2a0ecdac41daf7e7654ceb7
4,1276.0,deploy your python model as a restful api,f01220c46fc92c6e6b161b1849de11faacd7ccb2


In [None]:
# Size of the interaction table as (rows, columns)
df.shape

(45993, 3)

In [None]:
# Column dtypes and non-null counts of the interaction table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45993 entries, 0 to 45992
Data columns (total 3 columns):
article_id    45993 non-null float64
title         45993 non-null object
email         45976 non-null object
dtypes: float64(1), object(2)
memory usage: 1.1+ MB


In [None]:
# Show df_content to get an idea of the data
df_content.head()

Unnamed: 0,doc_body,doc_description,doc_full_name,doc_status,article_id
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,Detect Malfunctioning IoT Sensors with Streami...,Live,0
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",Communicating data science: A guide to present...,Live,1
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,"This Week in Data Science (April 18, 2017)",Live,2
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,DataLayer Conference: Boost the performance of...,Live,3
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,Analyze NY Restaurant data using Spark in DSX,Live,4


In [None]:
# Column dtypes and non-null counts of the article catalogue
df_content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056 entries, 0 to 1055
Data columns (total 5 columns):
doc_body           1042 non-null object
doc_description    1053 non-null object
doc_full_name      1056 non-null object
doc_status         1056 non-null object
article_id         1056 non-null int64
dtypes: int64(1), object(4)
memory usage: 41.3+ KB


### <a class="anchor" id="Exploratory-Data-Analysis">Part I : Exploratory Data Analysis</a>

In [None]:
# Count the interaction rows recorded for each email, then summarise the
# distribution of those per-user counts
interactions = (
    df.groupby('email')['article_id']
      .count()
      .rename('interactions count')
      .reset_index()
)
interactions.describe()

Unnamed: 0,interactions count
count,5148.0
mean,8.930847
std,16.802267
min,1.0
25%,1.0
50%,3.0
75%,9.0
max,364.0


In [None]:
# Fill in the median and maximum number of user-article interactions below.
# Both values are derived from the data rather than typed in by hand, so they
# stay correct if the interaction file changes.
interactions_per_user = df.groupby('email')['article_id'].count()

median_val = interactions_per_user.median() # 50% of individuals interact with this many articles or fewer.
max_views_by_user = interactions_per_user.max() # The maximum number of user-article interactions by any 1 user.

`2.` Explore and remove duplicate articles from the **df_content** dataframe.  

In [None]:
# Find and explore duplicate articles

df_content.duplicated(subset='article_id').sum()

5

In [None]:
# Keep only the first row seen for each article_id, then confirm that no
# duplicated ids remain (the count displayed below should be 0)

df_content = df_content.drop_duplicates(subset='article_id', keep='first')
df_content.duplicated(subset='article_id').sum()

0


Use the cells below to find:

**a.** The number of unique articles that have an interaction with a user.  
**b.** The number of unique articles in the dataset (whether they have any interactions or not).<br>
**c.** The number of unique users in the dataset. (excluding null values) <br>
**d.** The number of user-article interactions in the dataset.

In [None]:
# For every article that appears in the interaction table, count the distinct
# emails that interacted with it, and show the five articles with the most
# distinct users

u_interactions = df.groupby(['article_id'])['email'].nunique().reset_index()
u_interactions.rename(columns={'email':'unique users'}, inplace=True)
u_interactions.sort_values(by='unique users', ascending=False).head()

Unnamed: 0,article_id,unique users
625,1330.0,467
699,1429.0,397
652,1364.0,388
614,1314.0,345
671,1398.0,329


In [None]:
# The number of unique articles that have at least one interaction with a user
# (distinct article_id values in the interaction table df)

unique_articles = df['article_id'].nunique()
unique_articles

714

In [None]:
# The number of unique users in the dataset. (excluding null values)

unique_users = df['email'].nunique()
unique_users

5148

In [None]:
# The number of user-article interactions in the dataset

tot_inter = df.shape[0]
tot_inter

45993

In [None]:
# The number of unique articles on the IBM platform

total_articles = df_content['article_id'].nunique()
total_articles

1051


Use the cells below to find the most viewed **article_id**, as well as how often it was viewed.  After talking to the company leaders, the `email_mapper` function was deemed a reasonable way to map users to ids.  There were a small number of null values, and it was found that all of these null values likely belonged to a single user (which is how they are stored using the function below).

In [None]:
# Interaction rows (with a non-null email) per article, most viewed first
views_per_article = df.groupby('article_id')['email'].count().sort_values(ascending=False)

# The most viewed article in the dataset as a string with one value following the decimal
most_viewed_article_id = str(views_per_article.index[0])

# The most viewed article in the dataset was viewed how many times?
max_views = views_per_article.iloc[0]

In [None]:
## No need to change the code here - this will be helpful for later parts of the notebook
# Run this cell to map the user email to a user_id column and remove the email column

def email_mapper():
    '''
    Encode every value of df['email'] as an integer id.

    Ids start at 1 and are handed out in order of first appearance, so a value
    that occurs again receives the id it was given the first time.

    OUTPUT:
    email_encoded - (list) one integer id per row of df, in row order
    '''
    ids_by_email = {}
    email_encoded = [
        # setdefault stores the next free id only when the email is new
        ids_by_email.setdefault(email, len(ids_by_email) + 1)
        for email in df['email']
    ]
    return email_encoded

# Swap the email column for the integer user ids built by email_mapper().
# NOTE: this cell is not re-runnable on its own. Once 'email' has been deleted,
# email_mapper() can no longer read df['email']; reload df before running it again.
email_encoded = email_mapper()
del df['email']
df['user_id'] = email_encoded

# show header
df.head()

Unnamed: 0,article_id,title,user_id
0,1430.0,"using pixiedust for fast, flexible, and easier...",1
1,1314.0,healthcare python streaming application demo,2
2,1429.0,use deep learning for image classification,3
3,1338.0,ml optimization using cognitive assistant,4
4,1276.0,deploy your python model as a restful api,5


In [None]:
# Number of distinct user ids produced by email_mapper. This is one more than
# the count of distinct non-null emails because the null emails are mapped to
# an id of their own.
df.user_id.nunique()

5149

In [None]:
## If you stored all your results in the variable names above, 
## you shouldn't need to change anything in this cell

sol_1_dict = {
    '`50% of individuals have _____ or fewer interactions.`': median_val,
    '`The total number of user-article interactions in the dataset is ______.`': tot_inter, #
    '`The maximum number of user-article interactions by any 1 user is ______.`': max_views_by_user,
    '`The most viewed article in the dataset was viewed _____ times.`': max_views, #
    '`The article_id of the most viewed article is ______.`': most_viewed_article_id, #
    '`The number of unique articles that have at least 1 rating ______.`': unique_articles,
    '`The number of unique users in the dataset is ______`': unique_users,
    '`The number of unique articles on the IBM platform`': total_articles
}

# Test your dictionary against the solution
t.sol_1_test(sol_1_dict)

It looks like you have everything right here! Nice job!


### <a class="anchor" id="Rank">Part II: Rank-Based Recommendations</a>



In [None]:
def get_top_articles(n, df=df):
    '''
    Rank article titles by the number of interaction rows they have in df.

    INPUT:
    n - (int) the number of top articles to return
    df - (pandas dataframe) df as defined at the top of the notebook

    OUTPUT:
    top_articles - (list) the 'n' article titles with the most interactions,
                   most popular first (titles come from df, not df_content)
    '''
    interactions_per_title = df.groupby('title')['user_id'].count()
    ranked_titles = interactions_per_title.sort_values(ascending=False).index
    top_articles = ranked_titles[:n].tolist()

    return top_articles

def get_top_article_ids(n, threshold=10, df=df):
    '''
    Rank article ids by the number of interaction rows they have in df.

    INPUT:
    n - (int) the number of top articles to return
    threshold - (int) the minimum number of interactions an article needs
                in order to be considered
    df - (pandas dataframe) df as defined at the top of the notebook

    OUTPUT:
    top_ids - (list) the ids, as strings such as '1429.0', of at most 'n'
              articles that reach the threshold, most interactions first
    '''
    interaction_counts = (
        df.groupby('article_id')['user_id'].count().sort_values(ascending=False)
    )

    # Honour the caller's threshold and n instead of a fixed 10 for both
    eligible = interaction_counts[interaction_counts >= threshold]
    top_ids = [str(article_id) for article_id in eligible.index[:n].tolist()]

    return top_ids # Return the top article ids

In [None]:
print(get_top_articles(10))
print(get_top_article_ids(10))

['use deep learning for image classification', 'insights from new york car accident reports', 'visualize car data with brunel', 'use xgboost, scikit-learn & ibm watson machine learning apis', 'predicting churn with the spss random tree algorithm', 'healthcare python streaming application demo', 'finding optimal locations of new store using decision optimization', 'apache spark lab, part 1: basic concepts', 'analyze energy consumption in buildings', 'gosales transactions for logistic regression model']
['1429.0', '1330.0', '1431.0', '1427.0', '1364.0', '1314.0', '1293.0', '1170.0', '1162.0', '1304.0']


In [None]:
# Test your function by returning the top 5, 10, and 20 articles
top_5 = get_top_articles(5)
top_10 = get_top_articles(10)
top_20 = get_top_articles(20)

# Test each of your three lists from above
t.sol_2_test(get_top_articles)

Your top_5 looks like the solution list! Nice job.
Your top_10 looks like the solution list! Nice job.
Your top_20 looks like the solution list! Nice job.


### <a class="anchor" id="User-User">Part III: User-User Based Collaborative Filtering</a>



In [None]:
# create the user-article matrix with 1's and 0's

def create_user_item_matrix(df):
    '''
    INPUT:
    df - pandas dataframe with article_id, title, user_id columns

    OUTPUT:
    user_item - (pandas dataframe) user item matrix of integers, indexed by
                user_id with one column per article_id

    Description:
    Return a matrix with user ids as rows and article ids on the columns with 1 values where a user interacted with
    an article and a 0 otherwise. Repeated interactions of a user with the same article still give a 1.
    '''
    # Interaction rows per (user, article) pair; pairs that never occur are
    # filled with 0 when the article level is pivoted into columns
    interaction_counts = df.groupby(['user_id', 'article_id']).size()
    user_item = interaction_counts.unstack(fill_value=0)

    # Collapse repeated interactions to a single 1
    user_item = user_item.clip(upper=1).astype(int)

    return user_item # return the user_item matrix

# Build the user-article matrix for the full interaction table
user_item = create_user_item_matrix(df)

In [None]:
## Tests: You should just need to run this cell.  Don't change the code.
assert user_item.shape[0] == 5149, "Oops!  The number of users in the user-article matrix doesn't look right."
assert user_item.shape[1] == 714, "Oops!  The number of articles in the user-article matrix doesn't look right."
assert user_item.sum(axis=1)[1] == 36, "Oops!  The number of articles seen by user 1 doesn't look right."
print("You have passed our quick tests!  Please proceed!")

You have passed our quick tests!  Please proceed!


In [None]:
# Preview the first rows of the user-article matrix
user_item.head()

article_id,0.0,2.0,4.0,8.0,9.0,12.0,14.0,15.0,16.0,18.0,...,1434.0,1435.0,1436.0,1437.0,1439.0,1440.0,1441.0,1442.0,1443.0,1444.0
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def find_similar_users(user_id, user_item=user_item):
    '''
    INPUT:
    user_id - (int) a user_id that is present in the index of user_item
    user_item - (pandas dataframe) matrix of users by articles:
                1's when a user has interacted with an article, 0 otherwise

    OUTPUT:
    similar_users - (list) an ordered list of the other user ids where the
                    closest users (largest dot product users) are listed first

    Description:
    Computes the similarity of the given user to every user as the dot product
    of their rows in user_item, and returns the other user ids ordered from
    most to least similar. Users with equal similarity keep the order in which
    they appear in user_item. The given user is not part of the result.

    Raises KeyError when user_id is not in the index of user_item.
    '''
    # Look the user up by label, so the result does not depend on the ids
    # being exactly 1..N in row order
    similarity = user_item.dot(user_item.loc[user_id])

    # mergesort is stable, which makes the order of ties deterministic
    ranked = similarity.sort_values(ascending=False, kind='mergesort')

    return [other for other in ranked.index.tolist() if other != user_id]

In [None]:
# Do a spot check of your function
print("The 10 most similar users to user 1 are: {}".format(find_similar_users(1)[:10]))
print("The 5 most similar users to user 3933 are: {}".format(find_similar_users(3933)[:5]))
print("The 3 most similar users to user 46 are: {}".format(find_similar_users(46)[:3]))

In [None]:
def get_article_names(article_ids, df=df):
    '''
    INPUT:
    article_ids - (iterable) article ids, given either as numbers or as
                  strings such as '1024.0'
    df - (pandas dataframe) df as defined at the top of the notebook

    OUTPUT:
    article_names - (list) a list of article names associated with the list of article ids
                    (this is identified by the title column); names are ordered by
                    article id, not by the order of article_ids

    Raises ValueError when an id cannot be converted to a number.
    '''
    # article_id is numeric in df, so convert the requested ids explicitly
    # rather than depending on isin() to coerce strings
    wanted = [float(article_id) for article_id in article_ids]

    # One row per distinct (article_id, title) pair
    art_titles = df.groupby(['article_id','title']).size().reset_index()[['article_id','title']]

    matches = art_titles['article_id'].astype(float).isin(wanted)
    article_names = art_titles.loc[matches, 'title'].tolist()

    return article_names # Return the article names associated with list of article ids


def get_user_articles(user_id, df=df, user_item=user_item):
    '''
    INPUT:
    user_id - (int) a user id that is present in the index of user_item
    df - (pandas dataframe) df as defined at the top of the notebook
    user_item - (pandas dataframe) matrix of users by articles:
                1's when a user has interacted with an article, 0 otherwise

    OUTPUT:
    article_ids - (list) the article ids seen by the user, as strings, ordered
                  from the most to the least total interactions in df
    article_names - (list) a list of article names associated with the list of article ids
                    (this is identified by the title column of df); the names follow
                    the order used by get_article_names, not the order of article_ids

    Description:
    Provides a list of the article_ids and article titles that have been seen by a user

    Raises KeyError when user_id is not in the index of user_item.
    '''
    # Look the user up by label, so the result does not depend on the ids
    # being exactly 1..N in row order
    user_row = user_item.loc[user_id]
    seen_ids = user_row[user_row == 1].index

    # Total interaction rows per article, restricted to what this user has seen
    interaction_counts = df.groupby('article_id')['user_id'].count()
    seen_counts = interaction_counts[interaction_counts.index.isin(seen_ids)]
    ranked = seen_counts.sort_values(ascending=False, kind='mergesort')

    numeric_ids = ranked.index.tolist()
    article_ids = [str(article_id) for article_id in numeric_ids]
    # Resolve titles against the same df the ids were ranked with
    article_names = get_article_names(numeric_ids, df=df)

    return article_ids, article_names # return the ids and names


def user_user_recs(user_id, m=10):
    '''
    INPUT:
    user_id - (int) a user id
    m - (int) the number of recommendations you want for the user

    OUTPUT:
    recs - (list) at most m article ids recommended for the user, in the order
           in which they were found

    Description:
    Loops through the users based on closeness to the input user_id
    For each user - finds articles the user hasn't seen before and provides them as recs
    Does this until m recommendations are found

    Notes:
    Users who are the same closeness are chosen arbitrarily as the 'next' user

    For the user where the number of recommended articles starts below m
    and ends exceeding m, the last items are chosen arbitrarily

    '''
    # Build a real set of the seen ids: `x in series` would test the index
    # labels of a pandas Series, not its values
    excluded = set(get_user_articles(user_id)[0])
    recs = []

    for neighbor in find_similar_users(user_id):
        if len(recs) >= m:
            break
        for article_id in get_user_articles(neighbor)[0]:
            if len(recs) >= m:
                break
            if article_id not in excluded:
                # remember the id so that it is recommended only once
                excluded.add(article_id)
                recs.append(article_id)

    return recs # return your recommendations for this user_id

In [None]:
# Check Results
get_article_names(user_user_recs(1, 10)) # Return 10 recommendations for user 1

In [None]:
# Test your functions here - No need to change this code - just run this cell
assert set(get_article_names(['1024.0', '1176.0', '1305.0', '1314.0', '1422.0', '1427.0'])) == set(['using deep learning to reconstruct high-resolution audio', 'build a python app on the streaming analytics service', 'gosales transactions for naive bayes model', 'healthcare python streaming application demo', 'use r dataframes & ibm watson natural language understanding', 'use xgboost, scikit-learn & ibm watson machine learning apis']), "Oops! Your the get_article_names function doesn't work quite how we expect."
assert set(get_article_names(['1320.0', '232.0', '844.0'])) == set(['housing (2015): united states demographic measures','self-service data preparation with ibm data refinery','use the cloudant-spark connector in python notebook']), "Oops! Your the get_article_names function doesn't work quite how we expect."
assert set(get_user_articles(20)[0]) == set(['1320.0', '232.0', '844.0'])
assert set(get_user_articles(20)[1]) == set(['housing (2015): united states demographic measures', 'self-service data preparation with ibm data refinery','use the cloudant-spark connector in python notebook'])
assert set(get_user_articles(2)[0]) == set(['1024.0', '1176.0', '1305.0', '1314.0', '1422.0', '1427.0'])
assert set(get_user_articles(2)[1]) == set(['using deep learning to reconstruct high-resolution audio', 'build a python app on the streaming analytics service', 'gosales transactions for naive bayes model', 'healthcare python streaming application demo', 'use r dataframes & ibm watson natural language understanding', 'use xgboost, scikit-learn & ibm watson machine learning apis'])
print("If this is all you see, you passed all of our tests!  Nice job!")

In [None]:
def get_top_sorted_users(user_id, user_item=user_item):
    '''
    INPUT:
    user_id - (int) a user id that is present in the index of user_item
    user_item - (pandas dataframe) matrix of users by articles:
            1's when a user has interacted with an article, 0 otherwise


    OUTPUT:
    neighbors_df - (pandas dataframe) one row per other user, with the columns:
                    user_id - a neighbor user id
                    similarity grade - dot product of the neighbor's row with the
                                       row of the provided user_id
                    total articles - the number of interaction rows the neighbor
                                     has in the notebook-level df

    Other Details - sort the neighbors_df by the similarity and then by number of interactions where
                    highest of each is higher in the dataframe

    Raises KeyError when user_id is not in the index of user_item.
    '''
    # compute similarity of each user to the provided user; the user is looked
    # up by label, so the ids need not be exactly 1..N in row order
    similarity = user_item.dot(user_item.loc[user_id])
    dp = pd.DataFrame({
        'user_id': user_item.index,
        'similarity grade': similarity.values,
    })

    # Number of interaction rows per user, taken from the global df
    user_articles = df.groupby(['user_id'])['article_id'].count().reset_index()
    user_articles.rename(columns={'article_id':'total articles'}, inplace=True)

    similar_users = pd.merge(dp,user_articles, on='user_id')

    similar_users = similar_users.sort_values(by=['similarity grade','total articles'], ascending=False)

    # The user is not a neighbor of themselves
    neighbors_df = similar_users[similar_users['user_id']!=user_id]

    return neighbors_df # Return the dataframe specified in the doc_string


def user_user_recs_part2(user_id, m=10):
    '''
    INPUT:
    user_id - (int) a user id
    m - (int) the number of recommendations you want for the user

    OUTPUT:
    recs - (list) at most m recommendations for the user by article id, in the
           order in which they were found
    rec_names - (list) a list of recommendations for the user by article title,
                in the order returned by get_article_names

    Description:
    Loops through the users based on closeness to the input user_id
    For each user - finds articles the user hasn't seen before and provides them as recs
    Does this until m recommendations are found

    Notes:
    * Choose the users that have the most total article interactions
    before choosing those with fewer article interactions.

    * Choose articles with the articles with the most total interactions
    before choosing those with fewer total interactions.

    '''
    # Build a real set of the seen ids: `x in series` would test the index
    # labels of a pandas Series, not its values
    excluded = set(get_user_articles(user_id)[0])
    recs = []

    # get_top_sorted_users orders neighbors by similarity and then by their
    # number of interactions, which is the neighbor order required above
    for neighbor in get_top_sorted_users(user_id)['user_id']:
        if len(recs) >= m:
            break
        # the neighbor's articles are walked in the order get_user_articles returns them
        for article_id in get_user_articles(neighbor)[0]:
            if len(recs) >= m:
                break
            if article_id not in excluded:
                # remember the id so that it is recommended only once
                excluded.add(article_id)
                recs.append(article_id)

    rec_names = get_article_names(recs)

    return recs, rec_names

In [None]:
# Quick spot check - don't change this code - just use it to test your functions
rec_ids, rec_names = user_user_recs_part2(20, 10)
print("The top 10 recommendations for user 20 are the following article ids:")
print(rec_ids)
print()
print("The top 10 recommendations for user 20 are the following article names:")
print(rec_names)

In [None]:
### Tests with a dictionary of results

user1_most_sim = get_top_sorted_users(1).iloc[0]['user_id']# Find the user that is most similar to user 1 
user131_10th_sim = get_top_sorted_users(131).iloc[9]['user_id']# Find the 10th most similar user to user 131

In [None]:
## Dictionary Test Here
sol_5_dict = {
    'The user that is most similar to user 1.': user1_most_sim, 
    'The user that is the 10th most similar to user 131': user131_10th_sim,
}

t.sol_5_test(sol_5_dict)

In [None]:
new_user = '0.0'

# What would your recommendations be for this new user '0.0'?  As a new user, they have no observed articles.
# Provide a list of the top 10 article ids you would give to this new user.
# There is no interaction history to compare against, so the overall top articles are used.
new_user_recs = get_top_article_ids(10)

In [None]:
assert set(new_user_recs) == set(['1314.0','1429.0','1293.0','1427.0','1162.0','1364.0','1304.0','1170.0','1431.0','1330.0']), "Oops!  It makes sense that in this case we would want to recommend the most popular articles, because we don't know anything about these users."

print("That's right!  Nice job!")

### <a class="anchor" id="Matrix-Fact">Part IV: Matrix Factorization</a>

 

In [None]:
# Load the matrix here
# NOTE(review): read_pickle unpickles the file, and unpickling can execute
# arbitrary code. Only load 'user_item_matrix.p' from a trusted source.
user_item_matrix = pd.read_pickle('user_item_matrix.p')

In [None]:
# quick look at the matrix
user_item_matrix.head()

In [None]:
# Perform SVD on the User-Item Matrix Here
# s holds the singular values as a 1-D array (np.diag turns a slice of it into
# a matrix further down); full_matrices is left at its default of True

u, s, vt = np.linalg.svd(user_item_matrix)

In [None]:
# Sweep the number of latent features kept from the SVD: 10, 30, ..., 690
num_latent_feats = np.arange(10,700+10,20)
sum_errs = []  # total absolute reconstruction error, one entry per k

for k in num_latent_feats:
    # restructure with k latent features
    s_new, u_new, vt_new = np.diag(s[:k]), u[:, :k], vt[:k, :]
    
    # take dot product
    user_item_est = np.around(np.dot(np.dot(u_new, s_new), vt_new))
    
    # compute error for each prediction to actual value
    diffs = np.subtract(user_item_matrix, user_item_est)
    
    # total errors and keep track of them
    err = np.sum(np.sum(np.abs(diffs)))
    sum_errs.append(err)
    

print(sum_errs)
print(num_latent_feats)
# NOTE(review): the error is divided by df.shape[0] (the number of interaction
# rows), not by the number of cells in user_item_matrix - confirm that this is
# the intended definition of accuracy.
print(1 - np.array(sum_errs)/df.shape[0])

plt.plot(num_latent_feats, 1 - np.array(sum_errs)/df.shape[0]);
plt.xlabel('Number of Latent Features');
plt.ylabel('Accuracy');
plt.title('Accuracy vs. Number of Latent Features');

In [None]:
# The first TRAIN_ROWS interaction rows train the model and every remaining row
# is held out for testing, so the two parts always cover df exactly once
TRAIN_ROWS = 40000
df_train = df.head(TRAIN_ROWS)
df_test = df.iloc[TRAIN_ROWS:]

def create_test_and_train_user_item(df_train, df_test):
    '''
    Build one user-item matrix per split and list the test users and articles.

    INPUT:
    df_train - training dataframe
    df_test - test dataframe

    OUTPUT:
    user_item_train - a user-item matrix of the training dataframe
                      (unique users for each row and unique articles for each column)
    user_item_test - a user-item matrix of the testing dataframe
                    (unique users for each row and unique articles for each column)
    test_idx - (list) all of the test user ids
    test_arts - (pandas index) all of the test article ids

    '''
    user_item_train, user_item_test = (
        create_user_item_matrix(split) for split in (df_train, df_test)
    )

    # The rows of the test matrix are labelled by user_id, the columns by article_id
    test_idx = user_item_test.index.tolist()
    test_arts = user_item_test.columns

    return user_item_train, user_item_test, test_idx, test_arts

user_item_train, user_item_test, test_idx, test_arts = create_test_and_train_user_item(df_train, df_test)

In [None]:
# How many users can we make predictions for in the test set?

user_id_train = user_item_train.reset_index().user_id
user_id_test = user_item_test.reset_index().user_id

user_id_intersect = np.intersect1d(user_id_train,user_id_test)
user_id_intersect.shape[0]

In [None]:
# How many users in the test set are we not able to make predictions for because of the cold start problem?

user_item_test.shape[0] - user_id_intersect.shape[0]

In [None]:
# How many articles can we make predictions for in the test set?
# (articles that occur in both the training and the test matrix)

art_id_train = user_item_train.columns
art_id_test = user_item_test.columns

test_article_lookup = set(art_id_test)
art_id_intersect = [art for art in art_id_train if art in test_article_lookup]
len(art_id_intersect)

In [None]:
# How many articles in the test set are we not able to make predictions for because of the cold start problem?

len(art_id_test) - len(art_id_intersect)

In [None]:
# Replace the values in the dictionary below
a = 662  # test users that cannot be predicted (cold start)
b = 574  # test articles that can be predicted
c = 20  # test users that can be predicted
d = 0  # test articles that cannot be predicted (cold start)


sol_4_dict = {
    'How many users can we make predictions for in the test set?': c, 
    'How many users in the test set are we not able to make predictions for because of the cold start problem?': a, 
    'How many articles can we make predictions for in the test set?': b,
    'How many articles in the test set are we not able to make predictions for because of the cold start problem?': d
}

t.sol_4_test(sol_4_dict)

In [None]:
# Removing the articles we can't make predictions for: keep only the training
# columns whose article also occurs in the test matrix.
# Index.intersection is used because `&` between two Index objects is no longer
# a set intersection in current pandas.
inter_cols = user_item_train.columns.intersection(user_item_test.columns)
user_item_train = user_item_train[inter_cols]
user_item_train.shape

In [None]:
# Selecting the subset of rows that I can predict
user_item_train_inter = user_item_train.query('user_id in @user_id_intersect')
user_item_test_inter = user_item_test.query('user_id in @user_id_intersect')

In [None]:
u_train, s_train, vt_train = np.linalg.svd(user_item_train)

In [None]:
# Sweep the number of latent features and score each reconstruction on the
# users that occur in both the training and the test set
num_latent_feats = np.arange(10,user_item_train.shape[1],20)
sum_errs = []

# Row positions of the shared users inside user_item_train (and therefore
# inside u_train), found by label rather than by assuming ids are 1..N
shared_rows = user_item_train.index.get_indexer(user_id_intersect)

for k in num_latent_feats:
    # restructure with k latent features
    s_new, u_new, vt_new = np.diag(s_train[:k]), u_train[:, :k], vt_train[:k, :]

    # take dot product, for the shared users only
    user_item_est = np.around(np.dot(np.dot(u_new[shared_rows], s_new), vt_new))

    # compute error for each prediction to actual value
    diffs = np.subtract(user_item_test_inter, user_item_est)

    # total errors and keep track of them
    err = np.sum(np.sum(np.abs(diffs)))
    sum_errs.append(err)

# Normalise by the number of cells that were actually predicted, not by the
# number of rows in the interaction table
test_accuracy = 1 - np.array(sum_errs)/user_item_test_inter.size

print(sum_errs)
print(num_latent_feats)
print(test_accuracy)

plt.plot(num_latent_feats, test_accuracy);
plt.xlabel('Number of Latent Features');
plt.ylabel('Accuracy');
plt.title('Accuracy vs. Number of Latent Features');

In [None]:
print('u:{}'.format(u.shape))
print('s:{}'.format(s.shape))
print('vt:{}\n'.format(vt.shape))
print('u_new:{}'.format(u_new.shape))
print('s_new:{}'.format(s_new.shape))
print('vt_new:{}\n'.format(vt_new.shape))