Importing libraries and loading data

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr

! jupyter nbextension enable --py widgetsnbextension
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [30]:
users = pd.read_csv("data/users.csv")
posts = pd.read_csv("data/posts.csv")
views = pd.read_csv("data/views.csv")

users.rename({'_id': 'user_id'}, axis=1, inplace=True)
posts.rename({'_id': 'post_id'}, axis=1, inplace=True)

posts1 = posts.set_index('post_id') #for finding title from post_id
posts2 = posts.set_index('title') #for finding post_id from title

In [31]:
users.head()

Unnamed: 0,user_id,name,gender,academics
0,5d60098a653a331687083238,Nivesh Singh Chauhan,male,undergraduate
1,5d610ae1653a331687083239,Gaurav Sharma,male,graduate
2,5d618359fc5fcf3bdd9a0910,Akshay Mishra,male,undergraduate
3,5d6d2bb87fa40e1417a49315,Saksham Mathur,male,undergraduate
4,5d7c994d5720533e15c3b1e9,Varun Chowhan,male,undergraduate


In [32]:
posts.head()

Unnamed: 0,post_id,title,category,post_type
0,5d62abaa65218653a132c956,hello there,Plant Biotechnology,blog
1,5d6d39567fa40e1417a4931c,Ml and AI,Artificial Intelligence|Machine Learning|Infor...,blog
2,5d7d23315720533e15c3b1ee,What is an Operating System ?,Operating Systems,blog
3,5d7d405e5720533e15c3b1f3,Lord Shiva,Drawings,artwork
4,5d80dfbc6c53455f896e600e,How Competition law evolved?,Competition Laws,blog


In [33]:
views.head()

Unnamed: 0,user_id,post_id,timestamp
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,2020-06-01T10:46:45.131Z
1,5ed3748576027d35905ccaab,5ed4cbadbd514d602c1531a6,2020-06-01T09:39:20.021Z
2,5ed0defa76027d35905cc2de,5eac305f10426255a7aa9dd3,2020-06-01T08:12:42.682Z
3,5ed0defa76027d35905cc2de,5ed1ff0276027d35905cc60d,2020-06-01T08:10:23.880Z
4,5ed0defa76027d35905cc2de,5ed3820f76027d35905ccac8,2020-06-01T08:08:54.124Z


obtaining list of all categories and post types - used as feature for `content-based filtering`

along with these features, the number of users who viewed a post is also taken into consideration under the feature `rating`, since users have not rated the posts they viewed, each view on a post adds a value of 1 to its rating - to achieve `colloborative-based filtering`

In [34]:
categories = list(posts['category'].dropna())
posts_list = list(posts[' post_type'].dropna())

all_categories = set()
for cat in categories:
    cur = set(cat.split('|'))
    all_categories.update(cur)
    
all_posts = set()
for cat in posts_list:
    cur = set(cat.split('|'))
    all_posts.update(cur)
    
len(all_categories), len(all_posts)

(234, 4)

In [35]:
view_post = views.merge(posts, on='post_id')
view_post.head()

Unnamed: 0,user_id,post_id,timestamp,title,category,post_type
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,2020-06-01T10:46:45.131Z,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
1,5ec3ba5374f7660d73aa1201,5ec821ddec493f4a2655889e,2020-05-24T10:49:55.177Z,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
2,5ec2204374f7660d73aa100f,5ec821ddec493f4a2655889e,2020-05-24T09:23:57.537Z,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
3,5d7c994d5720533e15c3b1e9,5ec821ddec493f4a2655889e,2020-05-22T20:11:32.317Z,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
4,5de50d768eab6401affbb135,5ec821ddec493f4a2655889e,2020-05-22T20:10:41.100Z,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork


Generate user history and preference dataframe

In [36]:
user_history = view_post.copy()
user_history.drop(['category', ' post_type'], axis=1, inplace=True)
user_history['user_id_'] = user_history['user_id'].copy()

# Create separate features for each of the category and post_type
for cat in all_categories:
    user_history[cat] = 0  
for post_type in all_posts:
    user_history[post_type] = 0

# objective is to identify user's preferences in terms of the tags and post_types
for i in range(len(view_post)):
    categories = view_post.loc[i,'category']
    if type(categories) != float:
        for category in categories.split('|'):
            user_history.at[i, category] = 1
            
for i in range(len(view_post)):
    post_type = view_post.loc[i, 'category']
    if post_type in all_posts:
        user_history.at[i, post_type] = 1

In [37]:
user_history.drop(['post_id', 'timestamp', 'title'], axis=1, inplace=True)
user_history = user_history.groupby(['user_id'], as_index=False).sum()
user_history.head()

Unnamed: 0,user_id,Plant Biotechnology,Hardware,Learning,Sculptures,Rights and Duties,Electrical Machines,Watercolours,Financial Accounting,E Transactions,...,Pen and ink,Database Management,Drawing,Craft work,Robotics,Professionalism,project,skill,artwork,blog
0,5d60098a653a331687083238,2,0,1,2,0,0,9,1,0,...,4,1,0,1,1,1,0,0,0,0
1,5d610ae1653a331687083239,1,0,0,3,0,0,4,0,0,...,1,0,2,0,1,0,0,0,0,0
2,5d618359fc5fcf3bdd9a0910,0,0,2,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5d6d2bb87fa40e1417a49315,0,0,1,0,0,0,6,0,0,...,2,0,0,0,0,0,0,0,0,0
4,5d7c994d5720533e15c3b1e9,0,0,2,0,0,0,5,0,0,...,4,0,1,0,0,1,0,0,0,0


Generate dataframe of posts along with their categories similar to what was done with the users dataframe

In [38]:
post_history = posts.copy() 
post_history.drop(['title', 'category', ' post_type'], axis=1, inplace=True) 

for cat in all_categories:
    post_history[cat] = 0
    
for post_type in all_posts:
    post_history[post_type] = 0

# for each post, assign values of 1 to all tags and the post_type it's associated with 
for i in range(len(posts)):
    categories = posts.loc[i,'category']
    if type(categories) != float:
        for category in categories.split('|'):
            post_history.at[i, category] = 1

In [39]:
post_history1 = post_history.copy()
post_history1['ratings'] = 0
for i in range(len(post_history1)):
    post_id = post_history1.loc[i]['post_id']
    post_history1.at[i, 'ratings'] = len(views[views['post_id'] == post_id])

In [40]:
post_history.fillna(0, inplace=True)
post_history.head()

Unnamed: 0,post_id,Plant Biotechnology,Hardware,Learning,Sculptures,Rights and Duties,Electrical Machines,Watercolours,Financial Accounting,E Transactions,...,Pen and ink,Database Management,Drawing,Craft work,Robotics,Professionalism,project,skill,artwork,blog
0,5d62abaa65218653a132c956,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5d6d39567fa40e1417a4931c,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5d7d23315720533e15c3b1ee,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5d7d405e5720533e15c3b1f3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5d80dfbc6c53455f896e600e,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Different correlation measures can be used : 

1. Cosine

2. Eucledian distance

3. Pearson 

4. Spearman

The pearson correlation measure has been used here. Though other measures can also be used by changing the option variable

In [41]:
def cosine(arg1,arg2):
    dot_product = np.dot(arg1,arg2)
    norm_1 = np.linalg.norm(arg1)
    norm_2 = np.linalg.norm(arg2)
    return 'invalid' if norm_1*norm_2 == 0 else dot_product/(norm_1 * norm_2)

def eucledian_distance(arg1,arg2):
    return np.sqrt(np.sum((arg1 - arg2)**2))

def correlation(arg1, arg2, type = 'cosine'):
    if type == 'cosine':
        return cosine(arg1, arg2)
    elif type == 'eucledian':
        return eucledian_distance(arg1, arg2)
    elif type == 'pearson':
        return 'invalid' if np.std(arg1)*np.std(arg2) == 0 else pearsonr(arg1, arg2)[0]
    elif type == 'spearman':
        return 'invalid' if str(spearmanr(arg1, arg2)[0])=='nan' else spearmanr(arg1, arg2)[0]
    
option = 'pearson'
r = True
if option == 'eucledian':
    r = False

Obtaining correlation between an user and a post

In [42]:
user = np.array(user_history.loc[0][1:])
post = np.array(post_history.loc[0][1:])

correlation(user,post, option)

0.0035125510994521263

correlation between two posts

In [43]:
post1 = np.array(post_history1.loc[0][1:])
post2 = np.array(post_history1.loc[30][1:])

correlation(post1,post2, option)

0.6683696288351118

correlation between a single user and all posts. The posts are sorted in the descending order of their correlation

In [44]:
user = np.array(user_history.loc[0][1:])
dic = {}
for i in range(len(posts)):
    post = np.array(post_history.loc[i][1:])
    corr = correlation(user, post, option)
    
    if corr == 'invalid':
        continue
    else:
        post_id = posts.loc[i][0]
        post = posts.loc[posts['post_id'] == post_id]
        index = int(str(post['post_id']).split(' ')[0])    
        title = posts.loc[index]['title']
        dic[title] = corr
        
dic = {k:v for k,v in dic.items() if v != 0}
recommended_posts = sorted(dic.items(), key=lambda kv: kv[1], reverse=r)[1:]

In [45]:
recommended_posts

[('Spirituality', 0.7559009966020975),
 ('Travelling', 0.7559009966020975),
 ('screw town', 0.7559009966020975),
 ('screw2', 0.7559009966020975),
 ('Aesthetic', 0.7559009966020975),
 ('Quarantined', 0.7559009966020975),
 ('Bloom🌸', 0.7559009966020975),
 ('Dog❤️', 0.7559009966020975),
 ('Leap of faith', 0.7559009966020975),
 ('The silence that shouts', 0.7559009966020975),
 ('Night View Ha Penny Bridge in Dublin Ireland', 0.7559009966020975),
 ('freeze', 0.7559009966020975),
 ('Bliss :)', 0.7559009966020975),
 ('Sunset', 0.7559009966020975),
 ('Peaceful', 0.7559009966020975),
 ('TOWER', 0.7559009966020975),
 ('PEACE', 0.7559009966020975),
 ('EYES', 0.7559009966020975),
 ('based on your imagination', 0.7559009966020975),
 ('CANON', 0.7559009966020975),
 ('THOUGHTFUL', 0.7559009966020975),
 ('engrossed', 0.7559009966020975),
 ('Hermit', 0.7559009966020975),
 ('innocence', 0.7559009966020975),
 ('peace', 0.7559009966020975),
 ('trekking', 0.7559009966020975),
 ('portraying goddess', 0.7559

correlation between given post and all other posts sorted in the descending order of their correlation

In [46]:
post = np.array(post_history1.loc[0][1:])
dic = {}
for i in range(len(posts)):
    post1 = np.array(post_history1.loc[i][1:])
    corr = correlation(post,post1, option)
    
    if corr == 'invalid':
        continue
    else:
        post_id = posts.loc[i][0]
        post1 = posts.loc[posts['post_id'] == post_id]
        index = int(str(post1['post_id']).split(' ')[0])    
        title = posts.loc[index]['title']
        dic[title] = corr
        
dic = {k:v for k,v in dic.items() if v != 0}
recommended_posts = sorted(dic.items(), key=lambda kv: kv[1], reverse=r)[1:]

In [47]:
recommended_posts

[('100 Free University Courses Online [2020]', 0.8150298445674802),
 ('Library Managment System: Software Requirement Specification (SRS)',
  0.7056196990333146),
 ('Navigation system using BFS DFS algorithms', 0.7056196990333146),
 ('OS', 0.7056196990333146),
 ('Bill Calculation(PHP)', 0.7056196990333146),
 ('Prime No.(PHP)', 0.7056196990333146),
 ('Factorial of a No.(PHP)', 0.7056196990333146),
 ('Palindrome No.(PHP)', 0.7056196990333146),
 ('Factorial (.net)', 0.7056196990333146),
 ('Leap year checking (.net)', 0.7056196990333146),
 ('Delegates(.net)', 0.7056196990333146),
 ('Even and Odd Numbers (PHP)', 0.7056196990333146),
 ('Armstrong No.(C# .Net)', 0.7056196990333146),
 ('Negative or Positive No. (PHP)', 0.7056196990333146),
 ('Friend Number (C# .Net)', 0.7056196990333146),
 ('Photography', 0.7056196990333146),
 ('Students Networking!', 0.7056196990333146),
 ('Zero-Waste Lifestyle', 0.7056196990333146),
 ('Moore FSM Sequence Detector', 0.7056196990333146),
 ("GAN's Introduction"

Developed user interface to

**1. Recommend posts for the given user**
   
   It is possible to choose the name of the person and the number of posts to be recommended quite easily below
   
    
**2. Recommend similar posts for the given post**
   
   It is possible to choose the name of the post for which you want to recommend similar posts. The similarity between posts as well as the user count is taken into consideration for this prediction

In [48]:
#Recommend posts for given user
@interact
def postForUser(user_ = list(users['name']),
                count = (0,50,1)):
    print(user_)
    user_name = user_
    user_id = users[users['name']==user_]['user_id'].values[0]
    postsToBeRemoved = [posts1.loc[id]['title'] for id in list(views[views['user_id'] == user_id]['post_id']) if id in list(posts['post_id'])]
    user = user_history[users['name'] == user_name]
    user = np.array(user)[0][1:]
    dic = {}
    for i in range(len(posts)):
        post = np.array(post_history.loc[i][1:])
        corr = correlation(user,post, option)

        if corr == 'invalid':
            continue
        else:
            post_id = posts.loc[i][0]
            post = posts.loc[posts['post_id'] == post_id]
            index = int(str(post['post_id']).split(' ')[0])    
            title = posts.loc[index]['title']
            dic[title] = corr

    dic = {k:v for k,v in dic.items() if v != 0}
    # sort the recommended posts on the descending order of their correlation
    recommended_posts = sorted(dic.items(), key=lambda kv: kv[1], reverse=r)[1:]
    
    if (list(iter(zip(*recommended_posts)))[1][0] < 0):
        print("NOT ANY POSTS THAT YOU WOULD LIKE TO VIEW, but here are a list of ones you can look into :)\n")
    
    #removing posts that user has already seen from the recommendation list
    recommended_posts = [arr for arr in recommended_posts if arr[0] not in postsToBeRemoved]
    for post in recommended_posts[:count]:
        print ("{0:<50s} {1}".format(post[0], post[1]))

interactive(children=(Dropdown(description='user_', options=('Nivesh Singh Chauhan', 'Gaurav Sharma', 'Akshay …

In [49]:
# Recommend similar posts to the given post
@interact
def postForPost(post_ = list(posts['title']), 
                count = (0,50,1)):
    post_name = post_
    post = post_history1[posts['title'] == post_name]
    post = np.array(post)[0][1:]
    dic = {}
    for i in range(len(posts)):
        post1 = np.array(post_history1.loc[i][1:])
        corr = correlation(post, post1, option)

        if corr == 'invalid':
            continue
        else:
            post_id = posts.loc[i][0]
            post1 = posts.loc[posts['post_id'] == post_id]
            index = int(str(post1['post_id']).split(' ')[0])    
            title = posts.loc[index]['title']
            dic[title] = corr
        
    dic = {k:v for k,v in dic.items() if v != 0}
    recommended_posts = sorted(dic.items(), key=lambda kv: kv[1], reverse=r)[1:]
    for post in recommended_posts[:count]:
        if post[0] != post_name:
            print ("{0:<50s} {1}".format(post[0], post[1]))

interactive(children=(Dropdown(description='post_', options=('hello there', 'Ml and AI', 'What is an Operating…