## Importing necessary modules

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Overviewing Data

In [22]:
users=pd.read_csv('users.csv')
print(users.head())
print('\n')
print(users.describe())

                        _id                  name gender      academics
0  5d60098a653a331687083238  Nivesh Singh Chauhan   male  undergraduate
1  5d610ae1653a331687083239         Gaurav Sharma   male       graduate
2  5d618359fc5fcf3bdd9a0910         Akshay Mishra   male  undergraduate
3  5d6d2bb87fa40e1417a49315        Saksham Mathur   male  undergraduate
4  5d7c994d5720533e15c3b1e9         Varun Chowhan   male  undergraduate


                             _id    name gender      academics
count                        118     118    118            118
unique                       118     118      3              3
top     5d610ae1653a331687083239  Varsha   male  undergraduate
freq                           1       1     72             68


In [23]:
posts=pd.read_csv('posts.csv')
print(posts.head())
posts.describe()

                        _id                          title  \
0  5d62abaa65218653a132c956                    hello there   
1  5d6d39567fa40e1417a4931c                      Ml and AI   
2  5d7d23315720533e15c3b1ee  What is an Operating System ?   
3  5d7d405e5720533e15c3b1f3                     Lord Shiva   
4  5d80dfbc6c53455f896e600e   How Competition law evolved?   

                                            category  post_type  
0                                Plant Biotechnology       blog  
1  Artificial Intelligence|Machine Learning|Infor...       blog  
2                                  Operating Systems       blog  
3                                           Drawings    artwork  
4                                   Competition Laws       blog  


Unnamed: 0,_id,title,category,post_type
count,493,493,465,493
unique,493,477,231,4
top,5e36746a8d344822fed4d147,PENCIL RENDERING,Photography,artwork
freq,1,3,81,241


In [24]:
views=pd.read_csv('views.csv')
print(views.head())
views.describe()

                    user_id                   post_id  \
0  5df49b32cc709107827fb3c7  5ec821ddec493f4a2655889e   
1  5ed3748576027d35905ccaab  5ed4cbadbd514d602c1531a6   
2  5ed0defa76027d35905cc2de  5eac305f10426255a7aa9dd3   
3  5ed0defa76027d35905cc2de  5ed1ff0276027d35905cc60d   
4  5ed0defa76027d35905cc2de  5ed3820f76027d35905ccac8   

                  timestamp  
0  2020-06-01T10:46:45.131Z  
1  2020-06-01T09:39:20.021Z  
2  2020-06-01T08:12:42.682Z  
3  2020-06-01T08:10:23.880Z  
4  2020-06-01T08:08:54.124Z  


Unnamed: 0,user_id,post_id,timestamp
count,1449,1449,1449
unique,118,495,1449
top,5d60098a653a331687083238,5ec1fd0974f7660d73aa0fd5,2020-05-20T15:54:43.577Z
freq,230,18,1


# Building a posts recommendation system on the basis of post.

In [25]:
print(posts[' post_type'].value_counts())
print('\n')
posts['category'].value_counts()

artwork    241
blog       198
skill       27
project     27
Name:  post_type, dtype: int64




Photography                                                                                                 81
Drawings                                                                                                    47
Painting                                                                                                    20
Visual Arts                                                                                                 10
Computer Technology|Machine Learning                                                                         7
Computer Technology|Computer Application|Information Technology                                              7
Competition Laws                                                                                             5
Typography|Pen and ink                                                                                       5
Mass Media|Indian Government                                                                                 5
C

# Building a content based recommender system using Category Subgroup

In [26]:
posts[posts[' post_type'].isnull()]

Unnamed: 0,_id,title,category,post_type


In [27]:
posts[posts['category'].isnull()]

Unnamed: 0,_id,title,category,post_type
24,5ddeb6e80eb5e25a8a07f065,Library Managment System: Software Requirement...,,project
25,5de179d80eb5e25a8a07f079,Navigation system using BFS DFS algorithms,,project
29,5dee9b5042a8854bf6eabaaf,Computer Aided Machine Drawing (CAMD),,project
65,5e3ea110eab55d319938a7a7,OS,,project
88,5e4c3873f5561b1994c8e3d9,Bill Calculation(PHP),,project
93,5e4da502f5561b1994c8e42c,Prime No.(PHP),,project
94,5e4da7c8f5561b1994c8e439,Factorial of a No.(PHP),,project
95,5e4dab15f5561b1994c8e446,Palindrome No.(PHP),,project
96,5e4ed85af5561b1994c8e470,Factorial (.net),,project
97,5e4ed8ccf5561b1994c8e47d,Leap year checking (.net),,project


In [28]:
posts=posts.dropna().reset_index(drop=True)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(posts['category'])
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [30]:
titles = posts['title']
indices = pd.Series(posts.index, index=posts['title'])

# Function that get movie recommendations based on the cosine similarity score of movie genres
def posts_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    posts_indices = [i[0] for i in sim_scores]
    return titles.iloc[posts_indices].head(10)

# Enter the title of the post you want to get related posts for in postTitle

In [31]:
postTitle='Ml and AI'

#### filtering results on the basis of post type

In [32]:
ptype=posts[posts['title']==postTitle][' post_type']

Final Recommendations

In [33]:
posts_recommendations(postTitle)
df=pd.DataFrame(posts_recommendations(postTitle))
df.columns = ['title']
fdf=posts[posts['title'].isin(df['title'])]
fdf[fdf[' post_type']==ptype[1]]

Unnamed: 0,_id,title,category,post_type
110,5e5bdf4fd701ab08af792bfd,Artificial Intelligence,Computer Technology|Artificial Intelligence,blog
157,5e7bd922cfc8b713f5ac7da9,What sports will look like in the future,Computer Technology|Robotics|Data Science|Info...,blog
158,5e7c78fdcfc8b713f5ac7daa,Types Of AI.,Computer Technology|Artificial Intelligence,blog
174,5e81a4f2a3258347b42f21d3,7 Best Python Data Science Courses & Certifica...,Computer Technology|Machine Learning,blog
212,5e897ab4a3258347b42f25ca,10 Best Artificial Intelligence (AI) Courses O...,Computer Technology|Machine Learning,blog
428,5ecd5d417023451e662235c5,"Machine Learning”&“Operations"" (MlOps)",Computer Technology|Machine Learning,blog
432,5ecf818376027d35905cbf03,GAN's INTRODUCTION,Computer Technology|Machine Learning,blog
433,5ecf96e876027d35905cbf46,GAN's Part(2),Computer Technology|Machine Learning,blog
434,5ecfa0ca76027d35905cbf57,Recommend Systems Machine Learning,Computer Technology|Machine Learning,blog
459,5ed2502b76027d35905cc7db,Learning...,Computer Technology|Machine Learning,blog


# Building a collabrative Recommendation system using SVD model

#### Recommending post based on user id

In [34]:
pd.DataFrame(views.groupby(['user_id','post_id'])['timestamp'].count())
views
viewspt=views.pivot_table('timestamp',index='user_id',columns='post_id',aggfunc='count').fillna(0)
viewspt.head()

post_id,5d62abaa65218653a132c956,5d6d39567fa40e1417a4931c,5d7d23315720533e15c3b1ee,5d7d405e5720533e15c3b1f3,5d80dfbc6c53455f896e600e,5d80e7c16c53455f896e6014,5d80ecfd6c53455f896e601a,5d81323a6c53455f896e6044,5d9b3514979d5962253c2f90,5d9b950768671220a1b2b153,...,5ed23cf876027d35905cc790,5ed23d4276027d35905cc798,5ed23e4d76027d35905cc7b8,5ed2502b76027d35905cc7db,5ed3476576027d35905cca1d,5ed3791976027d35905ccab6,5ed3820f76027d35905ccac8,5ed3ef4cbd514d602c1530f2,5ed415c6bd514d602c15312d,5ed4cbadbd514d602c1531a6
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5d60098a653a331687083238,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5d610ae1653a331687083239,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5d618359fc5fcf3bdd9a0910,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5d6d2bb87fa40e1417a49315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5d7c994d5720533e15c3b1e9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
viewsptm=viewspt.as_matrix()

In [36]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(viewsptm, k = 50)
sigma = np.diag(sigma)

In [37]:
all_user_predicted_views = np.dot(np.dot(U, sigma), Vt)
preds = pd.DataFrame(all_user_predicted_views,index=viewspt.index,columns = viewspt.columns)
preds.head()

post_id,5d62abaa65218653a132c956,5d6d39567fa40e1417a4931c,5d7d23315720533e15c3b1ee,5d7d405e5720533e15c3b1f3,5d80dfbc6c53455f896e600e,5d80e7c16c53455f896e6014,5d80ecfd6c53455f896e601a,5d81323a6c53455f896e6044,5d9b3514979d5962253c2f90,5d9b950768671220a1b2b153,...,5ed23cf876027d35905cc790,5ed23d4276027d35905cc798,5ed23e4d76027d35905cc7b8,5ed2502b76027d35905cc7db,5ed3476576027d35905cca1d,5ed3791976027d35905ccab6,5ed3820f76027d35905ccac8,5ed3ef4cbd514d602c1530f2,5ed415c6bd514d602c15312d,5ed4cbadbd514d602c1531a6
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5d60098a653a331687083238,0.99761,0.99761,0.001024,4.5e-05,-0.000218,-0.001215,0.001258,-0.001205,2.8694690000000004e-17,0.000606,...,-0.000559,-0.001533,-0.000559,0.0024,0.001812,0.003392,0.999469,-0.000559,-0.002092,0.003951
5d610ae1653a331687083239,0.000341,0.000341,-0.000112,8.9e-05,0.998127,0.000708,-0.008257,0.001708,4.023645e-18,5.7e-05,...,-0.000284,0.001755,-0.000284,-4.7e-05,0.005063,-9e-05,0.002005,-0.000284,0.001471,0.000194
5d618359fc5fcf3bdd9a0910,-0.000656,-0.000656,-0.001835,0.003079,0.006862,0.965762,0.001536,-0.004157,4.1040230000000005e-17,0.002276,...,0.00587,-0.009252,0.00587,-0.021317,-0.022162,-0.003306,-0.013214,0.00587,-0.003382,-0.009175
5d6d2bb87fa40e1417a49315,0.000899,0.000899,0.001863,-0.004619,0.003318,0.000586,0.001088,0.001508,2.789017e-17,-0.002227,...,0.004143,0.005035,0.004143,-0.002087,-0.018803,0.002054,0.007989,0.004143,0.009178,-0.002089
5d7c994d5720533e15c3b1e9,0.000386,0.000386,0.000295,0.996118,-0.001632,-0.000523,8e-06,-2e-06,1.5897610000000002e-17,-0.000598,...,-0.001477,0.001311,-0.001477,-0.00058,0.005834,-0.002015,-0.000319,-0.001477,-0.000166,-0.000538


In [38]:
def recommend_posts(user_id):
    
    sorted_user_predictions = preds.loc[user_id].sort_values(ascending=False)
    supdf=pd.DataFrame(sorted_user_predictions).reset_index()
    supdf.columns = ['post_id', 'value']
    
    # Recommend the highest predicted views posts that the user hasn't seen yet.
    recommendations =posts[~posts['_id'].isin(views[views['user_id']==user_id]['post_id'])].merge(supdf, how = 'left',left_on = '_id',right_on = 'post_id').sort_values('value', ascending = False).iloc[:20,0:4]

    return recommendations

# Using the recommend_posts() to get new recommendation for a user_id

In [39]:
recommend_posts("5d6d2bb87fa40e1417a49315")

Unnamed: 0,_id,title,category,post_type
408,5ecfa0ca76027d35905cbf57,Recommend Systems Machine Learning,Computer Technology|Machine Learning,blog
395,5ec573a3f2781131cc7e51b8,My First Animated Post.,Art; Science,skill
159,5e7cc074cfc8b713f5ac7db0,REAL TALK,Philosophy|Public Philosophy,blog
404,5ecdecc67023451e66223765,Looking at nature through nature,Photography,artwork
416,5ed141aa76027d35905cc4c9,Integrating Machine Learning with devOps(MLops),Computer Technology|Data Science,blog
374,5eb4fab110426255a7aaa0ed,God Drawing,Drawings,artwork
210,5e895d87a3258347b42f25bb,Benefits Of A Virtual Portfolio.,Graphics|Articulation|Computer Creation,blog
393,5ec54fc9f2781131cc7e50f5,Future Communication Predictions,Science;Technology,skill
334,5ea85c1f10426255a7aa9c05,Innovative Marketing Strategies,Marketing|Principles Of Marketing|Internationa...,blog
396,5ec57914f2781131cc7e51c8,The Power of Indian Audience.,Art,skill


# Evaluating Performance

In [93]:
df=pd.DataFrame(views.groupby(['user_id','post_id']).count())
df.reset_index(inplace=True)
df.index.set_names(['user_id'])
df.columns=['user_id','post_id','count']
df['count'].max()

5

In [95]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'post_id', 'count']],reader=reader)

# Use the SVD algorithm
algo = SVD()

# Run 5-fold cross-validation and then print results
cross_validate(algo, data, measures=['rmse', 'mae'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.1034  0.0738  0.0877  0.0965  0.0835  0.0890  0.0103  
RMSE (testset)    0.3081  0.1713  0.2114  0.2383  0.1598  0.2178  0.0532  
Fit time          0.22    0.24    0.28    0.25    0.26    0.25    0.02    
Test time         0.00    0.01    0.01    0.00    0.01    0.01    0.00    


{'fit_time': (0.21883821487426758,
  0.23605060577392578,
  0.27849650382995605,
  0.2473597526550293,
  0.2634556293487549),
 'test_mae': array([0.1034119 , 0.07380524, 0.08770352, 0.09648961, 0.08348562]),
 'test_rmse': array([0.30812272, 0.17134996, 0.2114109 , 0.23829331, 0.15978283]),
 'test_time': (0.002239704132080078,
  0.009438037872314453,
  0.009448051452636719,
  0.002667665481567383,
  0.005807161331176758)}

A RMSE value of 0.2178 tells us that model is a good fit.

#### Work by Nishant Agrawal