# PROBLEM STATEMENT

### The main aim of this project is to build a Recommendation System, recommending the items based on the following.
### 1. Content Based Filtering
### 2. Collaborative Filtering

### End result should be a system that -:
###  Recommend posts for the given user
###  Recommend similar posts for the given post

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
df_user = pd.read_csv('users.csv')
df_posts = pd.read_csv('posts.csv')
df_views = pd.read_csv('views.csv')

# DATA PRE-PROCESSING

In [3]:
df_user.rename(columns={'_id':'user_id'}, inplace=True) #Renaming so that it can be merged with df_views.csv
df_user.head()

Unnamed: 0,user_id,name,gender,academics
0,5d60098a653a331687083238,Nivesh Singh Chauhan,male,undergraduate
1,5d610ae1653a331687083239,Gaurav Sharma,male,graduate
2,5d618359fc5fcf3bdd9a0910,Akshay Mishra,male,undergraduate
3,5d6d2bb87fa40e1417a49315,Saksham Mathur,male,undergraduate
4,5d7c994d5720533e15c3b1e9,Varun Chowhan,male,undergraduate


In [4]:
df_posts.rename(columns={'_id':'post_id'},inplace=True) #Renaming so that it can be merged with df_views.csv
df_posts.head()

Unnamed: 0,post_id,title,category,post_type
0,5d62abaa65218653a132c956,hello there,Plant Biotechnology,blog
1,5d6d39567fa40e1417a4931c,Ml and AI,Artificial Intelligence|Machine Learning|Infor...,blog
2,5d7d23315720533e15c3b1ee,What is an Operating System ?,Operating Systems,blog
3,5d7d405e5720533e15c3b1f3,Lord Shiva,Drawings,artwork
4,5d80dfbc6c53455f896e600e,How Competition law evolved?,Competition Laws,blog


In [5]:
df_views.head()

Unnamed: 0,user_id,post_id,timestamp
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,2020-06-01T10:46:45.131Z
1,5ed3748576027d35905ccaab,5ed4cbadbd514d602c1531a6,2020-06-01T09:39:20.021Z
2,5ed0defa76027d35905cc2de,5eac305f10426255a7aa9dd3,2020-06-01T08:12:42.682Z
3,5ed0defa76027d35905cc2de,5ed1ff0276027d35905cc60d,2020-06-01T08:10:23.880Z
4,5ed0defa76027d35905cc2de,5ed3820f76027d35905ccac8,2020-06-01T08:08:54.124Z


In [6]:
newdf = pd.merge(df_views,df_posts,on='post_id')
newdf = pd.merge(newdf,df_user, on = 'user_id')
newdf.head()

Unnamed: 0,user_id,post_id,timestamp,title,category,post_type,name,gender,academics
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,2020-06-01T10:46:45.131Z,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork,Niriksha Sharma,female,undergraduate
1,5df49b32cc709107827fb3c7,5ec1fd0974f7660d73aa0fd5,2020-05-18T08:35:28.241Z,Daaku,Drawings,artwork,Niriksha Sharma,female,undergraduate
2,5df49b32cc709107827fb3c7,5ecd5d417023451e662235c5,2020-05-26T20:11:07.153Z,"Machine Learning”&“Operations"" (MlOps)",Computer Technology|Machine Learning,blog,Niriksha Sharma,female,undergraduate
3,5df49b32cc709107827fb3c7,5ecb72c0eaff6b0c3a58a48e,2020-05-25T07:27:20.699Z,EID MUBARAK,Photography|Architecture|Visual Arts|Graphic D...,artwork,Niriksha Sharma,female,undergraduate
4,5df49b32cc709107827fb3c7,5ea5aacd10426255a7aa9b71,2020-05-13T09:20:46.457Z,Photography Composition,,project,Niriksha Sharma,female,undergraduate


In [7]:
newdf.isnull().sum()

user_id        0
post_id        0
timestamp      0
title          0
category      88
 post_type     0
name           0
gender         0
academics      0
dtype: int64

#### Filling category missing values with the mode of the column.

In [8]:
newdf['category'].fillna(value=newdf['category'].mode()[0], axis='index',inplace=True) 

In [9]:
newdf.drop(['name','timestamp'],axis=1,inplace=True)

In [10]:
newdf.head()

Unnamed: 0,user_id,post_id,title,category,post_type,gender,academics
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork,female,undergraduate
1,5df49b32cc709107827fb3c7,5ec1fd0974f7660d73aa0fd5,Daaku,Drawings,artwork,female,undergraduate
2,5df49b32cc709107827fb3c7,5ecd5d417023451e662235c5,"Machine Learning”&“Operations"" (MlOps)",Computer Technology|Machine Learning,blog,female,undergraduate
3,5df49b32cc709107827fb3c7,5ecb72c0eaff6b0c3a58a48e,EID MUBARAK,Photography|Architecture|Visual Arts|Graphic D...,artwork,female,undergraduate
4,5df49b32cc709107827fb3c7,5ea5aacd10426255a7aa9b71,Photography Composition,Photography,project,female,undergraduate


In [11]:
n_users = newdf.user_id.nunique()
n_items = newdf.post_id.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of posts: '+str(n_items))

Num. of Users: 118
Num of posts: 493


#### Converting the format of category column to a list and then appending to the new list

In [15]:
category=[]
for num in range(0,len(newdf)):
    cat=newdf.iloc[num]['category'].split('|')
    category.append(cat)
category[:10]

[['Visual Arts',
  'Graphic Design',
  'Artistic design',
  'Graphic',
  'Illustration'],
 ['Drawings'],
 ['Computer Technology', 'Machine Learning'],
 ['Photography',
  'Architecture',
  'Visual Arts',
  'Graphic Design',
  'Artistic design',
  'Graphic',
  'Logo Design'],
 ['Photography'],
 ['Photography'],
 ['Pen and ink'],
 ['Art; Science'],
 ['Photography', 'Architecture', 'Visual Arts', 'Graphic Design'],
 ['Photography', 'Architecture']]

#### Assigning the cleaned list values of 'category' to the 'category' column

In [16]:
newdf['category']=category 

#### Joining the category values to form into string for performing other operatons.

In [17]:
newdf['category']=newdf['category'].apply(' '.join)
newdf['category'].head()

0    Visual Arts Graphic Design Artistic design Gra...
1                                             Drawings
2                 Computer Technology Machine Learning
3    Photography Architecture Visual Arts Graphic D...
4                                          Photography
Name: category, dtype: object

In [18]:
newdf[' post_type'].unique()

array(['artwork', 'blog', 'project', 'skill'], dtype=object)

#### Label Encoding the features in 'post_type' 'gender' and 'academics' .

In [19]:
labelencoder = LabelEncoder()
newdf[' post_type'] = labelencoder.fit_transform(newdf[' post_type'])
newdf['gender'] = labelencoder.fit_transform(newdf['gender'])
newdf['academics'] =labelencoder.fit_transform(newdf['academics'])

In [20]:
newdf.head()

Unnamed: 0,user_id,post_id,title,category,post_type,gender,academics
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,Save Earth.,Visual Arts Graphic Design Artistic design Gra...,0,0,2
1,5df49b32cc709107827fb3c7,5ec1fd0974f7660d73aa0fd5,Daaku,Drawings,0,0,2
2,5df49b32cc709107827fb3c7,5ecd5d417023451e662235c5,"Machine Learning”&“Operations"" (MlOps)",Computer Technology Machine Learning,1,0,2
3,5df49b32cc709107827fb3c7,5ecb72c0eaff6b0c3a58a48e,EID MUBARAK,Photography Architecture Visual Arts Graphic D...,0,0,2
4,5df49b32cc709107827fb3c7,5ea5aacd10426255a7aa9b71,Photography Composition,Photography,2,0,2


# CONTENT BASED RECOMMENDATION ENGINE IMPLEMENTATION

#### Performing Count Vectoriser for the string values in category column.

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',stop_words='english')
tfidf_matrix = tf.fit_transform(newdf['category'])
tfidf_matrix.shape

(1447, 266)

#### Finding the cosine similarity for the tfidf_matrix to check for any similarity.

In [23]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim.shape

(1447, 1447)

### Creating Function to recommend posts based on similar posts.
#### 1. Creating a series of index values of newdf dataframe with index of 'posts title'.
#### 2. This is done so that we can get the index values in the 'newdf' corresponding to the post title given for recommendaton.
#### 3. This index value is used to find similarity scores from the 'cosine_sim' matrix for the similar posts.
#### 4. These similarity scores are (a). enumerated into lists (list of tuples) and (b). sorted in descending order.
#### 5. The enumeration values (which are actually the indices for the posts in the posts series) are extracted using list comprehension.
#### 6. The enumerated values (indices) are located into posts series and returned.


In [24]:
posts = newdf['title']

indices = pd.Series(newdf.index, index=newdf['title']) 

def post_recommendations(post_title):
    idx = indices[post_title][1]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    post_indices = [i[0] for i in sim_scores]
    return posts.iloc[post_indices]


## CONTENT BASED RECOMMENDATION ENGINE WORKING EXAMPLES -:

### EXAMPLE 1 :

In [25]:
postid = input('Enter the Post TITLE for which you want Recommendation : ')
print('\nTop Recommended Posts for you are :\n')
recom_posts = post_recommendations(postid)

for i in recom_posts.unique():
    print(i)


Enter the Post TITLE for which you want Recommendation : What is an Operating System ?

Top Recommended Posts for you are :

What is an Operating System ?
Operating System 1
Data Transmission Modes
Save Earth.
Daaku
Machine Learning”&“Operations" (MlOps)
EID MUBARAK
Photography Composition
Dakrai Artwork.
My First Animated Post.
Eid Mubarak
Happy Eid-ul-Fitr 2020
Zero-Waste Lifestyle
Computer Aided Machine Drawing (CAMD)
Future Communication Predictions
Gangster Style
Photography
Art Expo 2020
Ganesha


### EXAMPLE 2 :

In [26]:
postid = input('Enter the Post TITLE for which you want Recommendation : ')
print('\nTop Recommended Posts for you are :\n')
recom_posts = post_recommendations(postid)

for i in recom_posts.unique():
    print(i)


Enter the Post TITLE for which you want Recommendation : Daaku

Top Recommended Posts for you are :

Gangster Style
Ganesha
Kabir singh
Daaku
Mahakaal
Form of ma durga
Rides
Lord Shiva
OM


### EXAMPLE 3 :

In [27]:
postid = input('Enter the Post TITLE for which you want Recommendation : ')
print('\nTop Recommended Posts for you are :\n')
recom_posts = post_recommendations(postid)

for i in recom_posts.unique():
    print(i)

Enter the Post TITLE for which you want Recommendation : Photography Composition

Top Recommended Posts for you are :

Photography Composition
Zero-Waste Lifestyle
Computer Aided Machine Drawing (CAMD)
Photography
Sky never disappoints me!
Blur but beautiful <3
Solitude
Camel Shades
Faith in yourself
Designing Cmos circuit from Boolean expressions (Microwind)
The Waves Have Stood Still
Be yourself !!
Keep working hard !!
Even and Odd Numbers (PHP)


### END

#  **COLLABORATIVE FILTERING BASED ENGINE IMPLEMENTATION **

#### 1. Performing Vectorisation of the category column, converting tfidf_category_matrix(sparse matrix) to an array and finding average of all the vectorised words in a particular category.
#### 2. Adding the other label encoded categorical variables to the vectorised words to form another feature called 'vectorised_features'

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',stop_words='english')
tfidf_category_matrix = tf.fit_transform(newdf['category'])
tfidf_category_matrix = tfidf_category_matrix.toarray()

cat_feat =newdf[' post_type']+newdf['gender']+newdf['academics']

tfidf_category_matrix = [sum(i)/len(i) for i in tfidf_category_matrix]
newdf['vectorised_features'] = tfidf_category_matrix + cat_feat

#### Label Encoding the user_id and post_id so that we can form user-item matrix's indices with corresponding user and item ids. (because string values can not be taken as matrix indices)

In [29]:
labelencoder = LabelEncoder()
newdf['encoded_userid'] = labelencoder.fit_transform(newdf['user_id'])
newdf['encoded_postid'] = labelencoder.fit_transform(newdf['post_id'])

In [30]:
newdf.head()

Unnamed: 0,user_id,post_id,title,category,post_type,gender,academics,vectorised_features,encoded_userid,encoded_postid
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,Save Earth.,Visual Arts Graphic Design Artistic design Gra...,0,0,2,2.00862,15,439
1,5df49b32cc709107827fb3c7,5ec1fd0974f7660d73aa0fd5,Daaku,Drawings,0,0,2,2.003759,15,415
2,5df49b32cc709107827fb3c7,5ecd5d417023451e662235c5,"Machine Learning”&“Operations"" (MlOps)",Computer Technology Machine Learning,1,0,2,3.00743,15,450
3,5df49b32cc709107827fb3c7,5ecb72c0eaff6b0c3a58a48e,EID MUBARAK,Photography Architecture Visual Arts Graphic D...,0,0,2,2.009689,15,445
4,5df49b32cc709107827fb3c7,5ea5aacd10426255a7aa9b71,Photography Composition,Photography,2,0,2,4.003759,15,352


### Splitting the dataframe into Test and Train Datasets.

In [32]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(newdf, test_size=0.25)

## Forming the User-Item Matrix for Test and Train Data.
### 1. Create an empty matrix with dimensions (n_users,n_items), unique users and items.
### 2. Using itertuples we are making a matrix which has user_id as rows and post_id as columns.

In [33]:
#USER-ITEM MATRIX
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[9], line[10]] = line[8]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[9], line[10]] = line[8]

### Finding the pairwise distances and creating user-user similarity matrix.

In [34]:
from sklearn.metrics.pairwise import pairwise_distances

# User-User Similarity Matrix
user_correlation = 1 - pairwise_distances(train_data_matrix, metric='correlation')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation[:4, :4])
user_correlation.shape

[[ 1.          0.12702688 -0.03281652  0.06212337]
 [ 0.12702688  1.          0.02427166  0.02219141]
 [-0.03281652  0.02427166  1.          0.08014136]
 [ 0.06212337  0.02219141  0.08014136  1.        ]]


(118, 118)

### Finding the pairwise distances and creating item-item similarity matrix.

In [35]:
# Item-Item Similarity Matrix
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation[:4, :4])
item_correlation.shape

[[ 1.          1.         -0.01213928 -0.01084316]
 [ 1.          1.         -0.01213928 -0.01084316]
 [-0.01213928 -0.01213928  1.          0.66564512]
 [-0.01084316 -0.01084316  0.66564512  1.        ]]


(493, 493)

### Creating a DataFrame out of item similarity matrix.

In [36]:
post_similarity_df = pd.DataFrame(item_correlation, index=newdf.post_id.unique(), columns=newdf.post_id.unique())
post_similarity_df

Unnamed: 0,5ec821ddec493f4a2655889e,5ec1fd0974f7660d73aa0fd5,5ecd5d417023451e662235c5,5ecb72c0eaff6b0c3a58a48e,5ea5aacd10426255a7aa9b71,5ec3c09274f7660d73aa1229,5ec573a3f2781131cc7e51b8,5ecb7155eaff6b0c3a58a486,5ecb6edeeaff6b0c3a58a479,5eca8fceeaff6b0c3a58a3c0,...,5e964006a3258347b42f2a65,5e7220891b24db0468e90ce2,5e964200a3258347b42f2a6d,5e787636cfc8b713f5ac7cbe,5d80ecfd6c53455f896e601a,5e79b8becfc8b713f5ac7d45,5e79cf2acfc8b713f5ac7d4e,5e3ab644eab55d319938a72d,5e78d6dccfc8b713f5ac7cf4,5dd1751db802e41ed198b680
5ec821ddec493f4a2655889e,1.000000,1.000000,-0.012139,-0.010843,-0.011901,-0.012016,-0.008547,-0.008547,-0.008547,0.0,...,-0.008547,-0.008547,0.0,-0.012016,-0.012064,-0.008547,0.523950,-0.008547,-0.012089,0.0
5ec1fd0974f7660d73aa0fd5,1.000000,1.000000,-0.012139,-0.010843,-0.011901,-0.012016,-0.008547,-0.008547,-0.008547,0.0,...,-0.008547,-0.008547,0.0,-0.012016,-0.012064,-0.008547,0.523950,-0.008547,-0.012089,0.0
5ecd5d417023451e662235c5,-0.012139,-0.012139,1.000000,0.665645,-0.016902,-0.017066,-0.012139,-0.012139,-0.012139,0.0,...,-0.012139,-0.012139,0.0,-0.017066,-0.017134,-0.012139,-0.024429,-0.012139,-0.017169,0.0
5ecb72c0eaff6b0c3a58a48e,-0.010843,-0.010843,0.665645,1.000000,-0.015098,-0.015244,-0.010843,-0.010843,-0.010843,0.0,...,-0.010843,-0.010843,0.0,-0.015244,-0.015305,-0.010843,-0.021820,-0.010843,-0.015336,0.0
5ea5aacd10426255a7aa9b71,-0.011901,-0.011901,-0.016902,-0.015098,1.000000,0.490858,-0.011901,-0.011901,-0.011901,0.0,...,0.830363,-0.011901,0.0,0.490915,0.643611,-0.011901,0.315207,0.830363,0.524709,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5e79b8becfc8b713f5ac7d45,-0.008547,-0.008547,-0.012139,-0.010843,-0.011901,-0.012016,-0.008547,-0.008547,-0.008547,0.0,...,-0.008547,-0.008547,0.0,-0.012016,-0.012064,1.000000,0.523950,-0.008547,-0.012089,0.0
5e79cf2acfc8b713f5ac7d4e,0.523950,0.523950,-0.024429,-0.021820,0.315207,0.220563,-0.017200,-0.017200,-0.017200,0.0,...,0.388913,0.523950,0.0,0.220590,0.294151,0.523950,1.000000,0.388913,0.654238,0.0
5e3ab644eab55d319938a72d,-0.008547,-0.008547,-0.012139,-0.010843,0.830363,0.595783,-0.008547,-0.008547,-0.008547,0.0,...,1.000000,-0.008547,0.0,0.595851,0.778725,-0.008547,0.388913,1.000000,0.636365,0.0
5e78d6dccfc8b713f5ac7cf4,-0.012089,-0.012089,-0.017169,-0.015336,0.524709,0.373795,-0.012089,-0.012089,-0.012089,0.0,...,0.636365,0.765921,0.0,0.373838,0.491382,-0.012089,0.654238,0.636365,1.000000,0.0


# Posts Recommended Based on Other Posts (item-item)
### 1. We pass the post_id corresponding to the post for which we want recommendation and vectorised feature value to the function.
### 1.1. We locate the post_id in the post_similarity_df created above and multiply it with the vectorised_feat to scale.
### 1.2. We obtain the similar scores based on the post_id from the post_similarity_df and sort them in descending and return the index from post_similarity_df  i.e. the post id.
### 2. Converting the post_title to corresponding post_id to be passed to function.
### 3. We get a list of vectorised_features corresponding to a single post_id thus we pass the mean of all those features to the function.
### 4. Next we find the title from the newdf dataframe corresponding to the returned post_ids from the function.

In [187]:
#1
def get_similar_posts(post_id,vectorised_feat):
    similar_score = post_similarity_df[post_id]*vectorised_feat
    similar_score = similar_score.sort_values(ascending=True)
    return similar_score.index

#2
input_title = input('Enter the title of Post  : ')
post_id = newdf[newdf['title']==input_title]['post_id']
post_id = post_id.to_numpy()[0]
vectorised_post_id = newdf[newdf['post_id']==post_id]['vectorised_features']
#vectorised_post_id

#3
sim_score_posts = get_similar_posts(post_id, vectorised_post_id.mean())
#sim_score_posts

result=[]

#4
for i in range(0,len(newdf)):
    if newdf['post_id'][i] in sim_score_posts[:10]:
        if newdf['title'][i] not in result:
            result.append(newdf['title'][i])
        
print('\nTop Recommended Posts for you based on other posts are :\n')
for i in result:
    print(i)



Enter the title of Post  : Machine Learning”&“Operations" (MlOps)

Top Recommended Posts for you based on other posts are :

Machine Learning”&“Operations" (MlOps)
PENCIL RENDERING
Eyes that speak. Photo by Dan Farrell
How To Use Social Media To Promote A Project
Face Recognition using Transfer Learning.
Light to fight
deep thoughts
Time
Shiva Portrait
7 Best Python Data Science Courses & Certification [2020]


# Posts Recommended Based on Similar Users Like You (user-item)

### Creating a DataFrame out of user similarity matrix.

In [37]:
user_similarity_df = pd.DataFrame(user_correlation, index=newdf.user_id.unique(), columns=newdf.user_id.unique())
user_similarity_df

Unnamed: 0,5df49b32cc709107827fb3c7,5ec3ba5374f7660d73aa1201,5ec2204374f7660d73aa100f,5d7c994d5720533e15c3b1e9,5de50d768eab6401affbb135,5deeef6142a8854bf6eabab9,5d6d2bb87fa40e1417a49315,5e3563348d344822fed4d13a,5defd51362624b0135ea9fd2,5e5af599d701ab08af792b63,...,5e845fbaa3258347b42f2450,5e5dfbbefbc8805f69e02c91,5e8447a7a3258347b42f2446,5e78ce84cfc8b713f5ac7cee,5e365e758d344822fed4d144,5e840a75a3258347b42f2437,5e7dde87a3258347b42f2108,5e822b86a3258347b42f2360,5e5855ced701ab08af792b51,5e7def80a3258347b42f2124
5df49b32cc709107827fb3c7,1.000000,0.127027,-0.032817,0.062123,0.083381,-0.030897,0.028417,0.058546,-0.030897,0.011315,...,0.055617,0.0,-0.043739,0.071029,0.117471,0.110636,0.0,0.010632,0.071028,0.071029
5ec3ba5374f7660d73aa1201,0.127027,1.000000,0.024272,0.022191,0.139310,-0.019200,0.051224,-0.027873,-0.019200,-0.009289,...,-0.030856,0.0,-0.027181,-0.019200,0.114037,0.191000,0.0,-0.007061,-0.019200,-0.019200
5ec2204374f7660d73aa100f,-0.032817,0.024272,1.000000,0.080141,0.099675,-0.008500,-0.012033,0.007129,-0.008500,0.188029,...,-0.013689,0.0,-0.012033,-0.008500,0.088540,-0.010322,0.0,-0.014630,0.221216,0.221219
5d7c994d5720533e15c3b1e9,0.062123,0.022191,0.080141,1.000000,0.264532,-0.009598,-0.013588,0.073986,-0.009598,0.167631,...,-0.015459,0.0,-0.013588,-0.009598,0.041595,-0.011656,0.0,-0.016522,-0.009598,-0.009598
5de50d768eab6401affbb135,0.083381,0.139310,0.099675,0.264532,1.000000,-0.017675,-0.025022,0.001250,-0.017675,0.127098,...,0.103381,0.0,0.042844,0.110063,0.188115,0.187615,0.0,-0.030424,0.110061,-0.017675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5e840a75a3258347b42f2437,0.110636,0.191000,-0.010322,-0.011656,0.187615,-0.002468,-0.003494,-0.007820,-0.002468,-0.018289,...,-0.003975,0.0,0.169695,-0.002468,0.661326,1.000000,0.0,-0.004249,-0.002468,-0.002468
5e7dde87a3258347b42f2108,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.000000
5e822b86a3258347b42f2360,0.010632,-0.007061,-0.014630,-0.016522,-0.030424,-0.003499,-0.004953,-0.011084,-0.003499,-0.025923,...,-0.005635,0.0,-0.004953,-0.003499,-0.006798,-0.004249,0.0,1.000000,-0.003499,-0.003499
5e5855ced701ab08af792b51,0.071028,-0.019200,0.221216,-0.009598,0.110061,-0.002033,-0.002877,-0.006440,-0.002033,-0.015060,...,-0.003274,0.0,-0.002877,-0.002033,0.454311,-0.002468,0.0,-0.003499,1.000000,-0.002033


In [39]:
def get_similar_posts_based_on_users(user_id,vectorised_feat):
    similar_score = user_similarity_df[user_id]*vectorised_feat
    similar_score = similar_score.sort_values(ascending=False)
    return similar_score
input_user_id = input('Enter the USER ID  : ')
vectorised_user_values = newdf[newdf['user_id']==input_user_id]['vectorised_features']
sim_score_posts = get_similar_posts_based_on_users(input_user_id, vectorised_user_values.mean())

result=[]

for i in range(0,len(newdf)):
    if newdf['user_id'][i] in sim_score_posts.index.to_numpy()[:10]:
        if newdf['user_id'][i] not in result:
            result.append(newdf['title'][i])
print('\nTop Recommended Posts based on Similar users like you : \n')
for i in result[:11]:
    print(i)

Enter the USER ID  : 5ec2204374f7660d73aa100f

Top Recommended Posts based on Similar users like you : 

Save Earth.
Daaku
Ganesha
Kabir singh
Rides
Women power
AWS services and how to launch OS on AWS Cloud
GAN's Part(2)
AWS services and how to launch OS on AWS Cloud
GAN's INTRODUCTION
Understanding Cloud Computing(AWS)


# RECOMMENDER ENGINE EVALUATION (Metric :MSE)

In [44]:
def predict(user_item_vectorised_scores, similarity, type='user'):
    if type == 'user':
        mean_user_score = user_item_vectorised_scores.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (user_item_vectorised_scores - mean_user_score[:, np.newaxis]) 
        pred = mean_user_score[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = user_item_vectorised_scores.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [45]:
post_prediction = predict(train_data_matrix, item_correlation, type='item')
user_prediction = predict(train_data_matrix, user_correlation, type='user')

In [46]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))
    

In [47]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Post-based CF RMSE: ' + str(rmse(post_prediction, test_data_matrix)))

User-based CF RMSE: 2.68140576562217
Post-based CF RMSE: 2.8812150401600807
