In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three QuillVerse CSV files into separate DataFrames.
author_data = pd.read_csv("quillverse author data.csv")
blog_rating = pd.read_csv("quillverse blog data.csv")
medium_blog_data = pd.read_csv("quillverse blog likes.csv")

# A notebook cell only renders its final expression, so the first two previews
# are passed to display() explicitly; the last preview is shown as the cell result.
display(author_data.head(), blog_rating.head())
medium_blog_data.head()

Unnamed: 0,blog_id,user_id,likes
0,1,101,
1,2,102,
2,3,102,
3,4,103,
4,5,104,


In [3]:
# Reuse the frames loaded in the previous cell instead of reading the same three
# CSV files from disk a second time. Order is kept as before:
#   [0] author data, [1] blog data, [2] blog likes
df_list = [author_data, blog_rating, medium_blog_data]

# Step 1: join blog data with blog likes on their shared "blog_id" column.
# Step 2: join the result with author data on the shared "author_id" column.
# how="outer" keeps every row from both sides, even when there is no match.
merged_df = pd.merge(df_list[1], df_list[2], on="blog_id", how="outer")
merged_df = pd.merge(merged_df, df_list[0], on="author_id", how="outer")

# Choose the appropriate "how" parameter:
#   - "inner": Keep only rows with matches in both DataFrames (default)
#   - "outer": Keep all rows from both DataFrames
#   - "left": Keep all rows from the left DataFrame and matching rows from the right
#   - "right": Keep all rows from the right DataFrame and matching rows from the left

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   blog_id       100 non-null    int64  
 1   author_id     100 non-null    int64  
 2   blog_title    100 non-null    object 
 3   blog_content  100 non-null    object 
 4   blog_img      100 non-null    object 
 5   topic         100 non-null    object 
 6   Unnamed: 6    1 non-null      object 
 7   user_id       100 non-null    int64  
 8   likes         0 non-null      float64
 9   author_name   100 non-null    object 
dtypes: float64(1), int64(3), object(6)
memory usage: 8.6+ KB


In [4]:
# Build a trimmed copy of the merged frame without the 'blog_img' column;
# merged_df itself is left unchanged.
merged_df_dropped = merged_df.drop(columns=['blog_img'])
merged_df_dropped.head()
# Handy follow-up checks: merged_df_dropped.info(), merged_df_dropped.isnull().sum()

Unnamed: 0,blog_id,author_id,blog_title,blog_content,topic,Unnamed: 6,user_id,likes,author_name
0,1,1,A Beginner's Guide to Stay Ahead of Inflation,Expansion is a terrible bad dream that financi...,money,money,101,,Guest Post links
1,2,2,Rights of a Daughter to Ancestral Property,The headway made in India in regards to ladies...,money,,102,,Anjali Devri
2,3,2,GST E-Invoice mandate on business cross ?10 cr...,Notice E-invoicing will become obligatory for ...,money,,102,,Anjali Devri
3,7,2,GST on Tour Operator & Travel Agents,The Significance of The travel industry in Ind...,money,,102,,Anjali Devri
4,4,3,Can You Get a Personal Loan without a Job,"In the present unsure monetary environment, ge...",money,,103,,Richard Conard


In [5]:
# Show the 'topic' value of the first row
merged_df_dropped['topic'].head(1)

0    money
Name: topic, dtype: object

In [6]:
#A recommendation engine based on PAIR-WISE similarity needs one numeric vector per blog,
#so every blog has to be turned into a row of a vector matrix.

#A model cannot compare raw strings directly, so the NLP technique TF-IDF is used
#to build a document-term matrix from the text.
#NOTE(review): the notes for this step mention "blog_content", but the code below
#vectorizes the "topic" column - confirm which column is intended.

from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word',
            #{1,} is the "one or more" quantifier; writing (1,) instead would demand
            #the literal text "1," after a word character and match almost nothing
            token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english') #Drops common English words (is, the, a, ...); punctuation is already excluded by token_pattern

#Filling NaNs with empty strings
merged_df_dropped['topic'] = merged_df_dropped['topic'].fillna('')

In [7]:
# Fit + transform
# Replace the vectorizer from the previous cell with one that uses the default
# settings and min_df=1, i.e. a term is kept even if it occurs in a single row.
tfv = TfidfVectorizer(min_df=1)

# fit_transform returns a sparse matrix (mostly zeros, few non-zero entries)
# built from the 'topic' column.
tfv_matrix = tfv.fit_transform(merged_df_dropped['topic'])
tfv_matrix.shape  # (number of records, number of features)

(100, 10)

In [8]:
from sklearn.metrics.pairwise import sigmoid_kernel
# sigmoid_kernel evaluates tanh(gamma * <x, y> + coef0) for every pair of rows,
# so each score lies between -1 and 1 (not 0 and 1).

sig = sigmoid_kernel(tfv_matrix)  # Y omitted -> rows are compared with each other; sig[i][j] scores row i against row j
sig[0]  # scores of the first row against every row (itself included)

array([0.80049902, 0.80049902, 0.80049902, 0.80049902, 0.80049902,
       0.76159416, 0.80049902, 0.80049902, 0.80049902, 0.80049902,
       0.80049902, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159

In [9]:
#Reverse mapping: blog title -> row position in merged_df_dropped.
#The stored numbers are used later to pick rows of `sig` and with .iloc, both of
#which are positional, so positions are stored rather than index labels. The two
#are identical while the frame has the default 0..n-1 index, but positions stay
#correct even if it does not.
indices = pd.Series(range(len(merged_df_dropped)), index=merged_df_dropped['blog_title'])
indices

blog_title
A Beginner's Guide to Stay Ahead of Inflation                        0
Rights of a Daughter to Ancestral Property                           1
GST E-Invoice mandate on business cross ?10 crore turnover           2
GST on Tour Operator & Travel Agents                                 3
Can You Get a Personal Loan without a Job                            4
                                                                    ..
How to Play DVDs and Blu Ray on PS4?                                95
Types of Video Content that Drive Engagement                        96
61 of the Best Ai Tools in List Form                                97
8 Tips for Programmers To Save Time and Energy While Programming    98
More Southern Cooking Techniques                                    99
Length: 100, dtype: int64

In [10]:
# Look up the row number stored for one specific blog title
indices['Rights of a Daughter to Ancestral Property']

1

In [11]:
# Similarity scores of row 13 against every row (itself included)
sig[13]

array([0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.80049902, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159

In [12]:
# Pair each similarity score of the chosen blog with the row number it belongs to
[(row, score) for row, score in enumerate(sig[indices['Rights of a Daughter to Ancestral Property']])]

[(0, 0.8004990217606297),
 (1, 0.8004990217606297),
 (2, 0.8004990217606297),
 (3, 0.8004990217606297),
 (4, 0.8004990217606297),
 (5, 0.7615941559557649),
 (6, 0.8004990217606297),
 (7, 0.8004990217606297),
 (8, 0.8004990217606297),
 (9, 0.8004990217606297),
 (10, 0.8004990217606297),
 (11, 0.7615941559557649),
 (12, 0.7615941559557649),
 (13, 0.7615941559557649),
 (14, 0.7615941559557649),
 (15, 0.7615941559557649),
 (16, 0.7615941559557649),
 (17, 0.7615941559557649),
 (18, 0.7615941559557649),
 (19, 0.7615941559557649),
 (20, 0.7615941559557649),
 (21, 0.7615941559557649),
 (22, 0.7615941559557649),
 (23, 0.7615941559557649),
 (24, 0.7615941559557649),
 (25, 0.7615941559557649),
 (26, 0.7615941559557649),
 (27, 0.7615941559557649),
 (28, 0.7615941559557649),
 (29, 0.7615941559557649),
 (30, 0.7615941559557649),
 (31, 0.7615941559557649),
 (32, 0.7615941559557649),
 (33, 0.7615941559557649),
 (34, 0.7615941559557649),
 (35, 0.7615941559557649),
 (36, 0.7615941559557649),
 (37, 0.761

In [13]:
# The same (row, score) pairs, ordered from the highest score to the lowest
sorted(enumerate(sig[indices['Rights of a Daughter to Ancestral Property']]), key=lambda pair: pair[1], reverse=True)

[(0, 0.8004990217606297),
 (1, 0.8004990217606297),
 (2, 0.8004990217606297),
 (3, 0.8004990217606297),
 (4, 0.8004990217606297),
 (6, 0.8004990217606297),
 (7, 0.8004990217606297),
 (8, 0.8004990217606297),
 (9, 0.8004990217606297),
 (10, 0.8004990217606297),
 (89, 0.8004990217606297),
 (90, 0.8004990217606297),
 (91, 0.8004990217606297),
 (92, 0.8004990217606297),
 (5, 0.7615941559557649),
 (11, 0.7615941559557649),
 (12, 0.7615941559557649),
 (13, 0.7615941559557649),
 (14, 0.7615941559557649),
 (15, 0.7615941559557649),
 (16, 0.7615941559557649),
 (17, 0.7615941559557649),
 (18, 0.7615941559557649),
 (19, 0.7615941559557649),
 (20, 0.7615941559557649),
 (21, 0.7615941559557649),
 (22, 0.7615941559557649),
 (23, 0.7615941559557649),
 (24, 0.7615941559557649),
 (25, 0.7615941559557649),
 (26, 0.7615941559557649),
 (27, 0.7615941559557649),
 (28, 0.7615941559557649),
 (29, 0.7615941559557649),
 (30, 0.7615941559557649),
 (31, 0.7615941559557649),
 (32, 0.7615941559557649),
 (33, 0.761

In [14]:
#The exploratory steps from the previous cells, wrapped into one function for the Recommendation System
def give_rec(title, sig=sig, n=6):
  """Return the titles of the blogs most similar to the blog called `title`.

  Parameters
  ----------
  title : str
      A value of the 'blog_title' column; it is looked up through `indices`.
  sig : 2-D array, optional
      Pairwise similarity scores, where row i holds the scores of blog i
      against every blog. Defaults to the notebook-level `sig` matrix.
  n : int, optional
      Number of recommendations to return (default 6).

  Returns
  -------
  pandas.Series
      'blog_title' entries of the `n` best-scoring other blogs, best first.

  Raises
  ------
  KeyError
      If `title` is not present in `indices`.
  """
  idx = indices[title]   #Row number of the requested blog
  if isinstance(idx, pd.Series):   #Several blogs share this title -> use the first one
    idx = idx.iloc[0]

  #Remove the requested blog by its row number. Slicing the sorted list from
  #position 1 only removes it when it happens to sort first, which is not the
  #case when other blogs tie with it on score.
  sig_scores = [(i, score) for i, score in enumerate(sig[idx]) if i != idx]
  sig_scores.sort(key=lambda pair: pair[1], reverse=True)   #Stable sort: tied blogs keep row order
  blog_indices = [i for i, _ in sig_scores[:n]]      #Rows of the n best matches
  return merged_df_dropped['blog_title'].iloc[blog_indices]        #Titles of the n most similar blogs

In [15]:
#Try the recommender on one existing blog title
give_rec('Role of Productivity Apps')

46    Understanding the Key Productivity Indicators ...
47                 Productivity Management in retailing
48            The Steps towards Increasing Productivity
49    Four Crucial Strategies for Efficient Home-Bas...
50    Using webmaster staffing to grow your online b...
51    Stay Connected: Discovering the Latest Busines...
Name: blog_title, dtype: object