In [3]:
!pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.5-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.5


In [4]:
pip install spacy textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyphen (from textstat)
  Downloading pyphen-0.15.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyphen, textstat
Successfully installed pyphen-0.15.0 textstat-0.7.3


In [64]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357243 sha256=f5ea368920d66f4b81b6facdfba4f10a6f794f646ee2f994832adac69de9c9c4
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [88]:
import os
import pymupdf
import spacy
import sqlite3
import pandas as pd
from textstat import textstat  # To calculate reading ease
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV


In [47]:

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

# Define the folder containing the PDFs
pdf_folder = '/content/'
db_name = 'pdfnotes_metadata.db'
# Function to extract text from specific pages
def extract_text_from_pages(pdf_path, page_numbers):
    doc = pymupdf.open(pdf_path)
    text = ""
    for page_num in page_numbers:
        page = doc.load_page(page_num - 1)  # page numbers are 0-indexed in pymupdf
        text += page.get_text()
    return text

# Function to estimate reading duration
def estimate_reading_duration(text, wpm=200):
    word_count = len(text.split())
    duration_minutes = word_count / wpm
    return f"{int(duration_minutes)} minutes"

# Function to determine difficulty based on Flesch Reading Ease score
def determine_difficulty(text):
    flesch_score = textstat.flesch_reading_ease(text)
    if flesch_score >= 80:
        return "Easy"
    elif flesch_score >= 50:
        return "Intermediate"
    else:
        return "Hard"

# Function to determine learning style
def determine_learning_style(pdf_name, text):
    if 'audio' in pdf_name.lower() or 'transcript' in text.lower():
        return "Audio"
    else:
        return "Reading/Writing"

# Function to extract metadata including subject and module
def extract_metadata(pdf_name, text):
    # Use the filename as the title (without .pdf extension)
    title = os.path.splitext(pdf_name)[0]

    # Extract subject and module from filename
    subject_name, module_name = os.path.splitext(pdf_name)[0].split('-')[:2]
    # Assuming subject_name-module_name.pdf format

    # Extract keywords (nouns and proper nouns)
    nlp_doc = nlp(text)
    keywords = [token.lemma_ for token in nlp_doc if token.pos_ in ['NOUN', 'PROPN']]

    # Determine difficulty
    difficulty = determine_difficulty(text)

    # Estimate duration
    duration = estimate_reading_duration(text)

    # Determine learning style
    learning_style = determine_learning_style(pdf_name, text)

    # Generate metadata
    metadata = {
        "title": title,
        "keywords": keywords,
        "difficulty": difficulty,
        "format": "PDF",
        "duration": duration,
        "learning_style": learning_style,
        "subject": subject_name.strip(),  # Include subject in metadata
        "module": module_name.strip()  # Include module in metadata
    }
    return metadata

# Function to process all PDFs in the folder
def process_pdfs(pdf_folder):
    pdf_metadata = []
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder, pdf_file)

            # Extract text from specific pages (e.g., first 5 pages)
            text = extract_text_from_pages(pdf_path, range(1, 11))

            # Extract metadata including subject and module
            metadata = extract_metadata(pdf_file, text)
            metadata['file_path'] = pdf_path
            pdf_metadata.append(metadata)
    return pdf_metadata

# Create the database table with subject and module included
def create_database(db_name):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS pdfs (
            id INTEGER PRIMARY KEY,
            title TEXT,
            keywords TEXT,
            difficulty TEXT,
            format TEXT,
            duration TEXT,
            learning_style TEXT,
            subject TEXT,
            module TEXT,
            file_path TEXT
        )
    ''')
    conn.commit()
    conn.close()

# Insert the metadata into the database with subject and module included
def insert_metadata(db_name, metadata_list):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    for metadata in metadata_list:
        # Use a try-except block to handle potential errors during insertion
        try:
            cursor.execute('''
                INSERT INTO pdfs (title, keywords, difficulty, format, duration, learning_style, subject, module, file_path)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (metadata['title'], ','.join(metadata['keywords']), metadata['difficulty'], metadata['format'], metadata['duration'], metadata['learning_style'], metadata['subject'], metadata['module'], metadata['file_path']))
        except sqlite3.IntegrityError as e:
            print(f"Error inserting {metadata['title']}: {e}")

    # Commit changes and close connection
    conn.commit()
    conn.close()

# Function to query and display contents of the pdfs table
def query_and_display_pdfs(db_name):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    # Execute the query
    cursor.execute('SELECT * FROM pdfs')
    rows = cursor.fetchall()

    # Print header
    print(f"{'ID':<5} | {'Title':<30} | {'Keywords':<30} | {'Difficulty':<15} | {'Format':<10} | {'Duration':<10} | {'Learning Style':<20} | {'Subject':<20} | {'Module':<20} | {'File Path':<50}")
    print("-" * 200)

    # Print each row
    for row in rows:
        pdf_id, title, keywords, difficulty, format, duration, learning_style, subject, module, file_path = row
        # Truncate long strings for display
        title = title[:30] + (title[30:] and '...')
        keywords = keywords[:30] + (keywords[30:] and '...')
        file_path = file_path[:50] + (file_path[50:] and '...')
        print(f"{pdf_id:<5} | {title:<30} | {keywords:<30} | {difficulty:<15} | {format:<10} | {duration:<10} | {learning_style:<20} | {subject:<20} | {module:<20} | {file_path:<50}")

    # Close connection
    conn.close()


In [48]:
# Process the PDFs and store metadata in the database
metadata_list = process_pdfs(pdf_folder)

# Print metadata to verify
for metadata in metadata_list:
    print(metadata)

{'title': 'ML-M2', 'keywords': ['dataset', 'R.', 'Kelley', 'Pace', 'Ronald', 'Barry', 'Sparse', 'Spatial', 'Autoregressions', 'statistic', 'Probability', 'Letters', 'no', '.', 'Real', 'Data', 'Machine', 'Learning', 'world', 'datum', 'dataset', 'thousand', 'dataset', 'sort', 'domain', 'place', 'datum', 'datum', 'repository', 'UC', 'Irvine', 'Machine', 'Learning', 'Repository', 'Kaggle', 'dataset', 'Amazon', 'AWS', 'dataset', '•', 'Meta', 'portal', 'datum', 'repository', 'http://dataportals.org/', 'http://opendatamonitor.eu/', 'http://quandl.com/', 'page', 'datum', 'repository', 'Wikipedia', 'list', 'Machine', 'Learning', 'dataset', 'Quora.com', 'question', 'dataset', 'subreddit', 'chapter', 'California', 'Housing', 'Prices', 'StatLib', 'itory2', 'Figure', 'dataset', 'datum', 'California', 'cen‐', 'sus', 'house', 'Bay', 'Area', 'time', 'quality', 'learning', 'datum', 'attribute', 'feature', 'purpose', '|', 'chapter', 'end', 'end', 'Machine', 'Learning', 'Project', 'MODULE-2', 'Figure', '

In [49]:
create_database(db_name)
insert_metadata(db_name, metadata_list)
query_and_display_pdfs(db_name)

ID    | Title                          | Keywords                       | Difficulty      | Format     | Duration   | Learning Style       | Subject              | Module               | File Path                                         
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1     | ML-M2                          | dataset,R.,Kelley,Pace,Ronald,... | Intermediate    | PDF        | 15 minutes | Reading/Writing      | ML                   | M2                   | /content/ML-M2.pdf                                
2     | ML-M5                          | chapter,BAYESIAN,LEARNING,reas... | Hard            | PDF        | 19 minutes | Reading/Writing      | ML                   | M5                   | /content/ML-M5.pdf                                
3     | DS&A-M4                        | tree,mystery,Jim,Woodring,Data... | In

In [50]:
#testing if the query is properly retriving the data or not
# Function to query and retrieve metadata for a specific PDF
def query_pdf_metadata(db_name, pdf_title):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    # Execute the query to retrieve metadata for the given PDF title
    cursor.execute('''
        SELECT * FROM pdfs WHERE title = ?
    ''', (pdf_title,))
    row = cursor.fetchone()

    # Close connection
    conn.close()

    return row

# Example usage
if __name__ == "__main__":
    db_name = 'pdfnotes_metadata.db'
    pdf_title = "DS&A-M4"

    # Query metadata for the specific PDF title
    pdf_metadata = query_pdf_metadata(db_name, pdf_title)

    # Display metadata
    if pdf_metadata:
        pdf_id, title, keywords, difficulty, format, duration, learning_style, subject, module, file_path = pdf_metadata
        print(f"Title: {title}")
        print(f"Keywords: {keywords}")
        print(f"Difficulty: {difficulty}")
        print(f"Format: {format}")
        print(f"Duration: {duration}")
        print(f"Learning Style: {learning_style}")
        print(f"Subject: {subject}")
        print(f"Module: {module}")
        print(f"File Path: {file_path}")
    else:
        print(f"PDF '{pdf_title}' not found in the database.")


Title: DS&A-M4
Keywords: tree,mystery,Jim,Woodring,DataSciencester,VP,Talent,number,job,candidate,site,degree,success,data,attribute,candidate,candidate,datum,model,identifying,candidate,time,interview,fit,decision,tree,modeling,tool,data,scientist,kit,Decision,Trees,MODULE-4,decision,tree,decision,tree,tree,structure,number,decision,path,outcome,path,game,Twenty,Questions,decision,tree,example,animal,leg,no,no,back,cent,coin,echidna,path,leg,cent,coin,Echidna,idiosyncratic,animal,decision,tree,figure,figure,animal,decision,tree,decision,tree,lot,process,prediction,model,decision,tree,mix,number,leg,delicious,attribute,datum,attribute,time,decision,tree,set,training,datum,problem,tree,one,data,set,lot,work,decision,tree,training,datum,datum,way,people,decision,tree,classification,tree,output,regression,tree,output,chapter,classification,tree,ID3,algorithm,decision,tree,set,datum,decision,tree,thing,problem,output,candidate,website,visitor,advertisement,a,advertisement,b,food,office,fri

In [51]:
#checking the content of the database created
# Connect to SQLite database
conn = sqlite3.connect('pdfnotes_metadata.db')

# Query the database to fetch metadata
query = "SELECT * FROM pdfs"
df = pd.read_sql_query(query, conn)

# Close the database connection
conn.close()

# Display the loaded dataset
print(df.head())

   id    title                                           keywords  \
0   1    ML-M2  dataset,R.,Kelley,Pace,Ronald,Barry,Sparse,Spa...   
1   2    ML-M5  chapter,BAYESIAN,LEARNING,reasoning,approach,i...   
2   3  DS&A-M4  tree,mystery,Jim,Woodring,DataSciencester,VP,T...   
3   4    ML-M4  decision,tree,Early,Release,ebook,book,form,au...   
4   5  DS&A-M1  joke,data,scientist,statistic,computer,scienti...   

     difficulty format    duration   learning_style subject module  \
0  Intermediate    PDF  15 minutes  Reading/Writing      ML     M2   
1          Hard    PDF  19 minutes  Reading/Writing      ML     M5   
2  Intermediate    PDF  10 minutes  Reading/Writing    DS&A     M4   
3  Intermediate    PDF  13 minutes  Reading/Writing      ML     M4   
4  Intermediate    PDF  12 minutes  Reading/Writing    DS&A     M1   

              file_path  
0    /content/ML-M2.pdf  
1    /content/ML-M5.pdf  
2  /content/DS&A-M4.pdf  
3    /content/ML-M4.pdf  
4  /content/DS&A-M1.pdf  


# Laoding ,preprocessing and building the recommender model

In [77]:

# Defining a sample user interactions dataset (later when I make a web page, I can load this content dynamically from user interactions)
dataset = [
    (1, 1, 'view', '2024-06-23 10:15:00', 4),
    (1, 3, 'view', '2024-06-23 10:20:00', 3),
    (1, 5, 'view', '2024-06-23 10:25:00', 5),
    (1, 7, 'view', '2024-06-23 10:30:00', 2),
    (1, 9, 'view', '2024-06-23 10:35:00', 4),
    (2, 2, 'view', '2024-06-23 11:00:00', 5),
    (2, 4, 'view', '2024-06-23 11:05:00', 1),
    (2, 6, 'view', '2024-06-23 11:10:00', 3),
    (2, 8, 'view', '2024-06-23 11:15:00', 4),
    (2, 10, 'view', '2024-06-23 11:20:00', 2),
    (3, 3, 'view', '2024-06-23 11:30:00', 4),
    (3, 5, 'view', '2024-06-23 11:35:00', 5),
    (3, 7, 'view', '2024-06-23 11:40:00', 3),
    (3, 9, 'view', '2024-06-23 11:45:00', 2),
    (3, 11, 'view', '2024-06-23 11:50:00', 4),
    (4, 4, 'view', '2024-06-23 12:00:00', 5),
    (4, 6, 'view', '2024-06-23 12:05:00', 3),
    (4, 8, 'view', '2024-06-23 12:10:00', 4),
    (4, 10, 'view', '2024-06-23 12:15:00', 2),
    (4, 12, 'view', '2024-06-23 12:20:00', 1),
    (5, 5, 'view', '2024-06-23 12:30:00', 4),
    (5, 7, 'view', '2024-06-23 12:35:00', 3),
    (5, 9, 'view', '2024-06-23 12:40:00', 5),
    (5, 11, 'view', '2024-06-23 12:45:00', 2),
    (5, 13, 'view', '2024-06-23 12:50:00', 4),
    (6, 6, 'view', '2024-06-23 13:00:00', 5),
    (6, 8, 'view', '2024-06-23 13:05:00', 1),
    (6, 10, 'view', '2024-06-23 13:10:00', 3),
    (6, 12, 'view', '2024-06-23 13:15:00', 4),
    (6, 14, 'view', '2024-06-23 13:20:00', 2),
    (7, 7, 'view', '2024-06-23 13:30:00', 3),
    (7, 9, 'view', '2024-06-23 13:35:00', 4),
    (7, 11, 'view', '2024-06-23 13:40:00', 5),
    (7, 13, 'view', '2024-06-23 13:45:00', 2),
    (7, 15, 'view', '2024-06-23 13:50:00', 1),
    (8, 8, 'view', '2024-06-23 14:00:00', 4),
    (8, 10, 'view', '2024-06-23 14:05:00', 5),
    (8, 12, 'view', '2024-06-23 14:10:00', 3),
    (8, 14, 'view', '2024-06-23 14:15:00', 2),
    (8, 16, 'view', '2024-06-23 14:20:00', 4),
    (9, 9, 'view', '2024-06-23 14:30:00', 5),
    (9, 11, 'view', '2024-06-23 14:35:00', 3),
    (9, 13, 'view', '2024-06-23 14:40:00', 4),
    (9, 15, 'view', '2024-06-23 14:45:00', 2),
    (9, 17, 'view', '2024-06-23 14:50:00', 1),
    (10, 10, 'view', '2024-06-23 15:00:00', 3),
    (10, 12, 'view', '2024-06-23 15:05:00', 4),
    (10, 14, 'view', '2024-06-23 15:10:00', 5),
    (10, 16, 'view', '2024-06-23 15:15:00', 2),
    (10, 18, 'view', '2024-06-23 15:20:00', 1)
]

# Convert to DataFrame
df = pd.DataFrame(dataset, columns=['user_id', 'pdf_id', 'interaction_type', 'timestamp', 'rating'])

# Connect to SQLite database (or create it if not exist)
conn = sqlite3.connect('user_interactions.db')
cursor = conn.cursor()

# Create interactions table
cursor.execute('''
    CREATE TABLE IF NOT EXISTS interactions (
        user_id INTEGER,
        pdf_id INTEGER,
        interaction_type TEXT,
        timestamp TEXT,
        rating INTEGER,
        PRIMARY KEY (user_id, pdf_id, timestamp)
    )
''')

# Insert data into the table
for row in dataset:
    cursor.execute('''
        INSERT OR IGNORE INTO interactions (user_id, pdf_id, interaction_type, timestamp, rating)
        VALUES (?, ?, ?, ?, ?)
    ''', row)

# Commit changes and close connection
conn.commit()
conn.close()

print("Dataset successfully stored in SQLite database.")


Dataset successfully stored in SQLite database.


In [78]:
#checking the content of the dataset
# Connect to SQLite database
conn = sqlite3.connect('user_interactions.db')

# Query the database to fetch metadata
query = "SELECT * FROM interactions"
df = pd.read_sql_query(query, conn)

# Close the database connection
conn.close()

# Display the loaded dataset
print(df.head())

   user_id  pdf_id interaction_type            timestamp  rating
0        1       1             view  2024-06-23 10:15:00       4
1        1       3             view  2024-06-23 10:20:00       3
2        1       5             view  2024-06-23 10:25:00       5
3        1       7             view  2024-06-23 10:30:00       2
4        1       9             view  2024-06-23 10:35:00       4


# Preprocess Data


In [79]:

# Convert timestamp to datetime format (if needed)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Drop 'interaction_type' column
df.drop('interaction_type', axis=1, inplace=True)

# Check for missing values
print(df.isnull().sum())

# Handle missing values if any
df = df.dropna()


user_id      0
pdf_id       0
timestamp    0
rating       0
dtype: int64


In [80]:
# Split the dataset into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
# train_data, test_data


In [86]:
# Define parameter grid
param_grid = {
    'n_factors': [50, 75, 100, 125, 150, 175, 200],
    'n_epochs': [10, 20, 30, 40],
    'lr_all': [0.001, 0.002, 0.003, 0.004, 0.005],
    'reg_all': [0.01, 0.02, 0.03, 0.04, 0.05, 0.1]
}


# Define your DataFrame 'df' which includes columns user_id, pdf_id, and rating

# Define the Reader and load data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'pdf_id', 'rating']], reader)

# Instantiate the algorithm
algo = SVD()

# Grid search
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

# Best RMSE and MAE scores
print("Best RMSE score:", gs.best_score['rmse'])
print("Best MAE score:", gs.best_score['mae'])

# Best parameters
print("Best parameters:", gs.best_params['rmse'])


Best RMSE score: 1.3051385514721858
Best MAE score: 1.1272607130214753
Best parameters: {'n_factors': 100, 'n_epochs': 10, 'lr_all': 0.001, 'reg_all': 0.1}


In [None]:
# on chatgpt , for content based filtering try the code