In [25]:
import fitz  # PyMuPDF

def extract_text_from_pdf(file_path):
    text = ""
    try:
        with fitz.open(file_path) as doc:
            for page in doc:
                text += page.get_text()
        return text
    except Exception as e:
        print(f"Error extracting text from {file_path}: {e}")
        return None


In [12]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    processed_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalpha() and token.lower() not in stop_words]
    return ' '.join(processed_tokens)

# Example usage
processed_text = preprocess_text(text)
print(processed_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Microsoft\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Microsoft\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Microsoft\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


unit measurement figure image might showing number thing might whirlpool tank water perhaps collage paint shiny bead done art class without knowing size object unit recognize meter inch difficult know looking fact image show whirlpool galaxy companion galaxy diameter km across credit modification work beckwith stsci hubble heritage team esa nasa chapter outline scope scale physic unit standard unit conversion dimensional analysis estimate fermi calculation significant figure solving problem physic introduction noted figure caption image whirlpool galaxy examine first section chapter galaxy immense atom small yet law physic describe along rest indication underlying unity universe law physic surprisingly implying underlying simplicity nature apparent complexity text learn law physic galaxy atom may seem far removed daily life begin explore subject may soon come chapter unit measurement realize physic play much larger role life first thought matter life goal career choice scope scale phys

In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Generate mock usage data
documents = ['D:\\Work and Assignments\\Python\\Python Projects\\NLP related projects\\Chapter1.pdf',
             'D:\\Work and Assignments\\Python\\Python Projects\\NLP related projects\\Chapter2.pdf',
             'D:\\Work and Assignments\\Python\\Python Projects\\NLP related projects\\Chapter3.pdf',
             'D:\\Work and Assignments\\Python\\Python Projects\\NLP related projects\\Chapter4.pdf',
             'D:\\Work and Assignments\\Python\\Python Projects\\NLP related projects\\Chapter5.pdf']
usage_data = []

for doc in documents:
    for _ in range(random.randint(5, 20)):
        usage_data.append({
            'document': doc,
            'access_time': datetime.now() - timedelta(days=random.randint(0, 30)),
            'user_id': random.randint(1, 10)
        })

usage_df = pd.DataFrame(usage_data)
usage_df


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Analyze document access frequency
access_counts = usage_df['document'].value_counts()

plt.figure(figsize=(10, 6))
sns.barplot(x=access_counts.index, y=access_counts.values)
plt.title('Document Access Frequency')
plt.xlabel('Document')
plt.ylabel('Number of Accesses')
plt.show()


In [28]:
from collections import Counter

def analyze_text(documents):
    word_counts = Counter()
    for text in documents:
        processed_text = preprocess_text(text)
        word_counts.update(processed_text.split())
    return word_counts

# Example usage
texts = [extract_text_from_pdf(doc) if doc.endswith('.pdf') else 
         extract_text_from_docx(doc) if doc.endswith('.docx') else 
         extract_text_from_xlsx(doc) for doc in documents if doc]
word_counts = analyze_text(texts)

print("Most common words:", word_counts.most_common(10))


Most common words: [('vector', 1380), ('force', 868), ('velocity', 796), ('acceleration', 682), ('b', 658), ('motion', 564), ('direction', 528), ('equation', 469), ('figure', 457), ('time', 453)]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_documents(texts):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix

# Example usage
similarity_matrix = find_similar_documents(texts)

print("Document Similarity Matrix:\n", similarity_matrix)


In [None]:
def main():
    # Extract and preprocess text from documents
    documents = ['D:\\Work and Assignments\\Python\\Python Projects\\NLP related projects\\Chapter1.pdf',
             'D:\\Work and Assignments\\Python\\Python Projects\\NLP related projects\\Chapter2.pdf',
             'D:\\Work and Assignments\\Python\\Python Projects\\NLP related projects\\Chapter3.pdf',
             'D:\\Work and Assignments\\Python\\Python Projects\\NLP related projects\\Chapter4.pdf',
             'D:\\Work and Assignments\\Python\\Python Projects\\NLP related projects\\Chapter5.pdf']
    texts = [extract_text_from_pdf(doc) if doc.endswith('.pdf') else 
             extract_text_from_docx(doc) if doc.endswith('.docx') else 
             extract_text_from_xlsx(doc) for doc in documents if doc]
    processed_texts = [preprocess_text(text) for text in texts if text]

    # Analyze usage patterns
    usage_df = generate_mock_usage_data(documents)
    analyze_usage_patterns(usage_df)

    # Analyze text for knowledge gaps
    word_counts = analyze_text(processed_texts)
    print("Most common words:", word_counts.most_common(10))

    # Identify redundancies
    similarity_matrix = find_similar_documents(processed_texts)
    generate_similarity_heatmap(similarity_matrix, documents)

def generate_mock_usage_data(documents):
    # Generate mock usage data
    usage_data = []
    for doc in documents:
        for _ in range(random.randint(5, 20)):
            usage_data.append({
                'document': doc,
                'access_time': datetime.now() - timedelta(days=random.randint(0, 30)),
                'user_id': random.randint(1, 10)
            })
    return pd.DataFrame(usage_data)

def analyze_usage_patterns(usage_df):
    access_counts = usage_df['document'].value_counts()
    plt.figure(figsize=(10, 6))
    sns.barplot(x=access_counts.index, y=access_counts.values)
    plt.title('Document Access Frequency')
    plt.xlabel('Document')
    plt.ylabel('Number of Accesses')
    plt.show()

def generate_similarity_heatmap(similarity_matrix, documents):
    plt.figure(figsize=(10, 8))
    sns.heatmap(similarity_matrix, annot=True, xticklabels=documents, yticklabels=documents)
    plt.title('Document Similarity Heatmap')
    plt.show()

if __name__ == "__main__":
    main()


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import fitz  # PyMuPDF
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from textblob import TextBlob
import json
import datetime

def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

def read_csv(file_path):
    return pd.read_csv(file_path)

def read_excel(file_path):
    return pd.read_excel(file_path)

def read_txt(file_path):
    with open(file_path, 'r') as file:
        return file.read()

def process_files(file_paths):
    documents = []
    for file_path in file_paths:
        if file_path.endswith('.pdf'):
            content = extract_text_from_pdf(file_path)
        elif file_path.endswith('.docx'):
            content = extract_text_from_docx(file_path)
        elif file_path.endswith('.csv'):
            content = read_csv(file_path).to_string()
        elif file_path.endswith('.xlsx'):
            content = read_excel(file_path).to_string()
        elif file_path.endswith('.txt'):
            content = read_txt(file_path)
        else:
            print(f"Unsupported file type: {file_path}")
            continue
        documents.append({
            'file_path': file_path,
            'content': content,
            'views': np.random.randint(0, 100),  # Simulated data
            'downloads': np.random.randint(0, 100),  # Simulated data
            'edits': np.random.randint(0, 10),  # Simulated data
            'feedback': np.random.choice(['Good', 'Average', 'Poor']),  # Simulated data
            'ratings': np.random.uniform(1, 5),  # Simulated data
            'last_accessed': datetime.datetime.now() - pd.to_timedelta(np.random.randint(0, 365), unit='d')
        })
    return pd.DataFrame(documents)

# Function to calculate document access metrics
def document_access_metrics(documents):
    documents['total_interactions'] = documents[['views', 'downloads', 'edits']].sum(axis=1)
    most_accessed = documents.sort_values(by='total_interactions', ascending=False).head(1)
    least_accessed = documents.sort_values(by='total_interactions').head(1)
    return most_accessed, least_accessed

# Function to monitor user engagement
def user_engagement(user_interactions):
    user_activity = user_interactions.groupby('user_id').size()
    active_users = user_activity[user_activity > 1]
    inactive_users = user_activity[user_activity == 1]
    return active_users, inactive_users

# Function to perform trend analysis
def trend_analysis(documents):
    documents['last_accessed'] = pd.to_datetime(documents['last_accessed'])
    documents.set_index('last_accessed', inplace=True)
    trend = documents.resample('M').sum()
    trend = trend[(trend.T != 0).any()]
    trend = trend.sort_values(by='total_interactions', ascending=False)
    return trend

# Function to collect and analyze feedback
def sentiment_analysis(documents):
    documents['sentiment'] = documents['feedback'].apply(lambda x: TextBlob(x).sentiment.polarity)
    return documents

# Function to identify knowledge gaps
def search_query_analysis(search_queries):
    no_results = search_queries[search_queries['results'] == 0]
    return no_results

# Function to detect redundancy in documents
def redundancy_detection(documents):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents['content'])
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    suggetion = 'NA'
    num_docs = len(documents)
    for i in range(num_docs):
        for j in range(num_docs):
            if i != j and cosine_sim[i, j] > 0.95:
                suggetion = f"Documents '{documents.iloc[i]['file_path']}' and '{documents.iloc[j]['file_path']}' contain similar content."
                
    return cosine_sim, suggetion

# Function to generate dashboard
def generate_dashboard(documents, user_interactions, search_queries):
    most_accessed, least_accessed = document_access_metrics(documents)
    active_users, inactive_users = user_engagement(user_interactions)
    trend = trend_analysis(documents)
    sentiment = sentiment_analysis(documents[['feedback']])
    gaps = search_query_analysis(search_queries)
    redundancy,sug = redundancy_detection(documents)

    print("Most Accessed Document:")
    display(most_accessed)
    print("Least Accessed Document:")
    display(least_accessed)
    print("Active Users:")
    display(active_users)
    print("Inactive Users:")
    display(inactive_users)
    print("Trend Analysis:")
    display(trend)
    print("Sentiment Analysis:")
    display(sentiment)
    print("Knowledge Gaps:")
    display(gaps)
    print("Document Redundancy:\n", redundancy)
    print("\n\nSuggetions: ", sug)
    
    
    plt.figure(figsize=(12, 6))
    sns.barplot(x=documents['file_path'], y=documents['total_interactions'])
    plt.title('Document Access Metrics: Total Interactions')
    plt.xlabel('Document')
    plt.ylabel('Total Interactions')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # User Engagement
    plt.figure(figsize=(8, 5))
    sns.countplot(x=user_interactions['user_id'])
    plt.title('User Engagement: Number of Interactions')
    plt.xlabel('User ID')
    plt.ylabel('Number of Interactions')
    plt.tight_layout()
    plt.show()

    # Trend Analysis
    plt.figure(figsize=(10, 6))
    sns.lineplot(x=trend.index, y=trend['total_interactions'])
    plt.title('Trend Analysis: Document Interactions over Time')
    plt.xlabel('Time')
    plt.ylabel('Total Interactions')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Knowledge Gaps
    plt.figure(figsize=(8, 5))
    sns.barplot(x='query', y='results', data=search_queries)
    plt.title('Knowledge Gaps: Search Queries with No Results')
    plt.xlabel('Search Query')
    plt.ylabel('Number of Results')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    #Redundancy Heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(redundancy, annot=True, fmt='.2f', cmap='coolwarm', xticklabels=documents['file_path'], yticklabels=documents['file_path'])
    plt.title('Redundancy Heatmap: Document Similarity')
    plt.xlabel('Document')
    plt.ylabel('Document')
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)
    plt.tight_layout()
    plt.show()
    

# Function to export data
def export_data(data, format='json'):
    if format == 'csv':
        data.to_csv('exported_data.csv', index=False)
    elif format == 'excel':
        data.to_excel('exported_data.xlsx', index=False)
    elif format == 'json':
        data.to_json('exported_data.json')
    else:
        print("Unsupported format")



import matplotlib.pyplot as plt
import seaborn as sns
   

# Main function
if __name__ == "__main__":
    # Provide the file paths here
    file_paths = ['Chapter1.pdf', 'Chapter2.pdf', 'demo.csv', 'Chapter1 copy.pdf']
    
    documents = process_files(file_paths)
    
    # Sample user interactions and search queries data
    user_interactions = pd.DataFrame({
        'user_id': [1, 2, 1],
        'doc_id': [1, 2, 3],
        'time_spent': [30, 45, 10],
        'activity': ['view', 'download', 'edit']
    })
    
    search_queries = pd.DataFrame({
        'query': ['document management', 'analytics', 'reporting'],
        'results': [10, 5, 0]
    })

    generate_dashboard(documents, user_interactions, search_queries)
    export_data(documents, format='csv')



In [29]:
user_interactions = pd.DataFrame({
        'user_id': [1, 2, 1],
        'doc_id': [1, 2, 3],
        'time_spent': [30, 45, 10],
        'activity': ['view', 'download', 'edit']
    })
user_interactions

Unnamed: 0,user_id,doc_id,time_spent,activity
0,1,1,30,view
1,2,2,45,download
2,1,3,10,edit


In [66]:
dataa = pd.read_csv('D:\\Work and Assignments\\Python\\Python Projects\\NLP related projects\\exported_data.csv')
dataa


Unnamed: 0,file_path,content,views,downloads,edits,feedback,ratings,total_interactions
0,Chapter1.pdf,1 | UNITS AND\nMEASUREMENT\nFigure 1.1 This im...,37,44,6,Average,1.194156,87
1,Chapter2.pdf,2 | VECTORS\nFigure 2.1 A signpost gives infor...,83,10,0,Average,4.772089,93
2,demo.csv,...,20,77,3,Average,1.527848,100
3,Chapter1 copy.pdf,1 | UNITS AND\nMEASUREMENT\nFigure 1.1 This im...,93,56,8,Poor,1.888624,157


In [23]:
file_paths = ['Chapter1.pdf', 'Chapter2.pdf', 'demo.csv', 'Chapter3.pdf', 'Chapter4.pdf', 'Chapter5.pdf', 'Chapter6.pdf', 'Chapter7.pdf', 'Chapter8.pdf', 'Chapter9.pdf', 'Chapter10.pdf']
documents = process_files(file_paths)

def document_access_metrics(documents):
    documents['total_interactions'] = documents[['views', 'downloads', 'edits']].sum(axis=1)
    documents = documents.drop('content', axis=1)
    most_accessed = documents.sort_values(by='total_interactions', ascending=False).head(1)
    least_accessed = documents.sort_values(by='total_interactions').head(1)
    return documents

document_access_metrics(documents)

Unnamed: 0,file_path,views,downloads,edits,feedback,ratings,last_accessed,total_interactions
0,Chapter1.pdf,35,58,2,Good,1.311433,2023-08-12 15:13:34.854395,95
1,Chapter2.pdf,35,77,1,Good,4.728374,2024-04-11 15:13:35.054921,113
2,demo.csv,14,80,2,Poor,3.843073,2024-03-25 15:13:35.060917,96
3,Chapter3.pdf,68,23,3,Average,3.026007,2024-06-02 15:13:35.302439,94


In [38]:
file_paths = ['Chapter1.pdf', 'Chapter2.pdf', 'demo.csv', 'Chapter3.pdf', 'Chapter4.pdf', 'Chapter5.pdf', 'Chapter6.pdf', 'Chapter7.pdf', 'Chapter8.pdf', 'Chapter9.pdf', 'Chapter10.pdf']
documents = process_files(file_paths)

def trend_analysis(documents):
    document_access_metrics(documents)
    documents['last_accessed'] = pd.to_datetime(documents['last_accessed'])
    # documents.set_index('last_accessed', inplace=True)
    # trend = documents.resample('M').sum()
    # trend = trend[(trend.T != 0).any()]
    trend = documents.sort_values(by='total_interactions', ascending=False)
    return trend
trend_analysis(documents)


Unnamed: 0,file_path,content,views,downloads,edits,feedback,ratings,last_accessed,total_interactions
6,Chapter6.pdf,6 | APPLICATIONS OF\nNEWTON'S LAWS\nFigure 6.1...,86,89,4,Poor,4.841094,2024-03-17 15:56:05.322908,179
2,demo.csv,...,74,77,0,Average,1.997083,2024-06-08 15:56:04.601086,151
0,Chapter1.pdf,1 | UNITS AND\nMEASUREMENT\nFigure 1.1 This im...,42,83,9,Good,1.910632,2023-12-22 15:56:04.197309,134
3,Chapter3.pdf,3 | MOTION ALONG A\nSTRAIGHT LINE\nFigure 3.1 ...,59,62,7,Poor,4.553427,2024-02-05 15:56:04.776375,128
5,Chapter5.pdf,5 | NEWTON'S LAWS OF\nMOTION\nFigure 5.1 The G...,94,24,8,Average,4.689144,2023-12-23 15:56:05.139617,126
1,Chapter2.pdf,2 | VECTORS\nFigure 2.1 A signpost gives infor...,39,48,3,Average,4.736407,2024-06-06 15:56:04.593005,90
10,Chapter10.pdf,10 | FIXED-AXIS ROTATION\nFigure 10.1 Brazos w...,68,19,0,Good,1.3071,2024-01-22 15:56:06.002131,87
7,Chapter7.pdf,7 | WORK AND KINETIC\nENERGY\nFigure 7.1 A spr...,67,17,1,Poor,3.808337,2024-04-10 15:56:05.463069,85
8,Chapter8.pdf,8 | POTENTIAL ENERGY\nAND CONSERVATION OF\nENE...,12,51,8,Poor,3.581967,2024-05-14 15:56:05.688519,71
9,Chapter9.pdf,9 | LINEAR MOMENTUM\nAND COLLISIONS\nFigure 9....,7,53,3,Good,3.17132,2024-08-03 15:56:05.857980,63
