In [2]:
import nltk
import textstat
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure necessary resources are available.
# 'punkt' serves older NLTK releases; NLTK 3.9+ makes word_tokenize look up
# 'punkt_tab' instead, so both are requested. nltk.download() returns False
# rather than raising when a package cannot be fetched (raise_on_error
# defaults to False), so the extra request is harmless on older installs.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Content-based metrics function
def content_based_metrics(text):
    """
    Summarises the vocabulary of a text after stopword removal.

    The text is lower-cased and split with NLTK's word_tokenize; tokens that
    are English stopwords or are not purely alphanumeric are discarded before
    anything is counted.

    Args:
    - text (str): The input text.

    Returns:
    - dict: "word_count", "unique_word_count", "lexical_diversity"
      (unique / total) and "average_word_length" (characters per kept
      token). Both ratios are 0 when no token survives the filtering.
    """
    english_stopwords = set(stopwords.words('english'))

    # Keep only alphanumeric, non-stopword tokens
    kept = []
    for token in word_tokenize(text.lower()):
        if token.isalnum() and token not in english_stopwords:
            kept.append(token)

    total = len(kept)
    distinct = len(set(kept))

    # Guard the two ratios against an empty token list
    if total != 0:
        diversity = distinct / total
        mean_length = sum(map(len, kept)) / total
    else:
        diversity = 0
        mean_length = 0

    return {
        "word_count": total,
        "unique_word_count": distinct,
        "lexical_diversity": diversity,
        "average_word_length": mean_length,
    }

# Readability metrics function
def readability_metrics(text):
    """
    Scores a text with five readability formulas from textstat.

    Args:
    - text (str): The input text.

    Returns:
    - dict: One entry per formula, keyed by the name of the textstat
      function that produced it ("flesch_reading_ease",
      "flesch_kincaid_grade", "smog_index", "coleman_liau_index",
      "automated_readability_index").
    """
    # Each value is computed straight from textstat, in the order listed
    return {
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "smog_index": textstat.smog_index(text),
        "coleman_liau_index": textstat.coleman_liau_index(text),
        "automated_readability_index": textstat.automated_readability_index(text),
    }

# Example usage with research paper and generated summary
def evaluate_paper_and_summary(paper_text, summary_text):
    """
    Runs the content-based and readability metrics on a paper and on its summary.

    Args:
    - paper_text (str): The full text of the research paper.
    - summary_text (str): The generated summary of the paper.

    Returns:
    - dict: Four metric dictionaries keyed "paper_content_metrics",
      "summary_content_metrics", "paper_readability_metrics" and
      "summary_readability_metrics".
    """
    # Content metrics first, then readability; paper before summary each time
    return {
        "paper_content_metrics": content_based_metrics(paper_text),
        "summary_content_metrics": content_based_metrics(summary_text),
        "paper_readability_metrics": readability_metrics(paper_text),
        "summary_readability_metrics": readability_metrics(summary_text),
    }

# Sample texts (replace with your own paper and summary).
# paper_text is a four-paragraph essay (paragraphs separated by "\n\n");
# summary_text is a two-sentence summary of it.
paper_text = """Queenstown, a vibrant tourist destination, thrives on its stunning scenery and diverse activities. However, this reliance on tourism presents a complex dynamic, with both opportunities and challenges for the local community. This paper explores the multifaceted impact of tourism on the Queenstown community, examining its economic benefits alongside the social and environmental consequences.\n\nTourism undeniably fuels the local economy, generating significant revenue through various sectors like accommodation, hospitality, and retail. Businesses directly and indirectly benefit from the influx of tourists, creating employment opportunities and stimulating economic growth.\n\nThe rapid growth of tourism also brings challenges. Overcrowding can lead to a decline in the quality of life for residents, with increased traffic congestion, strain on infrastructure, and a rise in housing costs pushing locals out. Furthermore, the environmental impact of tourism, such as pollution and habitat destruction, poses a long-term threat to the natural beauty that attracts tourists in the first place.\n\nTourism is a double-edged sword for Queenstown. While it provides economic opportunities, it also presents social and environmental challenges that require careful management. A sustainable approach to tourism development is crucial to ensure that the benefits are shared equitably and that the community's quality of life and the environment are protected for future generations."""

summary_text = """This research paper examines the dual impact of tourism on Queenstown, highlighting its economic benefits (job creation, revenue) alongside the social and environmental challenges (overcrowding, infrastructure strain, environmental degradation). It concludes that a sustainable tourism approach is vital for the community's well-being and the long-term preservation of its natural beauty."""


# Evaluate the metrics for both the research paper and the summary
evaluation_results = evaluate_paper_and_summary(paper_text, summary_text)

# Print the results: one "name:" header per metric group, followed by its dict
for key, value in evaluation_results.items():
    print(f"{key}:\n{value}\n")


paper_content_metrics:
{'word_count': 126, 'unique_word_count': 98, 'lexical_diversity': 0.7777777777777778, 'average_word_length': 8.0}

summary_content_metrics:
{'word_count': 31, 'unique_word_count': 29, 'lexical_diversity': 0.9354838709677419, 'average_word_length': 8.451612903225806}

paper_readability_metrics:
{'flesch_reading_ease': 10.3, 'flesch_kincaid_grade': 16.4, 'smog_index': 17.3, 'coleman_liau_index': 17.81, 'automated_readability_index': 17.1}

summary_readability_metrics:
{'flesch_reading_ease': -4.67, 'flesch_kincaid_grade': 20.1, 'smog_index': 0.0, 'coleman_liau_index': 20.83, 'automated_readability_index': 23.0}



[nltk_data] Downloading package punkt to /Users/amey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/amey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Approach 1: Lexical and Semantic Similarity (lexical part: TF-IDF cosine similarity)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to calculate cosine similarity between paper and summary
def cosine_similarity_tfidf(paper_text, summary_text):
    """
    Measures lexical overlap between a paper and its summary with TF-IDF.

    A TfidfVectorizer (English stop words removed) is fitted on exactly these
    two texts, so the vocabulary and IDF weights come from the pair alone.

    Args:
    - paper_text (str): The full text of the paper.
    - summary_text (str): The summary to compare against it.

    Returns:
    - The cosine similarity between the paper's and the summary's TF-IDF vectors.
    """
    # Row 0 holds the paper, row 1 the summary
    tfidf_rows = TfidfVectorizer(stop_words='english').fit_transform(
        [paper_text, summary_text]
    )

    # Pairwise similarities of the two rows; the off-diagonal entry is the score
    pairwise = cosine_similarity(tfidf_rows)
    return pairwise[0, 1]

# Example texts: a four-paragraph essay (paragraphs separated by "\n\n") and a
# two-sentence summary of it, defined in this cell so it does not rely on
# variables from another cell.
paper_text = """Queenstown, a vibrant tourist destination, thrives on its stunning scenery and diverse activities. However, this reliance on tourism presents a complex dynamic, with both opportunities and challenges for the local community. This paper explores the multifaceted impact of tourism on the Queenstown community, examining its economic benefits alongside the social and environmental consequences.\n\nTourism undeniably fuels the local economy, generating significant revenue through various sectors like accommodation, hospitality, and retail. Businesses directly and indirectly benefit from the influx of tourists, creating employment opportunities and stimulating economic growth.\n\nThe rapid growth of tourism also brings challenges. Overcrowding can lead to a decline in the quality of life for residents, with increased traffic congestion, strain on infrastructure, and a rise in housing costs pushing locals out. Furthermore, the environmental impact of tourism, such as pollution and habitat destruction, poses a long-term threat to the natural beauty that attracts tourists in the first place.\n\nTourism is a double-edged sword for Queenstown. While it provides economic opportunities, it also presents social and environmental challenges that require careful management. A sustainable approach to tourism development is crucial to ensure that the benefits are shared equitably and that the community's quality of life and the environment are protected for future generations."""

summary_text = """This research paper examines the dual impact of tourism on Queenstown, highlighting its economic benefits (job creation, revenue) alongside the social and environmental challenges (overcrowding, infrastructure strain, environmental degradation). It concludes that a sustainable tourism approach is vital for the community's well-being and the long-term preservation of its natural beauty."""


# Score the pair with the TF-IDF approach and report the result
similarity_score = cosine_similarity_tfidf(paper_text, summary_text)
print(f"Cosine Similarity (TF-IDF): {similarity_score}")


Cosine Similarity (TF-IDF): 0.4066029883245594


In [6]:
# Approach 1: Lexical and Semantic Similarity (semantic part: BERT embedding cosine similarity)

from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT model and tokenizer.
# Both are created from the same 'bert-base-uncased' checkpoint name and are
# kept as module-level globals that get_bert_embeddings reads. The progress
# bars in this cell's recorded output show the checkpoint files being fetched.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embeddings(text):
    """
    Embeds text with BERT using attention-mask-aware mean pooling.

    Uses the module-level `tokenizer` and `model`. Input longer than 512
    tokens is truncated by the tokenizer, so a long paper is represented only
    by its first 512 tokens.

    Args:
    - text (str or list of str): The text(s) to embed. When several texts of
      different lengths are passed, the shorter ones are padded; padded
      positions are excluded from the average.

    Returns:
    - numpy.ndarray: One pooled embedding row per input text.
    """
    # Tokenize input text (truncated to 512 tokens, padded to the longest item)
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Get embeddings from BERT without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    token_states = outputs.last_hidden_state

    # Weight each position by its attention mask (1 = real token, 0 = padding)
    # so padding never dilutes the mean; for a single unpadded text this is
    # the plain average over all tokens.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)  # avoid dividing by zero
    embeddings = summed / token_counts

    return embeddings.numpy()

# Function to compute cosine similarity using BERT embeddings
def cosine_similarity_bert(paper_text, summary_text):
    """
    Measures similarity between a paper and its summary from BERT embeddings.

    Args:
    - paper_text (str): The full text of the paper.
    - summary_text (str): The summary to compare against it.

    Returns:
    - The cosine similarity between the two embeddings produced by
      get_bert_embeddings.
    """
    # Paper is embedded first, then the summary; the result has a single entry
    score_matrix = cosine_similarity(
        get_bert_embeddings(paper_text),
        get_bert_embeddings(summary_text),
    )
    return score_matrix[0][0]

# Example texts: a four-paragraph essay (paragraphs separated by "\n\n") and a
# two-sentence summary of it, defined in this cell so it does not rely on
# variables from another cell.
paper_text = """Queenstown, a vibrant tourist destination, thrives on its stunning scenery and diverse activities. However, this reliance on tourism presents a complex dynamic, with both opportunities and challenges for the local community. This paper explores the multifaceted impact of tourism on the Queenstown community, examining its economic benefits alongside the social and environmental consequences.\n\nTourism undeniably fuels the local economy, generating significant revenue through various sectors like accommodation, hospitality, and retail. Businesses directly and indirectly benefit from the influx of tourists, creating employment opportunities and stimulating economic growth.\n\nThe rapid growth of tourism also brings challenges. Overcrowding can lead to a decline in the quality of life for residents, with increased traffic congestion, strain on infrastructure, and a rise in housing costs pushing locals out. Furthermore, the environmental impact of tourism, such as pollution and habitat destruction, poses a long-term threat to the natural beauty that attracts tourists in the first place.\n\nTourism is a double-edged sword for Queenstown. While it provides economic opportunities, it also presents social and environmental challenges that require careful management. A sustainable approach to tourism development is crucial to ensure that the benefits are shared equitably and that the community's quality of life and the environment are protected for future generations."""

summary_text = """This research paper examines the dual impact of tourism on Queenstown, highlighting its economic benefits (job creation, revenue) alongside the social and environmental challenges (overcrowding, infrastructure strain, environmental degradation). It concludes that a sustainable tourism approach is vital for the community's well-being and the long-term preservation of its natural beauty."""


# Score the pair with the BERT-embedding approach and report the result
similarity_score_bert = cosine_similarity_bert(paper_text, summary_text)
print(f"Cosine Similarity (BERT): {similarity_score_bert}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Cosine Similarity (BERT): 0.9337857961654663


In [8]:
#Approach 2: Keyword Matching
from sklearn.feature_extraction.text import CountVectorizer

# Function to extract keywords using CountVectorizer
def extract_keywords(text, num_keywords=10):
    """
    Picks the keywords of a text with a CountVectorizer.

    Args:
    - text (str): The text to extract keywords from.
    - num_keywords (int): Passed as max_features, which caps the vocabulary
      at that many terms (English stop words are excluded).

    Returns:
    - set: The terms kept in the fitted vocabulary.
    """
    # Only the learned vocabulary is needed, so the count matrix is not kept
    fitted = CountVectorizer(stop_words='english', max_features=num_keywords).fit([text])
    return set(fitted.get_feature_names_out())

# Compare the keywords from paper and summary
def compare_keywords(paper_text, summary_text, num_keywords=10):
    """
    Compares the keywords of a paper with the keywords of its summary.

    Args:
    - paper_text (str): The full text of the paper.
    - summary_text (str): The summary to compare against it.
    - num_keywords (int): Number of keywords requested from each text.

    Returns:
    - tuple: (number of keywords shared by both texts, that number as a
      percentage of the paper's keyword count).
    """
    from_paper = extract_keywords(paper_text, num_keywords)
    from_summary = extract_keywords(summary_text, num_keywords)

    # Keywords present in both sets
    shared_count = len(from_paper & from_summary)

    return shared_count, shared_count / len(from_paper) * 100

# Example texts: a four-paragraph essay (paragraphs separated by "\n\n") and a
# two-sentence summary of it, defined in this cell so it does not rely on
# variables from another cell.
paper_text = """Queenstown, a vibrant tourist destination, thrives on its stunning scenery and diverse activities. However, this reliance on tourism presents a complex dynamic, with both opportunities and challenges for the local community. This paper explores the multifaceted impact of tourism on the Queenstown community, examining its economic benefits alongside the social and environmental consequences.\n\nTourism undeniably fuels the local economy, generating significant revenue through various sectors like accommodation, hospitality, and retail. Businesses directly and indirectly benefit from the influx of tourists, creating employment opportunities and stimulating economic growth.\n\nThe rapid growth of tourism also brings challenges. Overcrowding can lead to a decline in the quality of life for residents, with increased traffic congestion, strain on infrastructure, and a rise in housing costs pushing locals out. Furthermore, the environmental impact of tourism, such as pollution and habitat destruction, poses a long-term threat to the natural beauty that attracts tourists in the first place.\n\nTourism is a double-edged sword for Queenstown. While it provides economic opportunities, it also presents social and environmental challenges that require careful management. A sustainable approach to tourism development is crucial to ensure that the benefits are shared equitably and that the community's quality of life and the environment are protected for future generations."""

summary_text = """This research paper examines the dual impact of tourism on Queenstown, highlighting its economic benefits (job creation, revenue) alongside the social and environmental challenges (overcrowding, infrastructure strain, environmental degradation). It concludes that a sustainable tourism approach is vital for the community's well-being and the long-term preservation of its natural beauty."""


# Compare keywords with the default num_keywords and report count and percentage
overlap, coverage_percentage = compare_keywords(paper_text, summary_text)
print(f"Keyword Overlap: {overlap}")
print(f"Keyword Coverage Percentage: {coverage_percentage}%")


Keyword Overlap: 4
Keyword Coverage Percentage: 40.0%


In [10]:
#Approach 3: Named Entity Recognition (NER)

import spacy

# Load the pre-trained SpaCy model; `nlp` is the module-level pipeline that
# extract_named_entities calls.
# NOTE(review): this cell's recorded output is "ValueError: numpy.dtype size
# changed, may indicate binary incompatibility" - presumably a compiled
# dependency was built against a different NumPy version; confirm and
# reinstall matching versions before relying on the NER results.
nlp = spacy.load("en_core_web_sm")

# Function to extract named entities
def extract_named_entities(text):
    """
    Collects the named entities SpaCy finds in a text.

    Args:
    - text (str): The text to analyse with the module-level `nlp` pipeline.

    Returns:
    - set: The lower-cased text of every entity, with duplicates collapsed.
    """
    return {entity.text.lower() for entity in nlp(text).ents}

# Compare NER from paper and summary
def compare_ner(paper_text, summary_text):
    """
    Compares the named entities of a paper with those of its summary.

    Args:
    - paper_text (str): The full text of the paper.
    - summary_text (str): The summary to compare against it.

    Returns:
    - tuple: (number of entities found in both texts, that number as a
      percentage of the paper's entity count, or 0 when the paper has no
      entities).
    """
    in_paper = extract_named_entities(paper_text)
    in_summary = extract_named_entities(summary_text)

    # Entities present in both texts
    shared_count = len(in_paper & in_summary)

    # A paper without entities has nothing to cover
    if in_paper:
        coverage = shared_count / len(in_paper) * 100
    else:
        coverage = 0

    return shared_count, coverage

# Example texts: a four-paragraph essay (paragraphs separated by "\n\n") and a
# two-sentence summary of it, defined in this cell so it does not rely on
# variables from another cell.
paper_text = """Queenstown, a vibrant tourist destination, thrives on its stunning scenery and diverse activities. However, this reliance on tourism presents a complex dynamic, with both opportunities and challenges for the local community. This paper explores the multifaceted impact of tourism on the Queenstown community, examining its economic benefits alongside the social and environmental consequences.\n\nTourism undeniably fuels the local economy, generating significant revenue through various sectors like accommodation, hospitality, and retail. Businesses directly and indirectly benefit from the influx of tourists, creating employment opportunities and stimulating economic growth.\n\nThe rapid growth of tourism also brings challenges. Overcrowding can lead to a decline in the quality of life for residents, with increased traffic congestion, strain on infrastructure, and a rise in housing costs pushing locals out. Furthermore, the environmental impact of tourism, such as pollution and habitat destruction, poses a long-term threat to the natural beauty that attracts tourists in the first place.\n\nTourism is a double-edged sword for Queenstown. While it provides economic opportunities, it also presents social and environmental challenges that require careful management. A sustainable approach to tourism development is crucial to ensure that the benefits are shared equitably and that the community's quality of life and the environment are protected for future generations."""

summary_text = """This research paper examines the dual impact of tourism on Queenstown, highlighting its economic benefits (job creation, revenue) alongside the social and environmental challenges (overcrowding, infrastructure strain, environmental degradation). It concludes that a sustainable tourism approach is vital for the community's well-being and the long-term preservation of its natural beauty."""


# Compare named entities and report the shared count and coverage percentage
ner_overlap, ner_coverage_percentage = compare_ner(paper_text, summary_text)
print(f"NER Overlap: {ner_overlap}")
print(f"NER Coverage Percentage: {ner_coverage_percentage}%")


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject