In [None]:
#Necessary Libraries Installation
!pip install PyMuPDF nltk scikit-learn pandas rdflib stopwords flask-ngrok spacy inflect
!python -m spacy download en_core_web_sm

In [None]:
#Extract words from target pdf and contrast pdf
import os
import fitz  # PyMuPDF
import re

def remove_url_func(text):
    """Return ``text`` with URL-like substrings deleted.

    A match is an optional ``ftp``/``http``/``https`` scheme, one or more
    dot-terminated labels, and a final run of 2-6 ASCII alphanumerics. The
    scheme is optional, so bare dotted tokens such as ``file.txt`` are
    removed as well.
    """
    url_pattern = re.compile(r'(?:(ftp|http|https)?:\/\/)?(?:[\w-]+\.)+([a-z]|[A-Z]|[0-9]){2,6}')
    without_urls = url_pattern.sub('', text)
    return without_urls

def remove_email_func(text):
    """Return ``text`` with e-mail-like tokens (``local@domain``) deleted.

    Both sides of the ``@`` may contain word characters, dots, hyphens and
    underscores.
    """
    email_pattern = re.compile(r'[\w\.\-\_]+@[\w\.\-\_]+')
    without_emails = email_pattern.sub('', text)
    return without_emails

def remove_digits(text):
    """Return ``text`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

def truncate_words(text, max_words):
    """Return the first ``max_words`` whitespace-separated tokens of ``text``.

    The kept tokens are re-joined with single spaces, so original whitespace
    (newlines, repeated spaces) is not preserved.
    """
    kept_tokens = text.split()[:max_words]
    return ' '.join(kept_tokens)

def substitute_dash(text):
    """Return ``text`` with every hyphen-minus (``-``) replaced by one space."""
    return text.replace('-', ' ')

#def substitute_ampersand(text):
    #return text.replace("&", "and")

def extract_text_from_pdfs(pdf_directory, maximum):
    """Extract and lightly clean the text of the PDFs in a directory.

    Files are visited in sorted name order (``os.listdir`` order is
    arbitrary, which would make the truncated corpus differ between runs).
    Each document's text has URLs, e-mail addresses and digits removed and
    hyphens replaced by spaces. Extraction stops once ``maximum`` words have
    been collected; the document that crosses the budget is truncated so the
    total never exceeds it.

    Args:
        pdf_directory: Path of the directory holding the ``.pdf`` files.
        maximum: Word budget for the whole corpus (int or float).

    Returns:
        ``(all_texts, total_words)``: one cleaned string per processed PDF
        and the number of words kept across all of them.

    Raises:
        FileNotFoundError: If ``pdf_directory`` does not exist.
    """
    if not os.path.exists(pdf_directory):
        raise FileNotFoundError(f"The directory {pdf_directory} does not exist.")

    total_words = 0
    all_texts = []
    for filename in sorted(os.listdir(pdf_directory)):
        if not filename.endswith('.pdf'):
            continue
        if total_words >= maximum:
            break

        pdf_path = os.path.join(pdf_directory, filename)
        # The context manager closes the document even if a page fails to extract.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text() for page in doc)

        text = remove_url_func(text)
        text = remove_email_func(text)
        text = remove_digits(text)
        text = substitute_dash(text)

        word_count = count_words(text)
        if total_words + word_count > maximum:
            # `maximum` may be a float (e.g. 1E9); a slice bound must be an int.
            word_count = int(maximum - total_words)
            text = truncate_words(text, word_count)
            assert word_count == count_words(text)

        total_words += word_count
        all_texts.append(text)
    return all_texts, total_words


# Input corpora.
# NOTE(review): these are absolute, user-specific paths; the cell only runs on a
# machine where both directories exist.
pdf_directory_target = '/Users/kyawsoehan/Desktop/pdf'
pdf_directory_contrast = '/Users/kyawsoehan/Desktop/contrastingpdf'

# Extract target texts (1E9 is a word budget large enough to act as "no limit")
texts_target, target_words = extract_text_from_pdfs(pdf_directory_target, 1E9)
print(f"Total words in target corpus: {target_words}")

# Extract contrast texts with truncation: the contrast corpus is capped at the
# number of words found in the target corpus.
texts_contrast, contrast_words = extract_text_from_pdfs(pdf_directory_contrast, target_words)
print(f"Total words in contrast corpus: {contrast_words}")

# Print the complete extracted text of every document (output can be very long).
for i, text in enumerate(texts_target):
    print(f"Text from Target PDF {i+1}:\n{text}\n")
for i, text in enumerate(texts_contrast):
    print(f"Text from Contrast PDF {i+1}:\n{text}\n")

In [None]:
#Data cleansing: stop-word and punctuation removal

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch every NLTK resource this notebook uses, so it runs on a fresh kernel:
#   stopwords         -> stopwords.words('english')
#   punkt, punkt_tab  -> word_tokenize / sent_tokenize (newer NLTK releases
#                        look for 'punkt_tab' instead of 'punkt')
#   wordnet           -> WordNetLemmatizer.lemmatize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

def remove_stopwords(texts):
    """Drop stop words and stray punctuation tokens from each text.

    The stop list is NLTK's English list minus ``"and"`` (which is kept),
    extended with punctuation marks and the tokens ``"ii"``/``"iii"``.
    Tokens are compared lower-cased; surviving tokens keep their original
    case and are re-joined with single spaces.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        List of filtered strings, one per input text, in the same order.
    """
    extra_tokens = {",", ";", "ii", "iii", ".", "(", ")", "!", "@", "£", "$", "%", "^", "*", "-", "+", "/"}
    stop_words = (set(stopwords.words('english')) - {"and"}) | extra_tokens

    def _strip(text):
        kept = (token for token in word_tokenize(text) if token.lower() not in stop_words)
        return ' '.join(kept)

    return [_strip(text) for text in texts]

# Stop-word-filtered versions of both corpora; later cells read these names.
clean_texts_target = remove_stopwords(texts_target)
clean_texts_contrast = remove_stopwords(texts_contrast)

# Print every cleaned document in full (output can be very long).
for i, clean_text in enumerate(clean_texts_target):
    print(f"Cleaned Target Text {i+1}:\n{clean_text}\n")
for i, clean_text in enumerate(clean_texts_contrast):
    print(f"Cleaned Contrast Text {i+1}:\n{clean_text}\n")

In [None]:
#function to generate and count N-grams
from nltk.util import ngrams
from collections import Counter

def generate_ngrams(text, n):
    """Count the n-grams of ``text``.

    The text is lower-cased and tokenized with ``word_tokenize`` before the
    n-grams are built.

    Args:
        text: Input string.
        n: Size of the n-grams (2 for bigrams, 3 for trigrams, ...).

    Returns:
        ``Counter`` mapping each n-gram (a tuple of ``n`` tokens) to its count.
    """
    lowered_tokens = word_tokenize(text.lower())
    return Counter(ngrams(lowered_tokens, n))

# Extract and print bigrams and trigrams for the cleaned texts.
# The counters are only printed here; `bigrams`/`trigrams` are overwritten on
# every iteration and hold the last contrast document's counts afterwards.
for i, clean_text in enumerate(clean_texts_target):
    bigrams = generate_ngrams(clean_text, 2)
    trigrams = generate_ngrams(clean_text, 3)
    print(f"Target Text {i+1} Bigrams:\n{bigrams}\n")
    print(f"Target Text {i+1} Trigrams:\n{trigrams}\n")

for i, clean_text in enumerate(clean_texts_contrast):
    bigrams = generate_ngrams(clean_text, 2)
    trigrams = generate_ngrams(clean_text, 3)
    print(f"Contrast Text {i+1} Bigrams:\n{bigrams}\n")
    print(f"Contrast Text {i+1} Trigrams:\n{trigrams}\n")


In [None]:
#Extract Noun Phrases using SpaCy
import spacy

# Load the small English spaCy pipeline once; extract_noun_phrases_spacy reads
# this module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

def extract_noun_phrases_spacy(texts):
    """Collect the noun chunks spaCy finds in each text.

    Args:
        texts: Iterable of strings; each one is parsed with the module-level
            ``nlp`` pipeline.

    Returns:
        A single flat list with the text of every noun chunk, in document
        order and, within a document, in the order spaCy yields them.
    """
    return [chunk.text for text in texts for chunk in nlp(text).noun_chunks]

# Noun phrases for both corpora, as flat lists of strings.
# NOTE(review): the parser is fed the stop-word-stripped texts, not the raw
# ones; with determiners and prepositions removed, noun chunking may be less
# accurate. Confirm this is intended.
noun_phrases_target = extract_noun_phrases_spacy(clean_texts_target)
noun_phrases_contrast = extract_noun_phrases_spacy(clean_texts_contrast)

print(f"Noun Phrases (Target):\n{noun_phrases_target}\n")
print(f"Noun Phrases (Contrast):\n{noun_phrases_contrast}\n")

In [None]:
#Calculate TF-IDF and Extract Top Terms
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Function to calculate TF-IDF
def calculate_tfidf(phrases, ngram_range=(1, 4)):
    """Fit a word-level TF-IDF model on ``phrases``.

    Args:
        phrases: Iterable of strings; each item is one input document for
            the vectorizer.
        ngram_range: ``(min_n, max_n)`` word n-gram sizes to extract.

    Returns:
        ``(tfidf_matrix, feature_names)``: the matrix returned by
        ``fit_transform`` and the vectorizer's output feature names.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, analyzer='word')
    weights = vectorizer.fit_transform(phrases)
    vocabulary = vectorizer.get_feature_names_out()
    return weights, vocabulary

# For Ngrams: one TF-IDF model per corpus; each cleaned document is one input item.
tfidf_matrix_ngram_target, feature_names_ngram_target = calculate_tfidf(clean_texts_target, ngram_range=(1, 4))
tfidf_matrix_ngram_contrast, feature_names_ngram_contrast = calculate_tfidf(clean_texts_contrast, ngram_range=(1, 4))

# For Noun Phrases: here every individual noun phrase is one input item, so the
# resulting matrices have one row per phrase rather than per PDF.
tfidf_matrix_noun_phrase_target, feature_names_noun_phrase_target = calculate_tfidf(noun_phrases_target, ngram_range=(1, 4))
tfidf_matrix_noun_phrase_contrast, feature_names_noun_phrase_contrast = calculate_tfidf(noun_phrases_contrast, ngram_range=(1, 4))

# Function to extract top terms
def extract_top_terms(tfidf_matrix, feature_names, top_n=10):
    """Rank the features of every matrix row by TF-IDF weight.

    Args:
        tfidf_matrix: Sparse matrix with one row per input item and one
            column per feature.
        feature_names: Feature labels, aligned with the matrix columns.
        top_n: Number of highest-weighted features to keep per row.

    Returns:
        Dict mapping each row index to a DataFrame (index = feature name,
        single column ``"TF-IDF"``) holding that row's ``top_n`` features in
        descending weight order.
    """
    def _ranked(row_index):
        # Densify one row at a time to keep memory bounded by the vocabulary size.
        weights = tfidf_matrix[row_index].toarray().ravel()
        frame = pd.DataFrame({"TF-IDF": weights}, index=feature_names)
        return frame.sort_values('TF-IDF', ascending=False).head(top_n)

    return {row_index: _ranked(row_index) for row_index in range(tfidf_matrix.shape[0])}

# Extract top terms for target and contrast for ngrams and noun phrases
# (the 100 highest-weighted features of every matrix row).
top_terms_ngram_target = extract_top_terms(tfidf_matrix_ngram_target, feature_names_ngram_target, top_n=100)
top_terms_ngram_contrast = extract_top_terms(tfidf_matrix_ngram_contrast, feature_names_ngram_contrast, top_n=100)
top_terms_noun_target = extract_top_terms(tfidf_matrix_noun_phrase_target, feature_names_noun_phrase_target, top_n=100)
top_terms_noun_contrast = extract_top_terms(tfidf_matrix_noun_phrase_contrast, feature_names_noun_phrase_contrast, top_n=100)

# Print top terms for ngrams (Target)
print("Top terms from Target (Ngrams):")
for doc_id, terms in top_terms_ngram_target.items():
    print(f"Document {doc_id+1}:")
    print(terms)
    print("\n")

# Print top terms for ngrams (Contrast)
print("Top terms from Contrast (Ngrams):")
for doc_id, terms in top_terms_ngram_contrast.items():
    print(f"Document {doc_id+1}:")
    print(terms)
    print("\n")

# Print top terms for noun phrases (Target)
# NOTE(review): the noun-phrase matrices have one row per phrase, so the
# "Document N" label below counts noun phrases, not PDFs.
print("Top terms from Target (Noun Phrases):")
for doc_id, terms in top_terms_noun_target.items():
    print(f"Document {doc_id+1}:")
    print(terms)
    print("\n")

# Print top terms for noun phrases (Contrast)
print("Top terms from Contrast (Noun Phrases):")
for doc_id, terms in top_terms_noun_contrast.items():
    print(f"Document {doc_id+1}:")
    print(terms)
    print("\n")


In [None]:
#Calculate domain relevance (DR), combine the TF-IDF and DR scores, and aggregate the top 100 terms for n-grams and noun phrases
from collections import Counter
import inflect
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from functools import reduce
from nltk.util import ngrams
import pandas as pd
import pickle


# Initialize inflect engine for singular/plural conversion
p = inflect.engine()
# Stemmer and lemmatizer used by filter_and_sort_terms to detect near-duplicate terms.
# NOTE(review): WordNetLemmatizer reads the NLTK 'wordnet' corpus when it is
# used; make sure that resource is available in the environment.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Function to aggregate top terms from TF-IDF matrix
def aggregate_top_terms(tfidf_matrix, feature_names):
    """Sum each feature's TF-IDF weight over all rows of the matrix.

    Args:
        tfidf_matrix: Sparse matrix with one row per input item and one
            column per feature.
        feature_names: Feature labels, aligned with the matrix columns.

    Returns:
        ``Counter`` mapping every feature name to its total weight. Keys are
        inserted in descending-weight order of the first row; callers that
        sort the result stably see that order for ties.
    """
    totals = Counter()
    for row_index in range(tfidf_matrix.shape[0]):
        weights = tfidf_matrix[row_index].toarray().ravel()
        ranked = pd.DataFrame({"TF-IDF": weights}, index=feature_names).sort_values('TF-IDF', ascending=False)
        for term, weight in ranked["TF-IDF"].items():
            totals[term] += weight
    return totals

# Function to calculate domain relevance
def calculate_term_frequencies(phrases, max_n=4):
    """Count every word n-gram (n = 1..``max_n``) across ``phrases``.

    Keys are lower-cased, space-joined strings (``"data model"``), so they
    are directly comparable with the lower-cased string features a TF-IDF
    vectorizer produces. Previously the keys were tuples in original case
    and stopped at trigrams, so string look-ups never matched them.

    Args:
        phrases: Iterable of strings; they are joined with spaces and split
            on whitespace before counting.
        max_n: Largest n-gram size to count. The default of 4 matches the
            ``ngram_range=(1, 4)`` used for TF-IDF in this notebook.

    Returns:
        ``FreqDist`` mapping each n-gram string to its number of occurrences.
    """
    all_words = ' '.join(phrases).lower().split()
    return FreqDist(
        ' '.join(gram)
        for n in range(1, max_n + 1)
        for gram in ngrams(all_words, n)
    )

def calculate_domain_relevance(domain_freqs, contrasting_freqs):
    """Score each domain term by how much more frequent it is than in the contrast corpus.

    Args:
        domain_freqs: Mapping of term -> count in the domain corpus.
        contrasting_freqs: Counter-like mapping of term -> count in the
            contrast corpus; it is indexed directly, so it must yield 0 for
            unknown terms (as ``Counter``/``FreqDist`` do).

    Returns:
        Dict mapping each domain term to ``domain_count / (contrast_count + 1)``.
    """
    # The +1 keeps the denominator non-zero for terms absent from the contrast corpus.
    return {
        term: domain_count / (contrasting_freqs[term] + 1)
        for term, domain_count in domain_freqs.items()
    }

def combine_scores(tfidf_scores, dr_scores):
    """Multiply each term's TF-IDF score by its domain-relevance (DR) score.

    ``dr_scores`` may be keyed either by the term string itself or by the
    tuple of the term's words (the form ``nltk.util.ngrams`` yields). A
    plain string look-up never finds tuple keys, which silently turned
    every DR factor into 1; both forms are now tried.

    Args:
        tfidf_scores: Mapping of term -> TF-IDF score.
        dr_scores: Mapping of term (string or word tuple) -> DR score.

    Returns:
        Dict mapping each TF-IDF term to ``score * dr``; terms without a DR
        score keep their TF-IDF score (factor 1).
    """
    missing = object()
    combined_scores = {}
    for term, score in tfidf_scores.items():
        dr = dr_scores.get(term, missing)
        if dr is missing and isinstance(term, str):
            # Fall back to the word-tuple form of the same term.
            dr = dr_scores.get(tuple(term.split()), missing)
        combined_scores[term] = score * (1 if dr is missing else dr)
    return combined_scores

def filter_and_sort_terms(aggregated_terms, top_n=100):
    """Clean, rank and de-duplicate scored terms.

    Steps:
      1. Strip leading and trailing English stop words from every term.
      2. Drop terms that are then empty, a single character, or all digits.
         (These checks used to run on a dict that was immediately
         overwritten, so they had no effect.)
      3. When several raw terms collapse to the same trimmed term, keep the
         highest score instead of whichever happened to be seen last.
      4. Walk the terms in descending score order and keep a term only if
         the stem and lemma of its first word have not been seen yet.

    Args:
        aggregated_terms: Mapping of term string -> score.
        top_n: Maximum number of terms to return.

    Returns:
        List of ``(term, score)`` tuples, highest score first, at most
        ``top_n`` long.
    """
    stop_words = set(stopwords.words('english'))

    trimmed_scores = {}
    for term, score in aggregated_terms.items():
        words = term.split()
        # Top and tail stop words (compared lower-cased).
        while words and words[0].lower() in stop_words:
            words.pop(0)
        while words and words[-1].lower() in stop_words:
            words.pop()
        trimmed = " ".join(words)
        if len(trimmed) <= 1 or trimmed.isdigit():
            continue
        if trimmed not in trimmed_scores or score > trimmed_scores[trimmed]:
            trimmed_scores[trimmed] = score

    sorted_terms = Counter(trimmed_scores).most_common()

    filtered_list = []
    seen_roots = set()
    for term, score in sorted_terms:
        if len(filtered_list) >= top_n:
            break
        # NOTE(review): only the FIRST word is compared, so "data model" hides
        # a later "data quality". Confirm this is the intended de-duplication.
        word = term.split()[0]
        singular_term = p.singular_noun(word) or word
        stemmed_term = stemmer.stem(singular_term)
        lemmatized_term = lemmatizer.lemmatize(singular_term)

        if stemmed_term not in seen_roots and lemmatized_term not in seen_roots:
            filtered_list.append((term, score))
            seen_roots.add(stemmed_term)
            seen_roots.add(lemmatized_term)
    return filtered_list

# Aggregate top terms for ngrams and noun phrases (target corpus only)
aggregated_terms_ngram = aggregate_top_terms(tfidf_matrix_ngram_target, feature_names_ngram_target)
aggregated_terms_noun = aggregate_top_terms(tfidf_matrix_noun_phrase_target, feature_names_noun_phrase_target)

# Calculate domain relevance using contrast documents
domain_freqs_ngram = calculate_term_frequencies(clean_texts_target)
contrasting_freqs_ngram = calculate_term_frequencies(clean_texts_contrast)

dr_scores_ngram = calculate_domain_relevance(domain_freqs_ngram, contrasting_freqs_ngram)

# Same computation, with the noun-phrase lists as the corpus
domain_freqs_noun = calculate_term_frequencies(noun_phrases_target)
contrasting_freqs_noun = calculate_term_frequencies(noun_phrases_contrast)

dr_scores_noun = calculate_domain_relevance(domain_freqs_noun, contrasting_freqs_noun)

# Combine scores for ngrams and noun phrases (TF-IDF total x domain relevance)
combined_scores_ngram = combine_scores(aggregated_terms_ngram, dr_scores_ngram)
combined_scores_noun = combine_scores(aggregated_terms_noun, dr_scores_noun)

# Filter and sort top terms for ngrams and noun phrases
filtered_final_scores_ngram = filter_and_sort_terms(combined_scores_ngram, top_n=100)
filtered_final_scores_noun = filter_and_sort_terms(combined_scores_noun, top_n=100)

# Print the top terms with scores for ngrams and noun phrases
def print_terms(label, filtered_terms):
    """Print a heading for ``label`` followed by one ``term: score`` line per entry.

    Args:
        label: Text shown in parentheses in the heading.
        filtered_terms: Iterable of ``(term, score)`` pairs; scores are
            printed with four decimal places.
    """
    print(f"Filtered and Aggregated Top Terms ({label}):")
    for entry in filtered_terms:
        term, score = entry
        print(f"  {term}: {score:.4f}")

print_terms("Ngrams - Target", filtered_final_scores_ngram)
print_terms("Noun Phrases - Target", filtered_final_scores_noun)

# Save the combined scores for further use.
# The files are written to the current working directory; the export cells
# below read them back under the same names.
with open('filtered_final_scores_ngram_target.pkl', 'wb') as f:
    pickle.dump(filtered_final_scores_ngram, f)

with open('filtered_final_scores_noun_target.pkl', 'wb') as f:
    pickle.dump(filtered_final_scores_noun, f)


In [None]:
#SKOS Exporting 
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import SKOS
from urllib.parse import quote
from nltk.tokenize import sent_tokenize
from functools import reduce
import re
import pickle

def create_skos_combined(ngram_terms, noun_terms, sentences, base_uri):
    """Build one SKOS graph holding both the n-gram and the noun-phrase terms.

    Two concept schemes ("ngrams" and "nouns") are created under
    ``base_uri``. Every distinct term becomes a ``skos:Concept`` with:
      * ``prefLabel``  - the term,
      * ``scopeNote``  - its n-gram and noun scores (0 when absent from a list),
      * ``definition`` - an example sentence: of all sentences whose
        lower-cased text contains the term as a substring, the one of median
        length, or a placeholder when none matches,
      * ``inScheme``   - each scheme in which the term has a score above 0.

    Args:
        ngram_terms: Iterable of ``(term, score)`` pairs from the n-gram ranking.
        noun_terms: Iterable of ``(term, score)`` pairs from the noun-phrase ranking.
        sentences: Sentences searched for an example of each term.
        base_uri: URI prefix for the schemes and concepts.

    Returns:
        The populated ``rdflib.Graph``.
    """
    graph = Graph()
    skos = Namespace("http://www.w3.org/2004/02/skos/core#")

    # One concept scheme per term source.
    scheme_ngrams = URIRef(f"{base_uri}/ngrams/concept_scheme")
    scheme_nouns = URIRef(f"{base_uri}/nouns/concept_scheme")
    for scheme, scheme_label in ((scheme_ngrams, "ngrams"), (scheme_nouns, "nouns")):
        graph.add((scheme, RDF.type, skos.ConceptScheme))
        graph.add((scheme, SKOS.prefLabel, Literal(scheme_label)))

    # Merge both rankings into one record per term; a repeated term keeps the
    # last score seen for each source.
    term_scores = {}
    for term, score in ngram_terms:
        term_scores.setdefault(term, {'ngram_score': 0, 'noun_score': 0})['ngram_score'] = score
    for term, score in noun_terms:
        term_scores.setdefault(term, {'ngram_score': 0, 'noun_score': 0})['noun_score'] = score

    for term, scores in term_scores.items():
        # Median-length sentence among those mentioning the term.
        candidates = sorted((s for s in sentences if term in s.lower()), key=len)
        if candidates:
            sentence = candidates[len(candidates) // 2]
        else:
            sentence = "No example sentence available."

        concept = URIRef(f"{base_uri}/concept/{quote(term)}")
        scope_note = f"Ngram Score: {scores['ngram_score']:.4f}, Noun Score: {scores['noun_score']:.4f}"
        graph.add((concept, RDF.type, skos.Concept))
        graph.add((concept, SKOS.prefLabel, Literal(term, lang='en-GB')))
        graph.add((concept, SKOS.scopeNote, Literal(scope_note, lang='en-GB')))
        graph.add((concept, SKOS.definition, Literal(sentence, lang='en-GB')))
        if scores['ngram_score'] > 0:
            graph.add((concept, SKOS.inScheme, scheme_ngrams))
        if scores['noun_score'] > 0:
            graph.add((concept, SKOS.inScheme, scheme_nouns))

    return graph

# Base URI for the SKOS vocabulary
base_uri = "http://localhost/tematres/vocab"

# Split the raw target texts into sentences (used as example sentences) and
# flatten embedded newlines. `texts_target` must already exist in the kernel.
# NOTE(review): reduce() has no initial value here, so an empty `texts_target`
# raises TypeError.
sentences = reduce(lambda x, y: x + y, [sent_tokenize(t, language='english') for t in texts_target])
sentences = [re.sub(r'\n', ' ', s) for s in sentences]

# Load filtered terms for n-gram from the pickle file
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# files this notebook wrote itself.
with open('filtered_final_scores_ngram_target.pkl', 'rb') as f:
    filtered_terms_ngram = pickle.load(f)

# Load filtered terms for noun phrases from the pickle file
with open('filtered_final_scores_noun_target.pkl', 'rb') as f:
    filtered_terms_noun = pickle.load(f)

# Create combined SKOS RDF for n-grams and noun phrases
skos_graph_combined = create_skos_combined(filtered_terms_ngram, filtered_terms_noun, sentences, base_uri)

# Serialize SKOS RDF to XML format and print
skos_xml_combined = skos_graph_combined.serialize(format='xml', encoding='utf-8').decode('utf-8')
print("SKOS RDF Output for Combined N-grams and Noun Phrases:\n")
print(skos_xml_combined)

# Save the combined SKOS RDF to a file
# NOTE(review): absolute, user-specific path; the directory must already exist.
output_file_path = '/Users/kyawsoehan/Desktop/skos_update_files/skos_output_combined.rdf'
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write(skos_xml_combined)

print(f"SKOS RDF file for Combined N-grams and Noun Phrases saved to {output_file_path}.")


In [None]:
#Tagged-text output of n-gram and noun terms, for import into the TemaTres system on localhost
import re
import pickle
from nltk.tokenize import sent_tokenize, word_tokenize
from functools import reduce

def create_tagged_text(ngram_terms, noun_terms, sentences, concept_scheme_ngram, concept_scheme_noun):
    """Render the ranked terms as tagged text.

    The output starts with the two concept-scheme names. Each term then
    gets a block of tab-indented lines: ``TT`` (scheme), ``NA`` (score),
    ``DF`` (example sentence, only the first time a term appears) and ``BT``
    (scheme).

    The example sentence is the median-length sentence among those that
    contain every word of the term as a token. Matching is case-insensitive:
    the ranked terms are lower-cased while sentences keep their original
    case, so a case-sensitive match missed capitalised words. Each sentence
    is tokenized once up front instead of once per word of every term.

    Args:
        ngram_terms: Iterable of ``(term, score)`` pairs from the n-gram ranking.
        noun_terms: Iterable of ``(term, score)`` pairs from the noun-phrase ranking.
        sentences: Sentences searched for an example of each term.
        concept_scheme_ngram: Scheme name written for n-gram terms.
        concept_scheme_noun: Scheme name written for noun-phrase terms.

    Returns:
        The tagged text as a single newline-joined string.
    """
    # (sentence, set of lower-cased tokens), computed once per sentence.
    tokenized_sentences = [
        (sentence, {token.lower() for token in word_tokenize(sentence)})
        for sentence in sentences
    ]

    def _example_sentence(term):
        """Return the median-length sentence containing all words of ``term``."""
        words = term.lower().split()
        matching = [s for s, tokens in tokenized_sentences if all(w in tokens for w in words)]
        matching.sort(key=len)
        return matching[len(matching) // 2] if matching else "No example sentence available."

    output = [f"{concept_scheme_ngram}", f"{concept_scheme_noun}"]
    seen_terms = set()

    def _append_entries(terms, scheme, score_label):
        """Append one tagged block per ``(term, score)`` pair to ``output``."""
        for term, score in terms:
            output.append(f"\n{term}")
            output.append(f"\tTT: {scheme}")
            output.append(f"\tNA: {score_label} Score: {score:.4f}")
            if term not in seen_terms:
                # Only the first occurrence of a term carries a definition.
                output.append(f"\tDF: {_example_sentence(term)}")
                seen_terms.add(term)
            output.append(f"\tBT: {scheme}")

    _append_entries(ngram_terms, concept_scheme_ngram, "Ngram")
    _append_entries(noun_terms, concept_scheme_noun, "Noun")

    return "\n".join(output)

# Scheme names written into the tagged text.
concept_scheme_ngram = "ngrams"
concept_scheme_noun = "nouns"
# NOTE(review): `base_uri` is assigned but not used anywhere in this cell.
base_uri = "http://localhost/tematres/vocab"

# Split the raw target texts into sentences (used as example sentences) and
# flatten embedded newlines. `texts_target` must already exist in the kernel.
# NOTE(review): reduce() has no initial value here, so an empty `texts_target`
# raises TypeError.
sentences = reduce(lambda x, y: x + y, [sent_tokenize(t, language='english') for t in texts_target])
sentences = [re.sub(r'\n', ' ', s) for s in sentences]


# Load filtered terms for ngrams and nouns from the pickle files
with open('filtered_final_scores_ngram_target.pkl', 'rb') as f:
    filtered_terms_ngram = pickle.load(f)

with open('filtered_final_scores_noun_target.pkl', 'rb') as f:
    filtered_terms_noun = pickle.load(f)

# Create tagged text output for both ngrams and nouns
tagged_text_output = create_tagged_text(filtered_terms_ngram, filtered_terms_noun, sentences, concept_scheme_ngram, concept_scheme_noun)

# Print the tagged text output
print(tagged_text_output)

# Save the tagged text output to a file
# NOTE(review): absolute, user-specific path; the directory must already exist.
output_file_path = '/Users/kyawsoehan/Desktop/skos_update_files/tagged_text_output_combined_local.txt'
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write(tagged_text_output)

print(f"Tagged text file for Ngrams and Nouns saved to {output_file_path}.")


In [None]:
#Tagged Text Output for Ngrams and Nouns for herokuhost
import re
import pickle
from nltk.tokenize import sent_tokenize, word_tokenize
from functools import reduce

def create_tagged_text(ngram_terms, noun_terms, sentences, concept_scheme_ngram, concept_scheme_noun):
    """Render the ranked terms as tagged text.

    The output starts with the two concept-scheme names. Each term then
    gets a block of tab-indented lines: ``TT`` (scheme), ``NA`` (score),
    ``DF`` (example sentence, only the first time a term appears) and ``BT``
    (scheme).

    The example sentence is the median-length sentence among those that
    contain every word of the term as a token. Matching is case-insensitive:
    the ranked terms are lower-cased while sentences keep their original
    case, so a case-sensitive match missed capitalised words. Each sentence
    is tokenized once up front instead of once per word of every term.

    Args:
        ngram_terms: Iterable of ``(term, score)`` pairs from the n-gram ranking.
        noun_terms: Iterable of ``(term, score)`` pairs from the noun-phrase ranking.
        sentences: Sentences searched for an example of each term.
        concept_scheme_ngram: Scheme name written for n-gram terms.
        concept_scheme_noun: Scheme name written for noun-phrase terms.

    Returns:
        The tagged text as a single newline-joined string.
    """
    # (sentence, set of lower-cased tokens), computed once per sentence.
    tokenized_sentences = [
        (sentence, {token.lower() for token in word_tokenize(sentence)})
        for sentence in sentences
    ]

    def _example_sentence(term):
        """Return the median-length sentence containing all words of ``term``."""
        words = term.lower().split()
        matching = [s for s, tokens in tokenized_sentences if all(w in tokens for w in words)]
        matching.sort(key=len)
        return matching[len(matching) // 2] if matching else "No example sentence available."

    output = [f"{concept_scheme_ngram}", f"{concept_scheme_noun}"]
    seen_terms = set()

    def _append_entries(terms, scheme, score_label):
        """Append one tagged block per ``(term, score)`` pair to ``output``."""
        for term, score in terms:
            output.append(f"\n{term}")
            output.append(f"\tTT: {scheme}")
            output.append(f"\tNA: {score_label} Score: {score:.4f}")
            if term not in seen_terms:
                # Only the first occurrence of a term carries a definition.
                output.append(f"\tDF: {_example_sentence(term)}")
                seen_terms.add(term)
            output.append(f"\tBT: {scheme}")

    _append_entries(ngram_terms, concept_scheme_ngram, "Ngram")
    _append_entries(noun_terms, concept_scheme_noun, "Noun")

    return "\n".join(output)

# Scheme names written into the tagged text.
concept_scheme_ngram = "ngrams"
concept_scheme_noun = "nouns"
# NOTE(review): `base_uri` is assigned but not used anywhere in this cell, so
# apart from the output file name this cell appears to produce the same text
# as the localhost cell. Confirm whether the URI should be part of the output.
base_uri = "http://corporate-terminology-uwe-163e1432bf51.herokuapp.com/vocab/"

# Split the raw target texts into sentences (used as example sentences) and
# flatten embedded newlines. `texts_target` must already exist in the kernel.
# NOTE(review): reduce() has no initial value here, so an empty `texts_target`
# raises TypeError.
sentences = reduce(lambda x, y: x + y, [sent_tokenize(t, language='english') for t in texts_target])
sentences = [re.sub(r'\n', ' ', s) for s in sentences]

# Load filtered terms for ngrams and nouns from the pickle files
with open('filtered_final_scores_ngram_target.pkl', 'rb') as f:
    filtered_terms_ngram = pickle.load(f)

with open('filtered_final_scores_noun_target.pkl', 'rb') as f:
    filtered_terms_noun = pickle.load(f)

# Create tagged text output for both ngrams and nouns
tagged_text_output = create_tagged_text(filtered_terms_ngram, filtered_terms_noun, sentences, concept_scheme_ngram, concept_scheme_noun)

# Print the tagged text output
print(tagged_text_output)

# Save the tagged text output to a file
# NOTE(review): absolute, user-specific path; the directory must already exist.
output_file_path = '/Users/kyawsoehan/Desktop/skos_update_files/tagged_text_output_combined_heroku.txt'
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write(tagged_text_output)

print(f"Tagged text file for Ngrams and Nouns saved to {output_file_path}.")
