In [60]:
## !pip install PyMuPDF
## !pip install nltk
## !pip install pyLDAvis
## !pip install textblob

In [2]:
import fitz  # PyMuPDF
import pandas as pd

# Function to extract text from a single PDF file
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ''
    for page in doc:
        text += page.get_text()
    return text

# File paths
pdf_files = ['Task 3-01_XD_Interview 1.pdf', 'Task 3-01_XD_Interview 2.pdf']

# Extract text from each PDF and store in a list
pdf_texts = [extract_text_from_pdf(file) for file in pdf_files]

# Convert list to DataFrame
pdf_df = pd.DataFrame(pdf_texts, columns=['Text'])
pdf_df

Unnamed: 0,Text
0,Transcript of Interview with Young Millennial ...
1,Transcript of Interview with Young Millennial ...


In [5]:
pdf_df[0:1]

Unnamed: 0,Text
0,Transcript of Interview with Young Millennial ...


# Sentiment Analysis

In [59]:
from textblob import TextBlob

# Perform sentiment analysis on the extracted text
sentiments = pdf_df['Text'].apply(lambda x: TextBlob(x).sentiment)

# Extract polarity and subjectivity into separate columns
pdf_df['Polarity'] = sentiments.apply(lambda x: x.polarity)
pdf_df['Subjectivity'] = sentiments.apply(lambda x: x.subjectivity)

pdf_df[['Text', 'Polarity', 'Subjectivity']]

Unnamed: 0,Text,Polarity,Subjectivity
0,Transcript of Interview with Young Millennial ...,0.230675,0.47923
1,Transcript of Interview with Young Millennial ...,0.229644,0.506356


The sentiment analysis of the extracted PDF text data reveals the following:

- The first document has a polarity score of 0.23 and a subjectivity score of 0.48, indicating a generally positive sentiment and a moderately subjective tone.
- The second document has a polarity score of 0.23 and a subjectivity score of 0.51, also indicating a generally positive sentiment and a slightly more subjective tone than the first document.

Polarity scores close to 1 indicate a positive sentiment, scores around 0 are neutral, and scores close to -1 indicate a negative sentiment. Subjectivity scores close to 1 indicate a subjective opinion, while scores closer to 0 indicate a more objective text.

In [9]:
import nltk
nltk.download('wordnet')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [41]:
# Preprocess the text data
import re
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short
from nltk.stem import WordNetLemmatizer

# Define preprocessing function
lemmatizer = WordNetLemmatizer()
def preprocess(text):
    # Apply gensim preprocessing filters, except for the stopword removal
    filters = [strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, strip_short]
    text = preprocess_string(text, filters)
    # Lemmatize and remove stopwords
    text = [lemmatizer.lemmatize(word) for word in text if word not in custom_stopwords]
    return text

# Preprocess texts
processed_texts = [preprocess(text) for text in pdf_texts]

In [42]:
from gensim import corpora

# Create a dictionary without filtering extremes
dictionary = corpora.Dictionary(processed_texts)

# Convert processed_texts to a bag-of-words representation without filtering extremes
corpus = [dictionary.doc2bow(text) for text in processed_texts]

# Update the attempt to make a index to word dictionary
id2word = {id: word for word, id in dictionary.token2id.items()}

# Display the number of unique tokens and number of documents
len(dictionary), len(corpus)

(489, 2)

In [43]:
# With a non-empty dictionary, we can now attempt to train the LDA model again.
# This time, we won't apply the extreme filtering to ensure we have enough data for the model.

# Set training parameters.
num_topics = 5
chunksize = 2000
passes = 20
iterations = 400

# Train LDA model.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,  # FIX: Updated id2word to id2word_bigrams to match the bigrams corpus
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=None
)

# Show the topics
lda_model.print_topics(num_words=5)

[(0,
  '0.002*"bank" + 0.002*"banking" + 0.002*"account" + 0.002*"want" + 0.002*"like"'),
 (1,
  '0.002*"bank" + 0.002*"account" + 0.002*"like" + 0.002*"going" + 0.002*"great"'),
 (2,
  '0.019*"bank" + 0.012*"like" + 0.012*"want" + 0.011*"I’m" + 0.011*"banking"'),
 (3,
  '0.002*"banking" + 0.002*"bank" + 0.002*"account" + 0.002*"want" + 0.002*"I’m"'),
 (4,
  '0.024*"bank" + 0.013*"great" + 0.011*"like" + 0.011*"banking" + 0.011*"day"')]

In [44]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Preparing the visualization
pyLDAvis.enable_notebook()
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)

# Displaying the visualization
pyLDAvis.display(lda_vis)

The topic modeling on the extracted PDF text data identified key themes or topics, with a focus on banking experiences. The top words in each topic suggest discussions around banking, account management, and personal sentiments towards banking services. Here are the key themes identified:

1. **General Banking Experiences**: Words like "bank", "account", "banking", "I’m", and "like" suggest general discussions around banking experiences.
2. **Preferences and Desires in Banking**: This topic includes words such as "bank", "like", "want", "banking", and "I’m", indicating conversations about what individuals want or like in their banking experiences.
3. **Positive Banking Experiences**: Words such as "bank", "great", "banking", "day", and "money" suggest positive experiences or sentiments related to banking.
4. **Similar to Topic 3 but with App or Web services**, indicating a possible overlap in discussions about positive banking experiences.
5. **Satisfaction with Banking Services**: This topic includes words like "bank", "great", "like", "I’m", and "banking", which may reflect satisfaction with banking services and the aspects that are appreciated.

These themes suggest that the discussions in the PDFs revolve around banking experiences, with a focus on personal sentiments, preferences, and positive experiences related to banking services

# Bi-gram Analysis

In [31]:
from gensim.models.phrases import Phrases, Phraser

# Create bigrams
bigram = Phrases(processed_texts, min_count=1)
bigram_phraser = Phraser(bigram)

# Apply the phraser to our processed texts to create bigrams
processed_texts_bigrams = [bigram_phraser[text] for text in processed_texts]

# Create a dictionary for bigrams without filtering extremes
dictionary_bigrams = corpora.Dictionary(processed_texts_bigrams)

# Convert processed_texts_bigrams to a bag-of-words representation without filtering extremes
corpus_bigrams = [dictionary_bigrams.doc2bow(text) for text in processed_texts_bigrams]

# Update the attempt to make a index to word dictionary for bigrams
id2word_bigrams = {id: word for word, id in dictionary_bigrams.token2id.items()}

# Display the number of unique tokens and number of documents for bigrams
len(dictionary_bigrams), len(corpus_bigrams)

(524, 2)

In [33]:
# With a non-empty dictionary, we can now attempt to train the LDA model again.
# This time, we won't apply the extreme filtering to ensure we have enough data for the model.

# Set training parameters.
num_topics = 5
chunksize = 2000
passes = 20
iterations = 400

# Train LDA model.
lda_model = LdaModel(
    corpus=corpus_bigrams,
    id2word=id2word_bigrams,  # FIX: Updated id2word to id2word_bigrams to match the bigrams corpus
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=None
)

# Show the topics
lda_model.print_topics(num_words=5)

[(0, '0.002*"bank" + 0.002*"I’m" + 0.002*"want" + 0.002*"like" + 0.002*"app"'),
 (1,
  '0.023*"bank" + 0.012*"I’m" + 0.012*"like" + 0.010*"banking" + 0.010*"know"'),
 (2,
  '0.002*"bank" + 0.002*"want" + 0.002*"I’m" + 0.002*"account" + 0.002*"information"'),
 (3, '0.018*"bank" + 0.012*"I’m" + 0.012*"want" + 0.010*"like" + 0.010*"app"'),
 (4,
  '0.002*"bank" + 0.002*"I’m" + 0.002*"want" + 0.002*"great" + 0.002*"like"')]

In [34]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Preparing the visualization
pyLDAvis.enable_notebook()
lda_vis = gensimvis.prepare(lda_model, corpus_bigrams, dictionary_bigrams)

# Displaying the visualization
pyLDAvis.display(lda_vis)

# Context Finding

In [58]:
def find_context_for_top_terms(texts, terms, window_size=10):
    """
    :param window_size: The number of words to include before and after the term as context.
    :return: A dictionary where keys are terms and values are lists of context strings.
    """
    contexts = defaultdict(list)

    for text in texts:
        words = text.split()
        for i, word in enumerate(words):
            if word in terms:
                start = max(i - window_size, 0)
                end = min(i + window_size + 1, len(words))
                context = ' '.join(words[start:i]) + ' *' + words[i] + '* ' + ' '.join(words[i+1:end])
                contexts[word].append(context)
    
    return contexts

top_terms = ['bank', 'app', 'money']

#sample_texts = [' '.join(text) for text in processed_texts[:5]]
#sample_texts_context = [' '.join(text) for text in processed_texts_context[:5]]

# Finding context for top terms in processed_texts_context
contexts = find_context_for_top_terms(processed_texts_context, top_terms)

# Display contexts for a few top terms
for term in sample_terms:
    print(f'Contexts for \"{term}\":')
    for context in contexts[term][:5]:
        print(f'- {context}')
    print('\n')

Contexts for "bank":
- should start learn them ever want buy house suppose need *bank* who can clear their communication and who can tell the
- thank you And can you tell bit about your current *bank* and your current accounts why did you choose your current
- and your current accounts why did you choose your current *bank* use Hiya Bank for day day transactions best friend actually
- which always important But then actually set account with another *bank* who had good long term savings interest rate when salary
- from art comes try send that savings account with other *bank* and then keep with Hiya Bank spend throughout the month


Contexts for "app":
- Olympic Sport would win gold medal Anyway… suppose need banking *app* that clearly tells the breakdown where money being spent can
- yourself save web designer you must have great perspective how *app* should designed can you tell bit more about your bank’s
- should designed can you tell bit more about your bank’s *app* What are the fe

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f7d0eb7a-c53e-46ac-9828-330fe3ad70a0' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>