# INSTALLING LIBRARIES

In [2]:
!pip install rake-nltk
!pip install keybert
!pip install pyLDAvis

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl.metadata (6.4 kB)
Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6
Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-many

# IMPORTING LIBRARIES

In [24]:
# Data manipulation and visualization libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing and visualization libraries
from wordcloud import WordCloud

# Text feature extraction libraries
from sklearn.feature_extraction.text import TfidfVectorizer

# Keyword extraction libraries
from rake_nltk import Rake
from keybert import KeyBERT

# Topic modeling libraries
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


  and should_run_async(code)


In [25]:
# Google Drive Integration
from google.colab import drive
drive.mount('/content/drive')

  and should_run_async(code)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Load your labelled dataset
hybrid_df = pd.read_csv('/content/drive/MyDrive/Dissertation24/Hybrid_RobertaPredictions.csv')

  and should_run_async(code)


In [27]:
# Map the labels
label_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
hybrid_df['label'] = hybrid_df['label'].map(label_mapping)

  and should_run_async(code)


In [28]:
hybrid_df

  and should_run_async(code)


Unnamed: 0,content,score,at,source,length,label
0,ive revised rating two reason customer support...,4,2021-07-13,"Revolut: Spend, Save, Trade",275,negative
1,go world news recent update ruined ui,2,2024-04-13,BBC: World News & Stories,37,negative
2,ive using headspace age meditation excellent a...,4,2020-01-18,Headspace: Meditation & Sleep,258,negative
3,amazing major city ive used san fran london ti...,5,2020-01-29,Citymapper,89,positive
4,love app using year find app cluttered maybe p...,4,2023-09-20,Headspace: Meditation & Sleep,171,neutral
...,...,...,...,...,...,...
9715,excellent card useful like need pay various cu...,4,2020-03-30,"Revolut: Spend, Save, Trade",306,positive
9716,accurate quick reliable,5,2018-04-21,Bus Times -Live Public Transit,23,positive
9717,keep crashing lacking feature,1,2024-05-28,BBC: World News & Stories,29,negative
9718,changing design app made much worse switch dar...,1,2024-04-13,WhatsApp Messenger,192,negative


In [29]:
# Split the reviews by sentiment
positive_reviews = hybrid_df[hybrid_df['label'] == 'positive']
negative_reviews = hybrid_df[hybrid_df['label'] == 'negative']
neutral_reviews = hybrid_df[hybrid_df['label'] == 'neutral']

  and should_run_async(code)


In [30]:
# Initialise the KeyBERT model for keyword extraction
kw_model = KeyBERT()

  and should_run_async(code)


In [31]:
# Define a function to extract keywords from reviews using KeyBERT
def extract_keywords_keybert(reviews, top_n=10):
    keywords = kw_model.extract_keywords(' '.join(reviews), keyphrase_ngram_range=(1, 2), top_n=top_n)
    return [keyword[0] for keyword in keywords]

# Create a dictionary to store reports for each application
app_reports = {}

# Iterate through each unique app in the DataFrame
for app in hybrid_df['source'].unique():
    app_reviews = hybrid_df[hybrid_df['source'] == app]  # Filter reviews for the current app

    # Categorise reviews by sentiment
    positive_reviews = app_reviews[app_reviews['label'] == 'positive']
    negative_reviews = app_reviews[app_reviews['label'] == 'negative']
    neutral_reviews = app_reviews[app_reviews['label'] == 'neutral']

    # Extract keywords from positive, negative, and neutral reviews
    positive_keywords = extract_keywords_keybert(positive_reviews['content'])
    negative_keywords = extract_keywords_keybert(negative_reviews['content'])
    neutral_keywords = extract_keywords_keybert(neutral_reviews['content'])

    # Store the extracted keywords in the report dictionary for the current app
    app_reports[app] = {
        "Positive Features": positive_keywords,
        "Negative Features": negative_keywords,
        "Neutral Features": neutral_keywords
    }

  and should_run_async(code)


In [32]:
# Output the report for each app
for app, report in app_reports.items():
    print(f"Report for {app}:")
    print("Positive Features:", report['Positive Features'])
    print("Negative Features:", report['Negative Features'])
    print("Neutral Features:", report['Neutral Features'])
    print("-" * 50)

Report for Revolut: Spend, Save, Trade:
Positive Features: ['bank apps', 'banking app', 'bank app', 'use bank', 'everyday bank', 'banking service', 'financial app', 'reliable banking', 'banking solution', 'best banking']
Negative Features: ['banking apps', 'app banking', 'support bank', 'banking app', 'banking need', 'banking service', 'customer support', 'update bank', 'app bank', 'verification customer']
Neutral Features: ['money exchange', 'useable currency', 'crypto currency', 'currency easy', 'currency need', 'fee exchange', 'currency account', 'banking easy', 'banking apps', 'exchange money']
--------------------------------------------------
Report for BBC: World News & Stories:
Positive Features: ['functional appkudos', 'infrequent app', 'app balanced', 'app news', 'news apps', 'functional', 'app useful', 'apps tend', 'apps opinion', 'feature app']
Negative Features: ['bad apps', 'daily app', 'app daily', 'app frequently', 'app everyday', 'used apps', 'news uninstall', 'news ap

  and should_run_async(code)


In [33]:
# Define a function to generate and display word clouds from text
def generate_wordcloud(text, title):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=20)

    # Save the image before showing it
    plt.savefig(f'/content/drive/MyDrive/Dissertation24/images/plot_Sentiment_Features_{title}.png')
    plt.show()  # Display the image after saving
    plt.close()

# Generate and display word clouds for positive, negative, and neutral features for each app
for app, report in app_reports.items():
    # Combine keywords into a single string for each sentiment category
    positive_text = ' '.join(report['Positive Features'])
    negative_text = ' '.join(report['Negative Features'])
    neutral_text = ' '.join(report['Neutral Features'])

    # Display word clouds for each sentiment category
    print(f"Word Cloud for {app} - Positive Features")
    generate_wordcloud(positive_text, f"{app} - Positive Features")

    print(f"Word Cloud for {app} - Negative Features")
    generate_wordcloud(negative_text, f"{app} - Negative Features")

    print(f"Word Cloud for {app} - Neutral Features")
    generate_wordcloud(neutral_text, f"{app} - Neutral Features")

Output hidden; open in https://colab.research.google.com to view.

In [34]:
# Define the topic names (customize as needed)
topic_names = [
    "User Experience",
    "Performance Issues",
    "Feature Requests",
    "Design and UI",
    "Miscellaneous"
]

# Function to generate a word cloud
def generate_wordcloud(words, title):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(words))

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title, fontsize=16)
    plt.axis('off')

    # Save the image before showing it
    plt.savefig(f'/content/drive/MyDrive/Dissertation24/images/plot_Topic_Keywords_{title}.png')
    plt.show()  # Display the image after saving
    plt.close()

# Loop through each app and apply LDA
for app in hybrid_df['source'].unique():
    print(f"\nApplying LDA for {app}...\n")

    # Filter reviews for the current app
    app_reviews = hybrid_df[hybrid_df['source'] == app]['content']

    # Vectorize the reviews
    count_vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
    count_data = count_vectorizer.fit_transform(app_reviews)

    # Apply LDA
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(count_data)

    # Display the top words per topic with word clouds
    for index, topic in enumerate(lda.components_):
        top_words = [count_vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]]
        print(f'Top words for {topic_names[index]} in {app}: {top_words}\n')

        # Generate and display the word cloud for the topic
        generate_wordcloud(top_words, f"{topic_names[index]} in {app}")

Output hidden; open in https://colab.research.google.com to view.