# INSTALLING LIBRARIES

In [None]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl.metadata (5.4 kB)
Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.12.1


# IMPORTING LIBRARIES

In [None]:
# Google Drive Integration (Colab only): mount Drive at /content/drive so the
# CSV paths under /content/drive/MyDrive used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Data manipulation and handling
import pandas as pd

# Regular expressions and text processing
import re
import emoji
import html

# Natural Language Processing (NLP)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# For sentiment analysis and text processing
from textblob import TextBlob

# Machine Learning model preparation
from sklearn.model_selection import train_test_split

In [None]:
# Downloading necessary NLTK data used by clean_text.
# 'punkt_tab' is requested in addition to 'punkt': newer NLTK releases load the
# tokenizer models from 'punkt_tab', and word_tokenize raises LookupError when
# only the legacy 'punkt' package is present.
NLTK_RESOURCES = (
    'punkt',      # Tokenizer (legacy package name)
    'punkt_tab',  # Tokenizer (package name used by newer NLTK releases)
    'stopwords',  # Stopwords list
    'wordnet',    # WordNet lemmatizer
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# LOADING DATASETS

In [None]:
# Scraped app reviews: keep the frame exactly as read and work on a copy
app_reviews_df = pd.read_csv('/content/drive/MyDrive/Dissertation24/All_Reviews.csv')
app_reviews_df_copy = app_reviews_df.copy()

# Manually labelled reviews: same pattern, raw frame plus a working copy
manually_labelled_df = pd.read_csv('/content/drive/MyDrive/Dissertation24/ManuallyLabelledReviews.csv')
manually_labelled_df_copy = manually_labelled_df.copy()

In [None]:
# Print the plain-text repr of both working copies, app reviews first
for frame in (app_reviews_df_copy, manually_labelled_df_copy):
    print(frame)

                                   reviewId           userName  \
0      154c1b8c-f147-4ebd-afc5-16170b08dfad      A Google user   
1      e1adb357-a035-4d5e-8b17-32745dc047c6      A Google user   
2      41d2c39c-1027-4677-8451-9aa6b3b714c7      A Google user   
3      f9c74575-b297-40c7-8cf0-57ae7dfe0442      A Google user   
4      893f9c71-c0eb-417d-a0e2-fc849f9e6b6c      A Google user   
...                                     ...                ...   
49995  11d59964-62d9-4e6e-98ec-49d7a1edee22      A Google user   
49996  d827fd17-f39e-4233-9f1e-a5e663847430      A Google user   
49997  4bc4b36b-9153-4f6e-a463-05326b6d1bcf      A Google user   
49998  8ed1a88f-1de3-49a4-a083-668517b56741  black octobermoon   
49999  ac61809e-ac1e-499f-b71c-bf5dbb2fbb15      A Google user   

                                               userImage  \
0      https://play-lh.googleusercontent.com/EGemoI2N...   
1      https://play-lh.googleusercontent.com/EGemoI2N...   
2      https://play-lh.goog

In [None]:
# Column names, non-null counts and dtypes for both working copies
for frame in (app_reviews_df_copy, manually_labelled_df_copy):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              50000 non-null  object
 1   userName              50000 non-null  object
 2   userImage             50000 non-null  object
 3   content               49999 non-null  object
 4   score                 50000 non-null  int64 
 5   thumbsUpCount         50000 non-null  int64 
 6   reviewCreatedVersion  46735 non-null  object
 7   at                    50000 non-null  object
 8   replyContent          9387 non-null   object
 9   repliedAt             9387 non-null   object
 10  appVersion            46735 non-null  object
 11  source                50000 non-null  object
dtypes: int64(2), object(10)
memory usage: 4.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9102 entries, 0 to 9101
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dty

# TEXT PREPROCESSING

In [None]:
def clean_whitespace(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed as well: ``str.split()`` with no
    separator discards the empty strings at both ends, so joining the pieces
    with one space yields the trimmed, single-spaced text.
    """
    return ' '.join(text.split())

# Remove rows with no review text
app_reviews_df_copy = app_reviews_df_copy.dropna(subset=['content'])
manually_labelled_df_copy = manually_labelled_df_copy.dropna(subset=['content'])

# Replace missing or null values in the 'score' column with 0
app_reviews_df_copy['score'] = app_reviews_df_copy['score'].fillna(0)

# Modify the 'at' column to keep only the date part
app_reviews_df_copy['at'] = pd.to_datetime(app_reviews_df_copy['at']).dt.date

# Normalise whitespace in 'content' BEFORE de-duplicating, so that reviews which
# differ only in spacing are recognised as the same review below
app_reviews_df_copy['content'] = app_reviews_df_copy['content'].apply(clean_whitespace)
manually_labelled_df_copy['content'] = manually_labelled_df_copy['content'].apply(clean_whitespace)

# Convert the app reviews' 'content' column to the pandas string dtype
app_reviews_df_copy['content'] = app_reviews_df_copy['content'].astype('string')

# Remove duplicate reviews (the first occurrence of each text is kept)
app_reviews_df_copy = app_reviews_df_copy.drop_duplicates(subset=['content'])
manually_labelled_df_copy = manually_labelled_df_copy.drop_duplicates(subset=['content'])

# Filter out rows whose 'content' is empty after whitespace normalisation
app_reviews_df_copy = app_reviews_df_copy[app_reviews_df_copy['content'].str.len() > 0]
manually_labelled_df_copy = manually_labelled_df_copy[manually_labelled_df_copy['content'].str.len() > 0]

# Convert 'label' column to lowercase
manually_labelled_df_copy['label'] = manually_labelled_df_copy['label'].str.lower()

app_reviews_df_copy

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,source
0,154c1b8c-f147-4ebd-afc5-16170b08dfad,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"This use to be a great app, the only thing gre...",4,375,2.24.14.81,2024-07-25,,,2.24.14.81,WhatsApp Messenger
1,e1adb357-a035-4d5e-8b17-32745dc047c6,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,WhatsApp for Android is my go-to messaging app...,5,6769,2.24.13.77,2024-07-16,,,2.24.13.77,WhatsApp Messenger
2,41d2c39c-1027-4677-8451-9aa6b3b714c7,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Good app. Although it could be made a lot bett...,3,56,2.24.9.78,2024-07-26,,,2.24.9.78,WhatsApp Messenger
3,f9c74575-b297-40c7-8cf0-57ae7dfe0442,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,I recently got a new Android phone. Tried to v...,1,917,2.24.13.77,2024-07-09,,,2.24.13.77,WhatsApp Messenger
4,893f9c71-c0eb-417d-a0e2-fc849f9e6b6c,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,It is extremely disappointing since Facebook g...,1,21544,2.24.9.78,2024-05-22,,,2.24.9.78,WhatsApp Messenger
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,11d59964-62d9-4e6e-98ec-49d7a1edee22,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"Great app, it usually helps me calm myself dow...",5,0,,2019-12-26,,,,Headspace: Meditation & Sleep
49996,d827fd17-f39e-4233-9f1e-a5e663847430,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,The download function doesn't work properly. E...,2,0,3.33.0,2019-09-05,Thank you for reaching out. We are sorry for t...,2019-09-05 13:40:35,3.33.0,Headspace: Meditation & Sleep
49997,4bc4b36b-9153-4f6e-a463-05326b6d1bcf,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Used to use this on a daily basis. Now all the...,1,30,3.40.0,2019-10-20,Thanks for reaching out. We appreciate your fe...,2019-10-20 16:49:09,3.40.0,Headspace: Meditation & Sleep
49998,8ed1a88f-1de3-49a4-a083-668517b56741,black octobermoon,https://play-lh.googleusercontent.com/a/ACg8oc...,I used to love this ap there were lots of opti...,2,4,4.17.0,2020-10-25,,,4.17.0,Headspace: Meditation & Sleep


In [None]:
# Inspect the manually labelled reviews after the preprocessing cell above
manually_labelled_df_copy

Unnamed: 0,content,length,positive,negative,label
0,"ver 12.14.1, delete all data menu not function...",23,2,-2,neutral
1,"used to be good. too many pop ups, ads article...",13,2,-4,negative
2,i couldn't even sign up i contacted fliphelp n...,12,1,-3,negative
3,for what i know about this app and it works re...,19,3,-1,positive
4,"love the app, though on occasions what is play...",95,3,-5,negative
...,...,...,...,...,...
9097,Personally i love the feature in which i can t...,310,3,-1,positive
9098,Seems to be a bug or something on android vers...,245,1,-1,neutral
9099,"This is a very good app, but you cannot downlo...",487,2,-2,neutral
9100,Its actually a nice app but these days i have ...,307,2,-2,neutral


## SAMPLE A DATASET FOR MANUAL LABELLING

In [None]:
# Function to sample data
def sample_data(group, n_samples=50, random_state=1):
    """Draw up to ``n_samples`` rows from ``group`` without replacement.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to sample from (one group of the reviews frame).
    n_samples : int, default 50
        Maximum number of rows to return. A group with fewer rows is returned
        in full, in sampled order, instead of raising.
    random_state : int, default 1
        Seed forwarded to ``DataFrame.sample`` so the draw is repeatable.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, with their original index labels.
    """
    return group.sample(n=min(n_samples, len(group)), random_state=random_state)

# Sample each 'source' group separately and stack the results.
# The groups are iterated explicitly instead of using groupby(...).apply(...):
# apply() operating on the grouping column is deprecated in pandas 2.2, and once
# grouping columns are excluded from the applied frames the 'source' column
# would be missing from the result. Iterating always yields the full sub-frame
# (including 'source'), in sorted key order, and ignore_index=True gives the
# same fresh 0..n-1 index that reset_index(drop=True) produced.
sampled_data = pd.concat(
    [sample_data(group) for _, group in app_reviews_df_copy.groupby('source')],
    ignore_index=True,
)

# Calculate the length of each content
sampled_data['length'] = sampled_data['content'].apply(len)

sampled_data

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,source,length
0,30aacbea-3411-4b52-96db-6fb56adc0be1,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"Ads used to be bearable, but now they added in...",1,4,4.7.1.13 GNL,2020-10-12,,,4.7.1.13 GNL,BBC: World News & Stories,188
1,da910a0a-eaef-4e6b-8def-f7a8aad268ac,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Nice but does put on some bias when writing th...,4,1,5.6.0.100,2019-05-11,,,5.6.0.100,BBC: World News & Stories,119
2,fa1778d9-2048-4864-a9d7-389e89f2d8eb,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Nice interface. Better content than other news...,5,2,5.18.0,2021-08-25,,,5.18.0,BBC: World News & Stories,133
3,13c2711b-f373-4373-b6c3-46fbbd0062de,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,WHY DO NEWS ORGANIZATION INSIST ON MAKING MY O...,3,189,8.0.1.3,2024-04-04,,,8.0.1.3,BBC: World News & Stories,188
4,cf24dbec-4aa3-463c-9ce7-67c3ea7f2744,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,The news reporting is great. The new design of...,2,2,7.1.0.5385,2023-10-27,,,7.1.0.5385,BBC: World News & Stories,186
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,5cdea7b6-b53e-46fa-bb79-127c58046bb9,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Personally i love the feature in which i can t...,5,159,2.24.11.79,2024-06-17,,,2.24.11.79,WhatsApp Messenger,310
496,50c50cba-c54e-4b01-a56e-9cd4c16497ce,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Seems to be a bug or something on android vers...,4,21,2.24.12.78,2024-06-21,,,2.24.12.78,WhatsApp Messenger,245
497,6ee99a2a-187a-48bb-a55a-a5a3456ad79c,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"This is a very good app, but you cannot downlo...",2,22,2.24.12.78,2024-06-28,,,2.24.12.78,WhatsApp Messenger,487
498,1ba4866b-0da7-4ff4-9fe8-424650506012,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Its actually a nice app but these days i have ...,1,6,2.24.13.77,2024-07-09,,,2.24.13.77,WhatsApp Messenger,307


In [None]:
# Drop every review whose reviewId went into the manual-labelling sample
is_sampled = app_reviews_df_copy['reviewId'].isin(sampled_data['reviewId'])
app_reviews_df_copy = app_reviews_df_copy[~is_sampled]

# Keep only content, score, at and source (plus length for the sample)
kept_columns = ['content', 'score', 'at', 'source']
app_reviews_df_copy = app_reviews_df_copy[kept_columns]
sampled_data = sampled_data[kept_columns + ['length']]

In [None]:
# (rows, columns) of the remaining reviews, then of the labelling sample
for frame in (app_reviews_df_copy, sampled_data):
    print(frame.shape)

(48617, 4)
(500, 5)


In [None]:
# Inspect the sample, now reduced to content, score, at, source and length
sampled_data

Unnamed: 0,content,score,at,source,length
0,"Ads used to be bearable, but now they added in...",1,2020-10-12,BBC: World News & Stories,188
1,Nice but does put on some bias when writing th...,4,2019-05-11,BBC: World News & Stories,119
2,Nice interface. Better content than other news...,5,2021-08-25,BBC: World News & Stories,133
3,WHY DO NEWS ORGANIZATION INSIST ON MAKING MY O...,3,2024-04-04,BBC: World News & Stories,188
4,The news reporting is great. The new design of...,2,2023-10-27,BBC: World News & Stories,186
...,...,...,...,...,...
495,Personally i love the feature in which i can t...,5,2024-06-17,WhatsApp Messenger,310
496,Seems to be a bug or something on android vers...,4,2024-06-21,WhatsApp Messenger,245
497,"This is a very good app, but you cannot downlo...",2,2024-06-28,WhatsApp Messenger,487
498,Its actually a nice app but these days i have ...,1,2024-07-09,WhatsApp Messenger,307


In [None]:
# Save the sampled data to a new CSV file
sample_csv_path = '/content/drive/MyDrive/Dissertation24/SampleReviews_ManualLabelling.csv'
sampled_data.to_csv(sample_csv_path, index=False)

# Report the file name that was actually written; the previous message named
# 'SampleReviews_ManuallyLabelled.csv', which is not the file created above.
print(f"Reviews have been saved to {sample_csv_path.rsplit('/', 1)[-1]}")

Reviews have been saved to SampleReviews_ManuallyLabelled.csv


## DATA CLEANING

In [None]:
# Define comprehensive text cleaning function

# Stop-word set and lemmatizer shared by every clean_text call. They are filled
# in on first use so that defining the function does not require the NLTK
# corpora to be present yet.
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the cached stop-word set and lemmatizer, building them on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Populate in one step so a failure above never leaves a half-filled cache
        _CLEAN_TEXT_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _CLEAN_TEXT_RESOURCES


def clean_text(text):
    """Normalise one review into a space-separated string of lemmatised tokens.

    Steps, in order: decode HTML entities, lowercase, remove emojis, remove
    URLs / @mentions / #hashtags, remove punctuation and other non-word
    characters, remove digits, tokenize, drop English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; an empty string when
        nothing survives the cleaning.
    """
    resources = _get_clean_text_resources()
    stop_words = resources['stop_words']
    lemmatizer = resources['lemmatizer']

    # Decode HTML entities first. Doing this after punctuation removal (as
    # before) never decoded anything: '&amp;' had already lost its '&' and ';'
    # and was left behind as the token 'amp'. It also precedes lowercasing
    # because named entities are case-sensitive.
    text = html.unescape(text)

    # Convert to lowercase
    text = text.lower()

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Remove URLs, mentions, and hashtags
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)

    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(tokens)

# App reviews: clean the text, record the cleaned length, drop reviews left empty
app_reviews_df_copy['content'] = app_reviews_df_copy['content'].apply(clean_text)
app_reviews_df_copy['length'] = app_reviews_df_copy['content'].apply(len)
app_reviews_df_copy = app_reviews_df_copy[app_reviews_df_copy['length'].gt(0)]

# Manually labelled reviews: the same three steps
manually_labelled_df_copy['content'] = manually_labelled_df_copy['content'].apply(clean_text)
manually_labelled_df_copy['length'] = manually_labelled_df_copy['content'].apply(len)
manually_labelled_df_copy = manually_labelled_df_copy[manually_labelled_df_copy['length'].gt(0)]

app_reviews_df_copy

Unnamed: 0,content,score,at,source,length
0,use great app thing great end end encryption u...,4,2024-07-25,WhatsApp Messenger,244
1,whatsapp android goto messaging app incredibly...,5,2024-07-16,WhatsApp Messenger,234
2,good app although could made lot better visual...,3,2024-07-26,WhatsApp Messenger,250
3,recently got new android phone tried verify ne...,1,2024-07-09,WhatsApp Messenger,262
4,extremely disappointing since facebook got inv...,1,2024-05-22,WhatsApp Messenger,307
...,...,...,...,...,...
49995,great app usually help calm always understood ...,5,2019-12-26,Headspace: Meditation & Sleep,209
49996,download function doesnt work properly even th...,2,2019-09-05,Headspace: Meditation & Sleep,136
49997,used use daily basis thing listened locked beh...,1,2019-10-20,Headspace: Meditation & Sleep,197
49998,used love ap lot option still available free s...,2,2020-10-25,Headspace: Meditation & Sleep,269


In [None]:
# Inspect the manually labelled reviews after clean_text was applied to 'content'
manually_labelled_df_copy

Unnamed: 0,content,length,positive,negative,label
0,ver delete data menu function data still chat ...,78,2,-2,neutral
1,used good many pop ups ad article dont load an...,51,2,-4,negative
2,couldnt even sign contacted fliphelp mail till,46,1,-3,negative
3,know app work really good find exactly youre l...,52,3,-1,positive
4,love app though occasion played something inte...,266,3,-5,negative
...,...,...,...,...,...
9097,personally love feature text also meta ai howe...,164,3,-1,positive
9098,seems bug something android version trying quo...,163,1,-1,neutral
9099,good app download status back bring feature co...,298,2,-2,neutral
9100,actually nice app day encoutering lot problem ...,174,2,-2,neutral


## SPLITTING CLEANED DATASET INTO TRAIN AND TEST

In [None]:
# Hold out 20% of the cleaned reviews for testing and train on the other 80%;
# the fixed seed makes the split repeatable
TEST_FRACTION = 0.2
SPLIT_SEED = 42
train_df, test_df = train_test_split(
    app_reviews_df_copy,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Display the first few rows of the training and testing datasets
# (.head() so the cell shows what the comment promises rather than the whole frame)
print(train_df.head())
print(test_df.head())

                                                 content  score          at  \
17223                                          fantastic      5  2020-03-14   
2121   recent update notification function working pr...      1  2024-07-16   
42384  fantastic app noticed added cost ireland used ...      5  2021-12-04   
33701  horrible experience ever got contacted deliver...      1  2021-05-20   
21737  thre mediation fall asleep nice ability wake w...      3  2022-08-08   
...                                                  ...    ...         ...   
11401  clear concise comprehensive impartial everythi...      5  2021-10-13   
46088  great app went ahead purchased subscription us...      4  2019-04-16   
39446  kyc process broken take long get agent close c...      1  2020-07-01   
866    although good initiative meta introduce ai act...      3  2024-07-11   
15957                      best apps bus time thanjs f l      5  2020-10-12   

                               source  length  
172

In [None]:
# Write the full cleaned set and both splits to Drive, confirming each file
output_dir = '/content/drive/MyDrive/Dissertation24'
for frame, file_name in (
    (app_reviews_df_copy, 'Cleaned_Reviews.csv'),
    (train_df, 'Training_Reviews.csv'),
    (test_df, 'Testing_Reviews.csv'),
):
    frame.to_csv(f'{output_dir}/{file_name}', index=False)
    print(f"Reviews have been saved to {file_name}")

Reviews have been saved to Cleaned_Reviews.csv
Reviews have been saved to Training_Reviews.csv
Reviews have been saved to Testing_Reviews.csv


In [None]:
# Write the cleaned manually labelled reviews to Drive and confirm the file name
labelled_csv_name = 'Cleaned_Manually_Labelled_Reviews.csv'
manually_labelled_df_copy.to_csv(f'/content/drive/MyDrive/Dissertation24/{labelled_csv_name}', index=False)
print(f"Reviews have been saved to {labelled_csv_name}")

Reviews have been saved to Cleaned_Manually_Labelled_Reviews.csv
