# Import all the libraries required for this study

In [None]:
# Download the other packages
# kaleido is installed/upgraded (-U) but never imported in this notebook;
# presumably it is pulled in for plotly static-image export — TODO confirm.
!pip install -U kaleido
# pyLDAvis is imported below and used to render the LDA topic visualisation.
!pip install pyLDAvis

In [None]:
# Data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import jv
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from scipy import stats
from pprint import pprint
from wordcloud import WordCloud
import warnings
# Text analysis
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
# Clustering models
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from scipy.spatial.distance import cdist
# LDA topic modelling
import gensim
from gensim.utils import simple_preprocess
import tqdm
from gensim.models import CoherenceModel
import spacy
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis.gensim
import pickle
import pyLDAvis
# Sentiment Analysis
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import plotly.express as px

# Fetch the NLTK corpora/models this notebook relies on.
for nltk_resource in (
    'punkt',
    'wordnet',
    'stopwords',
    'words',
    'averaged_perceptron_tagger',
    'vader_lexicon',
):
    nltk.download(nltk_resource)

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# NOTE: this rebinds the imported `stopwords` corpus reader to a plain list of
# English stop words plus a few extra tokens; later cells read it by this name.
stopwords = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

# Import dataset

In [None]:
# Colab-only step: ask the user to upload the dataset into the runtime.
# The result is stored in `uploaded` but is not read anywhere in this notebook;
# the workbook itself is loaded from './tiktok_df.xlsx' in the next cell.
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the raw workbook, then keep one row per video id and per description,
# discard rows whose description is missing or empty, and renumber the index.
df = pd.read_excel('./tiktok_df.xlsx')
selected_df = (
    df.drop_duplicates(subset='id')
    .drop_duplicates(subset='description')
    .loc[lambda frame: frame['description'].notnull() & (frame['description'] != '')]
    .reset_index(drop=True)
)

# Text preprocessing

## Helper function

In [None]:
# Shared NLP resources used by the helper functions below.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Lower-cased NLTK English word list, kept as a set.
english_words = {item.lower() for item in words.words()}

# Platform/hashtag noise tokens that are removed during filtering.
spam_words = [
    'viral', 'trend', 'xuhuong', 'video', 'fyp', 'foryou', 'foryoupage',
    'fypviral', 'fypage', 'tiktok', 'fypforyou', 'fouryou',
    'fyppppppppppppppppppppppp', 'viralvideo',
    'foryourpage', 'pov', 'trending', 'capcut', 'viraltiktok', 'new',
    'сериал', 'bio', 'link', 'foryoupageofficial', 'fypp', 'xuhuongtiktok',
    'xybca', 'fordig', 'youtube', 'name', 'edit', 'edits', 'xyzbca', 'know',
    'part', 'back',
]

# spaCy English pipeline with the parser and NER components switched off.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

def filtering(description, english_only=False):
    """Tokenize, de-noise and lemmatize one description string.

    Tokens that appear in the notebook-level ``stopwords`` or ``spam_words``
    lists (exact, case-sensitive match) are dropped; the remaining tokens are
    lemmatized with the shared ``lemmatizer``.

    Args:
        description: Text to clean.
        english_only: When True, additionally keep only lemmas whose
            lower-cased form is in ``english_words``. The default (False)
            reproduces the previous output: the English-word filter used to
            be computed on every call but its result was never returned.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    tokens = nltk.word_tokenize(description)
    kept = [tok for tok in tokens if tok not in stopwords and tok not in spam_words]
    lemmas = [lemmatizer.lemmatize(tok) for tok in kept]
    if english_only:
        lemmas = [lemma for lemma in lemmas if lemma.lower() in english_words]
    return ' '.join(lemmas)

def remove_emoji(text):
    """Return *text* with every run of characters in the listed ranges removed.

    NOTE(review): the "enclosed characters" range spans U+24C2..U+1F251, so
    it also removes many non-emoji characters (CJK ideographs, for example).
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",  # dingbats
        "\U000024C2-\U0001F251",  # enclosed characters
        "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
        "\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
        "\U0001F700-\U0001F77F",  # Alchemical Symbols
        "\U0001F780-\U0001F7FF",  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF",  # Supplemental Arrows-C
        "\U0001FA00-\U0001FA6F",  # Chess Symbols
        "\U00002700-\U000027BF",  # Dingbats
        "\U0001F1E6-\U0001F1FF",  # Regional Indicator Symbols
        "\U0001F004",             # Mahjong Tile Red Dragon
        "\U0001F0CF",             # Playing Card Black Joker
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub('', text)

def text_cleaning_custom(text):
    """Keep only the word tokens of *text* that are longer than three characters.

    Words are the ``\\b\\w+\\b`` regex matches; the survivors are returned as
    one space-separated string.
    """
    return ' '.join(
        token for token in re.findall(r'\b\w+\b', text) if len(token) > 3
    )

def text_preprocess(df):
    """Clean ``df['description']`` and add ``token`` and ``sentence`` columns.

    For every description: listed punctuation characters become spaces, digit
    runs are deleted, ``remove_emoji`` and ``text_cleaning_custom`` are
    applied, and the result is lower-cased. The cleaned strings are stored in
    ``df['token']``; ``df['sentence']`` is ``filtering`` applied to them.

    Args:
        df: DataFrame with a string ``description`` column. Modified in place.

    Returns:
        None.
    """
    # Raw string: the previous literal relied on the invalid escape "\," which
    # newer Python versions warn about. The character set is unchanged.
    punctuations = r'''()-[]{};:"\,<>./?@#$%^&*£!~ '''
    # One translation pass replaces the per-character string concatenation.
    to_space = str.maketrans({char: ' ' for char in punctuations})
    digit_runs = re.compile(r'\d+')

    text_list = []
    for text in df['description']:
        sentence = text.translate(to_space)
        sentence = digit_runs.sub('', sentence)
        sentence = remove_emoji(sentence)
        sentence = text_cleaning_custom(sentence)
        text_list.append(sentence.lower())

    df['token'] = text_list
    df['sentence'] = df['token'].apply(filtering)

def calculate_confidence_interval_95(dataframe, column):
    """Return the 95% Student-t confidence interval for the mean of a column.

    Missing values are dropped before computing the mean, the standard error
    and the degrees of freedom, so all three are based on the same sample.
    (Previously the mean skipped NaN while the standard error did not, which
    turned the whole interval into NaN as soon as one value was missing.)

    Args:
        dataframe: DataFrame holding the data.
        column: Name of the numeric column to summarise.

    Returns:
        A ``(lower, upper)`` tuple. ``(nan, nan)`` when fewer than two
        non-missing values are available.
    """
    values = dataframe[column].dropna()
    if len(values) < 2:
        # The standard error is undefined for fewer than two observations.
        nan = float('nan')
        return (nan, nan)

    mean = values.mean()
    sem = stats.sem(values)
    return stats.t.interval(0.95, len(values) - 1, loc=mean, scale=sem)

def _top_ngrams(corpus, n, ngram_range):
    """Return the *n* most frequent n-grams of *corpus* as (term, count) pairs.

    English stop words are excluded by the vectorizer. The corpus is consumed
    in a single ``fit_transform`` pass, so one-shot iterables work too.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total count of each term over all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


# Unigram
def get_top_n_words(corpus, n=10):
    """Return the *n* most frequent single words as (word, count) pairs."""
    return _top_ngrams(corpus, n, (1, 1))


# Bigram
def get_top_n_bigrams(corpus, n=10):
    """Return the *n* most frequent two-word phrases as (phrase, count) pairs."""
    return _top_ngrams(corpus, n, (2, 2))

def plot_horizontal_bar_chart(data, title):
    """Show a horizontal bar chart for a sequence of (label, frequency) pairs.

    Bars are drawn in the given order from top to bottom, so a list sorted by
    descending frequency puts the most frequent entry first.
    """
    labels, counts = zip(*data)
    _, axis = plt.subplots(figsize=(10, 6))
    axis.barh(labels, counts, color='#5ca7f7')
    axis.set_xlabel('Frequency')
    axis.set_ylabel('Words')
    # Flip the y-axis so the first pair ends up at the top of the chart.
    axis.invert_yaxis()
    axis.set_title(title)
    plt.show()

def print_top_terms_per_k_mean_cluster(tfidf_df, terms, num_terms=10,
                                       kmeans_model=None, pca_model=None):
    """Print and return the highest-weighted terms of every K-Means centroid.

    Centroids are mapped from PCA space back to term space with
    ``pca_model.inverse_transform`` and the ``num_terms`` largest entries of
    each centroid are looked up in ``terms``.

    Args:
        tfidf_df: Unused; kept so existing calls keep working.
        terms: Sequence of term names, indexed by feature column.
        num_terms: How many terms to report per cluster.
        kmeans_model: Fitted model exposing ``cluster_centers_``. Defaults to
            the notebook-level ``kmeans``.
        pca_model: Fitted reducer exposing ``inverse_transform``. Defaults to
            the notebook-level ``pca``.

    Returns:
        Dict mapping the printed 1-based cluster number to its list of terms.
    """
    # Fall back to the notebook globals so existing two-argument calls work.
    if kmeans_model is None:
        kmeans_model = kmeans
    if pca_model is None:
        pca_model = pca

    original_centroids = pca_model.inverse_transform(kmeans_model.cluster_centers_)

    cluster_terms = {}
    for number, centroid in enumerate(original_centroids, start=1):
        # argsort is ascending: take the tail and reverse for largest-first.
        top_indices = centroid.argsort()[-num_terms:][::-1]
        top_terms = [terms[idx] for idx in top_indices]
        cluster_terms[number] = top_terms
        print(f"Cluster {number}:")
        print(", ".join(top_terms))
        print("\n")
    return cluster_terms

# Hierarchical
def compute_wss_hierarchical(data, labels):
    """Return the within-cluster sum of squares for a labelled data matrix.

    For every distinct label, the squared deviations of that cluster's rows
    from the cluster mean are summed; the per-cluster sums are added up.
    """
    def _cluster_sse(label):
        members = data[labels == label]
        return ((members - members.mean(axis=0)) ** 2).sum()

    return sum(_cluster_sse(label) for label in np.unique(labels))

def print_top_terms_per_hierarchical_cluster(tfidf_df, terms, num_terms=10):
    """Print and return the terms with the highest mean weight per cluster.

    The cluster labels are taken from ``tfidf_df['cluster']`` itself, so the
    function no longer depends on a notebook-level ``max_clusters`` and never
    reports a label that has no rows.

    Args:
        tfidf_df: One row per document, one column per term, plus a
            ``cluster`` column holding the cluster label.
        terms: Unused; kept so existing calls keep working (term names come
            from the column labels).
        num_terms: How many terms to report per cluster.

    Returns:
        Dict mapping each cluster label to its list of top terms.
    """
    cluster_terms = {}
    for cluster in sorted(tfidf_df['cluster'].unique()):
        cluster_data = tfidf_df[tfidf_df['cluster'] == cluster].drop(columns=['cluster'])
        cluster_mean = cluster_data.mean(axis=0)
        top_terms = cluster_mean.sort_values(ascending=False).head(num_terms).index.tolist()
        cluster_terms[cluster] = top_terms
        print(f"Cluster {cluster}:")
        print(", ".join(top_terms))
        print("\n")
    return cluster_terms

def tokenize_LDA_model(sentences):
    """Lazily yield one token list per item of *sentences*.

    Each item is converted with ``str()`` and tokenized by gensim's
    ``simple_preprocess`` with ``deacc=True``.
    """
    yield from (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )


def make_bigrams(texts, bigram_mod):
    """Return a list with ``bigram_mod[doc]`` for every document in *texts*."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts, trigram_mod, bigram_mod):
    """Return ``trigram_mod[bigram_mod[doc]]`` for every document in *texts*."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN',)):
    """Lemmatize tokenized documents, keeping only tokens with allowed POS tags.

    Each document's tokens are joined with spaces, run through the
    notebook-level spaCy pipeline ``nlp``, and the lemma of every token whose
    ``pos_`` is in *allowed_postags* is kept. Tag names are described at
    https://spacy.io/api/annotation.

    Args:
        texts: Iterable of token lists.
        allowed_postags: Collection of POS tag names to keep. The default is
            an immutable tuple (previously a shared mutable list).

    Returns:
        A list with one list of lemmas per input document.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def compute_coherence_values(corpus, dictionary, text, k, a, b):
    """Train an LDA model with the given hyperparameters and return its c_v coherence.

    Args:
        corpus: Bag-of-words corpus used to train the model.
        dictionary: gensim dictionary; used both as the model's ``id2word``
            and for the coherence computation.
        text: Tokenized documents passed to the coherence model as ``texts``.
        k: Number of topics.
        a: Value passed as the model's ``alpha``.
        b: Value passed as the model's ``eta``.

    Returns:
        The coherence score reported by ``CoherenceModel.get_coherence()``.
    """
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=5,
                                           alpha=a,
                                           eta=b)

    # Use the dictionary that was passed in; the previous code silently read
    # the notebook-level `id2word` here and ignored the parameter.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=text,
                                         dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

def get_dominant_topic(lda_model, corpus):
    """Return the 1-based id of the most probable topic for every document.

    For each bag-of-words document the pair with the largest second element
    returned by ``lda_model.get_document_topics`` is selected (the first one
    wins on ties) and its topic id is shifted by one.
    """
    def _best_topic_number(doc_bow):
        best_pair = max(lda_model.get_document_topics(doc_bow), key=lambda pair: pair[1])
        return best_pair[0] + 1

    return [_best_topic_number(doc_bow) for doc_bow in corpus]

# Sentiment analysis
def classify_sentiment(compound):
    """Map a compound score to "Positive", "Negative" or "Neutral".

    Scores above zero are positive, scores below zero are negative and
    everything else is neutral.
    """
    if compound > 0:
        return "Positive"
    if compound < 0:
        return "Negative"
    return "Neutral"

# Visualization insights
def convert_num_cluster_to_text(num):
    """Translate a cluster number (1-8) into its category label.

    Returns None for any value that does not equal one of the eight numbers.
    """
    category_labels = (
        "Entertainment",
        "Movie and Music",
        "Travel",
        "Lifestyle",
        "Hobby",
        "Sport and Comedy",
        "Social Media",
        "Humour",
    )
    # Compare by equality, in ascending order, and return the first match.
    for cluster_number, label in enumerate(category_labels, start=1):
        if num == cluster_number:
            return label
    return None

def calculate_category_distribution(df, total_count, attribute):
    """Count rows per value of *attribute* and express each count as a percentage.

    The result has the columns ``attribute``, ``Amount`` and ``% Amount``
    (``Amount / total_count * 100`` rounded to two decimals), sorted by
    ``Amount`` in descending order.
    """
    counts = df.groupby(attribute).size().reset_index(name='Amount')
    counts['% Amount'] = ((counts['Amount'] / total_count) * 100).round(2)
    return counts.sort_values(by='Amount', ascending=False)

def find_popular_distribution(df, attribute, label_attribute, n):
    """Summarise, per ``Category``, the *n* rows with the largest *attribute*.

    The result has the columns ``Category``, ``AMOUNT`` (row count among the
    top *n*) and *label_attribute* (mean of *attribute* for those rows),
    sorted by ``AMOUNT`` in descending order.
    """
    top_rows = df.sort_values(attribute, ascending=False).head(n)
    summary = top_rows.groupby('Category')[attribute].agg(['count', 'mean']).reset_index()
    summary.columns = ['Category', 'AMOUNT', label_attribute]
    return summary.sort_values('AMOUNT', ascending=False)

In [None]:
muslim_keywords = [
    "allah", "muslimtiktok", "اسلام", "مسلم", "اسلامی", "قرآن",
    "حديث", "الحمدلله", "الله", "الدين", "محمد", "ابو_زير", "الدعاء",
    "مسجد", "الحجاب", "الصلاة", "الصوم", "الجهاد", "رمضان",
    "مكة", "المدينة", "الشهادة", "الزكاة", "الإسلام", "القرآن",
    "الدعوة", "شريعة", "أمة", "خليفة", "الفتوى", "الحديث",
    "عمرة", "حج", "الحلال", "الحرام", "النبى", "الصحابة", "الجنة",
    "جهنم", "يوم_القيامة", "شهيد", "الخلافة", "المسلمون"
]


def _mentions_keyword(description):
    """Return True when the lower-cased description contains any listed keyword."""
    lowered = description.lower()
    return any(keyword in lowered for keyword in muslim_keywords)


# Drop the rows whose description contains one of the keywords (substring match).
has_keyword = selected_df['description'].apply(_mentions_keyword)
selected_df = selected_df[~has_keyword]

# Add the cleaned 'token' / 'sentence' columns, then keep only the rows that
# still have some text after cleaning.
text_preprocess(selected_df)
has_text = selected_df['sentence'].notnull() & (selected_df['sentence'] != '')
selected_df = selected_df[has_text]

# Exploratory Data Analysis

In [None]:
print(selected_df.shape)

# 95% confidence interval of the mean for each engagement counter.
for count_column in ('playCount', 'likeCount', 'commentCount', 'collectCount', 'shareCount'):
    print(calculate_confidence_interval_95(selected_df, count_column))

# Statistical summary of the dataset. Printed explicitly: a bare expression in
# the middle of a cell is evaluated but never displayed.
print(selected_df.describe().T)

num_cols = selected_df.select_dtypes(include=np.number).columns.tolist()

for col in num_cols:
    plt.figure(figsize=(15, 4))
    plt.subplot(1, 2, 1)
    selected_df[col].hist(grid=False)
    plt.ylabel('Count')
    plt.subplot(1, 2, 2)
    # Plot the same filtered data as the histogram. The raw `df` was used here
    # before, which showed unfiltered rows and lacks columns that exist only
    # on `selected_df`.
    sns.boxplot(x=selected_df[col])
    plt.show()

# Word Cloud Visualization

In [None]:
# Word cloud of the raw descriptions (before any text cleaning).
wordcloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)

long_string_without_cleaning = ','.join(selected_df['description'])
wordcloud.generate(long_string_without_cleaning)
# Save the word cloud as a JPG image.
wordcloud.to_file("wordcloud_v1.jpg")

In [None]:
# Word cloud of the cleaned 'sentence' column.
wordcloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)

long_string = ','.join(selected_df['sentence'])
wordcloud.generate(long_string)
# Save the word cloud as a JPG image.
wordcloud.to_file("wordcloud_v2.jpg")

# Unigram and Bigram Analysis

In [None]:
top_10_unigrams = get_top_n_words(selected_df['sentence'], 10)
top_10_bigrams = get_top_n_bigrams(selected_df['sentence'], 10)

# Print both rankings as "term: frequency" lines under their headings.
for heading, ranking in (
    ("Top 10 Unigrams:", top_10_unigrams),
    ("\nTop 10 Bigrams:", top_10_bigrams),
):
    print(heading)
    for word, freq in ranking:
        print(f"{word}: {freq}")

# Plot both rankings as horizontal bar charts.
plot_horizontal_bar_chart(top_10_unigrams, 'Top 10 Unigrams')
plot_horizontal_bar_chart(top_10_bigrams, 'Top 10 Bigrams')

# Partitional clustering models
- Build model
- Evaluation model
- Interpret the topic modelling (Word list)

# TF-IDF transformation to a document-term feature matrix

In [None]:
# TF-IDF over unigrams, bigrams and trigrams, capped at 10000 features
# (adjust max_features as needed).
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)

# Cleaned sentences as a unicode array, one entry per document.
documents = selected_df['sentence'].values.astype("U")

# Sparse TF-IDF matrix and its dense copy (used by PCA and the term tables).
features = vectorizer.fit_transform(documents)
features_dense = features.toarray()

# Principal Component Analysis

In [None]:
# Project the dense TF-IDF matrix onto 7 principal components.
pca = PCA(n_components=7)
features_reduced = pca.fit_transform(features_dense)

# Scree plot: explained-variance ratio of each component.
explained_ratio = pca.explained_variance_ratio_
component_numbers = range(1, len(explained_ratio) + 1)
plt.plot(component_numbers, explained_ratio)
plt.title('Scree Plot')
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance Ratio')
plt.show()

# K-mean text clustering

# Do the hyperparameter tuning
- to find the optimal k-value

In [None]:
# Fit K-Means for k = 2..19 and record the inertia (WSS) of every fit.
K = range(2, 20)
wss = [
    KMeans(n_clusters=n_clusters, init="k-means++", random_state=200)
    .fit(features_reduced)
    .inertia_
    for n_clusters in K
]

metrics_centers = pd.DataFrame({'Clusters': K, 'WSS': wss})

# Plot the elbow method (WSS)
fig, ax = plt.subplots(1, 1, figsize=(18, 5))
sns.lineplot(ax=ax, x='Clusters', y='WSS', data=metrics_centers, marker='+')
ax.set_title('Within-Cluster Sum of Squares (WSS)')

plt.show()

In [None]:
# Perform K-Means clustering with k clusters.
# k = 11 was selected from the elbow of the WSS curve.
k = 11
# Same seed as the tuning loop, so cluster ids are reproducible between runs.
kmeans = KMeans(n_clusters=k, init="k-means++", random_state=200)

labels = kmeans.fit(features_reduced).labels_

# Keep the score under its own name. Assigning it to `silhouette_score` would
# replace the imported sklearn function with a float and make later calls fail
# with "'numpy.float64' object is not callable".
kmeans_silhouette = metrics.silhouette_score(
    features_reduced,
    labels,
    metric='euclidean',
    sample_size=len(selected_df),
    random_state=200,
)

db_index = metrics.davies_bouldin_score(features_reduced, labels)
print(kmeans_silhouette)
print(db_index)

selected_df['k_mean_cluster'] = kmeans.labels_

# Get feature names (terms) from the vectorizer
terms = vectorizer.get_feature_names_out()

# Create a DataFrame from the original TF-IDF features (before PCA)
tfidf_df = pd.DataFrame(features_dense, columns=terms)
tfidf_df['cluster'] = kmeans.labels_

print_top_terms_per_k_mean_cluster(tfidf_df, terms)

In [None]:
# 3D scatter of play / like / comment counts, coloured by K-Means cluster.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d', elev=40, azim=80)
ax.set_facecolor('white')

axis_columns = ('playCount', 'likeCount', 'commentCount')
sc = ax.scatter(
    *(selected_df[column] for column in axis_columns),
    c=selected_df['k_mean_cluster'],
    cmap='rainbow',
)

ax.set_xlabel("Play Count", labelpad=10)
ax.set_ylabel("Like Count", labelpad=10)
ax.set_zlabel("Comment Count", labelpad=20)
plt.title("Tiktok Video Clustering using K Means", fontsize=14)

# Colour bar for the cluster ids.
plt.colorbar(sc)
plt.show()

# Hierarchical model

In [None]:
# Hierarchical linkage of the PCA-reduced features using the 'ward' method;
# the result is reused by the dendrogram and by every fcluster() cut below.
ward_cluster = linkage(features_reduced, method= 'ward')

In [None]:
# Dendrogram of the Ward linkage, with the documents as leaf labels.
fig, ax = plt.subplots(figsize=(10, 5))
dendrogram(ward_cluster, labels=documents, leaf_rotation=90, leaf_font_size=10, ax=ax)
ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_xlabel('Document')
ax.set_ylabel('Distance')
plt.show()

In [None]:
# Cut the Ward tree into 2..19 clusters and record the WSS of every cut.
K = range(2, 20)
wss = [
    compute_wss_hierarchical(
        features_reduced,
        fcluster(ward_cluster, cluster_count, criterion='maxclust'),
    )
    for cluster_count in K
]

metrics_centers = pd.DataFrame({'Clusters': K, 'WSS': wss})

# Plot the elbow method (WSS)
fig, ax = plt.subplots(1, 1, figsize=(18, 5))
sns.lineplot(ax=ax, x='Clusters', y='WSS', data=metrics_centers, marker='+')
ax.set_title('Within-Cluster Sum of Squares (WSS)')

plt.show()

In [None]:
max_clusters = 9
hirarchical_clusters = fcluster(ward_cluster, max_clusters, criterion='maxclust')

# Call both scorers through the `metrics` module. The bare name
# `silhouette_score` can be rebound to a float elsewhere in the notebook, which
# used to fail here with "'numpy.float64' object is not callable" and forced a
# re-run of the import cell.
sil_score = metrics.silhouette_score(features_reduced, hirarchical_clusters)
db_score = metrics.davies_bouldin_score(features_reduced, hirarchical_clusters)

print(sil_score)
print(db_score)
selected_df['hirarchical_cluster'] = hirarchical_clusters

# Get feature names (terms) from the vectorizer
terms = vectorizer.get_feature_names_out()

# Create a DataFrame from the original TF-IDF features (before PCA)
tfidf_df = pd.DataFrame(features_dense, columns=terms)
tfidf_df['cluster'] = hirarchical_clusters

cluster_terms = print_top_terms_per_hierarchical_cluster(tfidf_df, terms)

# LDA Model

In [None]:
# Tokenize the cleaned sentences.
data_words = list(tokenize_LDA_model(selected_df['sentence'].values.tolist()))

# Phrase detectors: bigrams first, then trigrams on top of the bigram stream
# (a higher threshold yields fewer phrases).
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Replace every document by its trigram-merged token list.
data_words = make_trigrams(data_words, trigram_mod, bigram_mod)

# Dictionary and bag-of-words corpus built from the merged documents.
texts = data_words
id2word = corpora.Dictionary(texts)
corpus = list(map(id2word.doc2bow, texts))

# Hyperparameter Tuning
## C_v
- The C_v measure is based on a sliding window, a one-set segmentation of the top words, and an indirect confirmation measure that uses normalized pointwise mutual information (NPMI) and cosine similarity.

In [None]:
# Number of topics to try (the range end is exclusive, so only 8 is evaluated).
min_topics, max_topics, step_size = 8, 9, 1
topics_range = range(min_topics, max_topics, step_size)

# Candidate values for the alpha and eta priors: 0.01, 0.11, ..., 0.91.
alpha = list(np.arange(0.01, 1, 0.1))
beta = list(np.arange(0.01, 1, 0.1))

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [corpus]

model_results = {'Topics': [], 'Alpha': [], 'Beta': [], 'Coherence': []}
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN'])

# Full grid over corpus set x topics x alpha x beta. Can take a long time to run.
for corpus_set in corpus_sets:
    for k in topics_range:
        for a in alpha:
            for b in beta:
                # Coherence score for this parameter combination.
                cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                              text=data_lemmatized, k=k, a=a, b=b)
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)

In [None]:
# Export the LDA tuning results (one row per parameter combination collected
# in `model_results`) to an Excel workbook, without the index column.
# NOTE(review): the file name spells "coherance"; it is left untouched because
# it is the path the notebook writes at runtime.
file_path = 'coherance_results.xlsx'
pd.DataFrame(model_results).to_excel(file_path, index=False)

In [None]:
# Plot hard-coded coherence scores against the number of topics to pick the
# best-performing setting.
plt.style.use("_classic_test_patch")
y = [
    0.47502992152505985,
    0.4739038870968999,
    0.5191712392844225,
    0.5696449051925503,
    0.5540307696200729,
    0.5702856314238206,
    0.5962194674972741,
    0.5820308009596259,
]
# Topic counts 2..9, one per score above.
x = list(range(2, 2 + len(y)))

plt.plot(x, y, marker='o', color='#2C15FF')
plt.xlabel('the number of topics')
plt.ylabel('Coherence Score')
plt.show()

# Build LDA with optimal value

In [None]:
num_topics = 8

# Hyperparameters of the final LDA model.
lda_params = {
    'corpus': corpus,
    'id2word': id2word,
    'num_topics': num_topics,
    'random_state': 100,
    'chunksize': 100,
    'passes': 10,
    'alpha': 0.81,
    'eta': 0.01,
}
lda_model = gensim.models.LdaMulticore(**lda_params)

# c_v coherence of the final model, computed on the trigram-merged documents.
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v'
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

# Topic distribution for each document, and the dominant (1-based) topic per row.
doc_lda = lda_model.get_document_topics(corpus, minimum_probability=0)
selected_df['LDA_cluster'] = get_dominant_topic(lda_model, corpus)

# Interactive view of the topic distribution of the LDA model.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds', R=10)
vis

In [None]:
# Table with the dominant topic of every document, its share, and the topic's keywords.
data_dict = {'dominant_topic': [], 'perc_contribution': [], 'topic_keywords': []}

for doc_topics in lda_model[corpus]:
    if not doc_topics:
        # Nothing to record for a document without any topic entry.
        continue
    # Highest-probability topic; the first one wins on ties.
    topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
    topic_keywords = ", ".join(word for word, _ in lda_model.show_topic(topic_num))
    data_dict['dominant_topic'].append(int(topic_num))
    data_dict['perc_contribution'].append(round(prop_topic, 3))
    data_dict['topic_keywords'].append(topic_keywords)

df_topics = pd.DataFrame(data_dict)
print(df_topics)

# Map the dominant topic (LDA_cluster) of each video to its category label.
selected_df["Category"] = selected_df["LDA_cluster"].apply(convert_num_cluster_to_text)

# Sentiment Analysis

In [None]:
sia = SentimentIntensityAnalyzer()
selected_df['token'] = selected_df['sentence'].apply(nltk.word_tokenize)

# Polarity scores of every cleaned sentence, keyed by video id.
res = {
    row['id']: sia.polarity_scores(row['sentence'])
    for _, row in tqdm(selected_df.iterrows(), total=len(selected_df))
}

# One row per id with the score columns, joined back onto the video data.
vaders = (
    pd.DataFrame(res).T
    .reset_index()
    .rename(columns={'index': 'id'})
    .merge(selected_df, how='left')
)
# Label every row from its compound score.
vaders['Sentiment'] = vaders['compound'].apply(classify_sentiment)

In [None]:


# Size and percentage share of every K-Means cluster, largest first.
# The shared helper is used instead of value_counts().reset_index(name=...),
# whose label column name differs between pandas versions.
total_count = len(selected_df)
result = calculate_category_distribution(
    selected_df, total_count, 'k_mean_cluster'
).reset_index(drop=True)
result

In [None]:
# Plot the proportion of each sentiment label (positive, neutral, negative).
# The shared helper always names the label column "Sentiment"; with
# value_counts().reset_index(name=...) that column is called "index" on older
# pandas versions, which breaks the x="Sentiment" lookup below.
total_count = len(vaders)
result = calculate_category_distribution(
    vaders, total_count, 'Sentiment'
).reset_index(drop=True)

fig = px.bar(result, x="Sentiment", y="% Amount", color="Amount", color_continuous_scale='GnBu')
fig.show()

# Visualize the insights

In [None]:
def _show_category_bars(frame, x, color, scale):
    """Show a horizontal bar chart with one bar per Category and return the figure."""
    chart = px.bar(frame, x=x, y="Category", color=color, orientation='h',
                   color_continuous_scale=scale)
    chart.update_layout(yaxis_title="Category")
    chart.show()
    return chart


# Share of videos per category.
total_rows = len(selected_df)
category_dist_fig = calculate_category_distribution(selected_df, total_rows, 'Category')
fig = _show_category_bars(category_dist_fig, "% Amount", "Amount", 'GnBu')

# Categories of the 100 most-viewed videos.
popular_distribution_top_100 = find_popular_distribution(selected_df, 'playCount', 'Average Views', 100)
fig = _show_category_bars(popular_distribution_top_100, "AMOUNT", "Average Views", 'Emrld')

# Categories of the 100 most-commented videos.
engagement_distribution_top_100 = find_popular_distribution(selected_df, 'commentCount', 'Average Comments', 100)
fig = _show_category_bars(engagement_distribution_top_100, "AMOUNT", "Average Comments", 'Purp')

# Sentiment counts per category, as a percentage of all scored videos.
total_count = len(vaders)
result = vaders.groupby(['Category', 'Sentiment']).size().reset_index(name='Amount')
result['% Amount'] = ((result['Amount'] / total_count) * 100).round(2)

fig = px.bar(result, x="Category", y="% Amount", color="Sentiment", text="Amount")
fig.show()

In [None]:
# Show the 100 most-viewed videos with reader-friendly column names.
n = 100
display_columns = ['id', 'description', 'playCount', 'likeCount', 'Category']
result = (
    selected_df.sort_values(by='playCount', ascending=False)
    .head(n)[display_columns]
    .rename(columns={
        'description': 'Video Description',
        'playCount': 'Views',
        'likeCount': 'Likes',
    })
)

result