# Comments Dataset Cleaning 

In [1]:
import pandas as pd
import numpy as np
import glob

## Combine Comment.csv
- importing and merging all comemnts.csv
- randomizing it
- taking only 10% = 50,000

In [None]:
path = '../data/comments*.csv'
cmtFiles = glob.glob(path)
cmtFilesList = []

for filename in cmtFiles:
    comments_df = pd.read_csv(filename, index_col=None, header=0)
    cmtFilesList.append(comments_df)

comments_df = pd.concat(cmtFilesList, axis=0, ignore_index=True)

comments_df = comments_df.sample(frac=0.1, random_state=42)

In [None]:
videos_df = pd.read_csv('../data/cleaned_videos.csv')

## Exploratory Data Analysis (EDA)

In [4]:
print(comments_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 472501 entries, 3956772 to 4309356
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   kind             472501 non-null  object 
 1   commentId        472501 non-null  int64  
 2   channelId        472501 non-null  int64  
 3   videoId          472501 non-null  int64  
 4   authorId         472501 non-null  int64  
 5   textOriginal     472483 non-null  object 
 6   parentCommentId  51928 non-null   float64
 7   likeCount        472501 non-null  int64  
 8   publishedAt      472501 non-null  object 
 9   updatedAt        472501 non-null  object 
dtypes: float64(1), int64(5), object(4)
memory usage: 39.7+ MB
None


## Data Preprocessing
- drop irrelevant columns
- removing null comments 

In [5]:
comments_df = comments_df.drop(columns=['kind','publishedAt','updatedAt'])

comments_df = comments_df.dropna(subset=['textOriginal'])

print(comments_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 472483 entries, 3956772 to 4309356
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   commentId        472483 non-null  int64  
 1   channelId        472483 non-null  int64  
 2   videoId          472483 non-null  int64  
 3   authorId         472483 non-null  int64  
 4   textOriginal     472483 non-null  object 
 5   parentCommentId  51928 non-null   float64
 6   likeCount        472483 non-null  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 28.8+ MB
None


## Emoji to Text Conversion

In [6]:
pip install demoji

Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl.metadata (9.2 kB)
Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
import demoji

def emojitoText (text):
    return demoji.replace_with_desc(text, sep=' ')

comments_df['textCleaned'] = comments_df['textOriginal'].apply(emojitoText)

## Normalise Fancy Font

In [None]:
from unidecode import unidecode

def normaliseFont (text):
    return unidecode(text)

comments_df['textCleaned'] = comments_df['textOriginal'].apply(normaliseFont)

## Data Removal
- website link
- user ID
- long numbers (bank account / phone numbers)
- punctuation 

In [10]:
import re

urlPattern = r'\b(?:(?:https?:\/\/|www\.)[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?(?:\.[a-z]{2,})+(?::\d{2,5})?(?:\/[^\s]*)?)\b'
comments_df['textCleaned'] = comments_df['textCleaned'].str.replace(urlPattern, '', regex=True)

userPattern = r'@\w+'
comments_df['textCleaned'] = comments_df['textCleaned'].str.replace(userPattern, '', regex=True)

numberPattern = r'\b\d{10,}\b'
comments_df['textCleaned'] = comments_df['textCleaned'].str.replace(numberPattern, '', regex=True)

punctuationPattern = r'[^\w\s]'
comments_df['textCleaned'] = comments_df['textCleaned'].str.replace(punctuationPattern, '', regex=True)

## Stop Word Removal

In [12]:
comments_df['textCleaned'] = comments_df['textCleaned'].str.lower()

from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

def removeStopwords(text):
    words = text.split()
    filteredWords = [word for word in words if word not in stopWords]
    return ' '.join(filteredWords)

comments_df['textCleaned'] = comments_df['textCleaned'].apply(removeStopwords)

In [13]:
comments_df = comments_df.dropna(subset=['textCleaned'])

# Comment Dataset Model Training

## Language Detection
- remove non english comments

In [14]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=c7ae176c8a4408328bf571a50f1648b4025bad9e31b9de655357ffef8ba357b2
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Note: you may need to restart the kernel to use updated packages.


In [15]:
from langdetect import detect, LangDetectException

def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

initial_count = len(comments_df)
print(f"Initial number of comments: {initial_count}")

english_mask = comments_df['textOriginal'].apply(is_english)
comments_df = comments_df[english_mask]

final_count = len(comments_df)
print(f"Number of English comments found: {final_count}")
print(f"Number of non-English comments dropped: {initial_count - final_count}")


Initial number of comments: 472483
Number of English comments found: 246486
Number of non-English comments dropped: 225997


### Merge Comments & Videos CSV

In [16]:
df_merged = pd.merge(comments_df, videos_df, on='videoId', how='left')

df_merged['title'] = df_merged['title'].fillna('')
df_merged['description'] = df_merged['description'].fillna('')
df_merged['tags'] = df_merged['tags'].fillna('')
df_merged['textCleaned'] = df_merged['textCleaned'].fillna('')

df_merged['video_text'] = df_merged['title'] + ' ' + df_merged['description'] + ' ' + df_merged['tags']

## Relevancy Score

In [18]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

video_embeddings = model.encode(df_merged['video_text'].tolist(), convert_to_tensor=True, show_progress_bar=True)
comment_embeddings = model.encode(df_merged['textCleaned'].tolist(), convert_to_tensor=True, show_progress_bar=True)


Batches:   0%|          | 0/7703 [00:00<?, ?it/s]

Batches:   0%|          | 0/7703 [00:00<?, ?it/s]

In [19]:
import torch

batch_size = 256
all_relevancy_scores = []

for i in range(0, len(comment_embeddings), batch_size):
    comment_batch = comment_embeddings[i:i + batch_size]
    video_batch = video_embeddings[i:i + batch_size] 
    cosine_scores_batch = util.cos_sim(comment_batch, video_batch)
    relevancy_scores_batch = cosine_scores_batch.diag()
    all_relevancy_scores.append(relevancy_scores_batch)

relevancyScore = torch.cat(all_relevancy_scores)
df_merged.loc[df_merged.index, 'relevancyScore'] = relevancyScore.cpu().numpy()

## Spam Detection

In [20]:
# Advertisement
# Website link count
df_merged['adCount'] = df_merged['textOriginal'].str.count(urlPattern)

# Phone or Bank account count
df_merged['adCount'] += df_merged['textOriginal'].str.count(numberPattern)

# Repetition (same author id AND same text) OR (same text)
df_merged['isRepetition'] = df_merged.duplicated(subset=['authorId', 'textOriginal'], keep='first') | df_merged.duplicated(subset=['textOriginal'], keep='first')

# Relevancy Score 
relevancy_threshold = 0.2

# Spam Detection
df_merged['spam'] = ((df_merged['adCount'] > 0) | (df_merged['isRepetition'] == True)).astype(int) | (df_merged['relevancyScore'] <= relevancy_threshold)

spam_counts = df_merged['spam'].value_counts()
print(f"Number of non-spam comments: {spam_counts.get(False, 0)}")
print(f"Number of spam comments:     {spam_counts.get(True, 0)}")

Number of non-spam comments: 136377
Number of spam comments:     110109


## Categorisation

In [None]:
from transformers import pipeline
import pandas as pd

# Assume 'df_merged' is your DataFrame with 'spam' and 'textCleaned' columns

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
comment_categories = ['Product Feedback', 'Brand Sentiment', 'Customer Inquiry', 'User Engagement']

# Create a clean copy to avoid warnings
df_non_spam = df_merged[df_merged['spam'] == False].copy()

# Create a mask to find comments that have actual text
valid_text_mask = df_non_spam['textCleaned'].str.strip() != ''

# Select only the non-empty comments to classify
comments_to_classify = df_non_spam.loc[valid_text_mask, 'textCleaned'].tolist()

# Initialize the new column with a default placeholder
df_non_spam['commentCategory'] = 'Uncategorized'

# Run the classifier only on the valid comments
if comments_to_classify:
    results = classifier(comments_to_classify, comment_categories, batch_size=32)
    classified_categories = [result['labels'][0] for result in results]
    
    # Place results back into the correct rows using the mask
    df_non_spam.loc[valid_text_mask, 'commentCategory'] = classified_categories

# Display a preview of the results
print(df_non_spam[['spam', 'textCleaned', 'commentCategory']].head())

Device set to use cuda:0


## Sentiment Analysis

In [25]:
sentiment_pipe = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

comments_to_analyze = df_non_spam['textCleaned'].tolist()

if comments_to_analyze:
    print(f"Analyzing sentiment for {len(comments_to_analyze)} comments...")
    results = sentiment_pipe(comments_to_analyze, batch_size=64, truncation=True)

    df_non_spam['sentiment_label'] = [result['label'] for result in results]
    df_non_spam['sentiment_score'] = [result['score'] for result in results]

else:
    print("No comments to analyze.")

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


Analyzing sentiment for 138956 comments...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_non_spam['sentiment_label'] = [result['label'] for result in results]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_non_spam['sentiment_score'] = [result['score'] for result in results]


In [None]:
df_non_spam.to_csv('final_comments.csv', index=False, encoding='utf-8-sig')