In [1]:
import pandas as pd
import re
import string
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Read the raw comment export, report its (rows, columns) shape, and preview the first rows
raw_comments_csv = "../data/raw/comments.csv"
comments_df = pd.read_csv(raw_comments_csv)
print(comments_df.shape)
comments_df.head()

(150, 4)


Unnamed: 0,author,text,like_count,published_at
0,@jrollthedj,we are now working on the new 2024 mix for eve...,6753,2024-01-08T23:30:51Z
1,@eliete3149,"Toda vez que ouço essa música, meu coração bat...",0,2025-03-31T21:14:18Z
2,@danieljohnson7020,2025 afrobeattt is now out hear the beats on e...,0,2025-03-31T19:37:59Z
3,@Vibesfirst,When we are happy that’s when we enjoy the son...,2,2025-03-31T14:56:10Z
4,@AyanshCharan-ui9mw,I'm here 2025 who else❤,5,2025-03-31T04:12:31Z


In [3]:
# Summary statistics for every column; include="all" also covers the non-numeric columns
comments_df.describe(include="all")


Unnamed: 0,author,text,like_count,published_at
count,150,147,150.0,150
unique,145,144,,150
top,@dawnchu5449,❤,,2024-01-08T23:30:51Z
freq,3,2,,1
mean,,,118.326667,
std,,,1036.720154,
min,,,0.0,
25%,,,0.0,
50%,,,0.0,
75%,,,1.0,


In [4]:
# Count the missing (null) values in each column
comments_df.isnull().sum()

author          0
text            3
like_count      0
published_at    0
dtype: int64

In [5]:
# Show each column's dtype and non-null count
comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   author        150 non-null    object
 1   text          147 non-null    object
 2   like_count    150 non-null    int64 
 3   published_at  150 non-null    object
dtypes: int64(1), object(3)
memory usage: 4.8+ KB


In [6]:
# Peek at five randomly chosen raw comments.
# No random_state is passed, so a different selection is drawn on every run.
comments_df["text"].sample(5).values


array(['❤', 'omg, afrobeat is best 💕💕🎶🎶', '❤❤❤❤❤❤❤❤',
       'Lestwins killed this song in 2023, with straight musicality, i love this song',
       'I F-IN LOVE YOU JESS!!! AND LAUREN!! 🤟🏿💯❤️🤗🤣💯'], dtype=object)

In [7]:
def clean_text(text):
    """
    Normalize a raw comment into lowercase ASCII text with single spaces.

    The steps run in this order: lowercase, strip HTML tags, strip URLs,
    drop every non-ASCII character (emojis, accented letters, curly quotes),
    drop ASCII punctuation, then collapse all whitespace runs into one space.

    Parameters
    ----------
    text : str or missing value
        Raw comment text. Missing values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    str or missing value
        The cleaned text, which may be the empty string (e.g. for an
        emoji-only comment). A missing input is returned unchanged.
    """
    # Pass missing comments through untouched so callers can still detect them
    if pd.isnull(text):
        return text

    text = text.lower()  # Convert to lowercase

    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)

    # Remove URLs. The dot after "www" is escaped and the match is anchored at
    # a word boundary; otherwise ordinary text that merely contains "www"
    # (e.g. "awww so cute") would be treated as a link and deleted.
    text = re.sub(r"https?://\S+|\bwww\.\S+", "", text)

    # Remove emojis and every other non-ASCII character
    text = re.sub(r"[^\x00-\x7F]+", "", text)

    # Remove ASCII punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Collapse newlines, carriage returns and repeated spaces into single
    # spaces (\s already matches \n and \r), then trim both ends
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [8]:
# Add a cleaned_text column by running clean_text on every raw comment
comments_df['cleaned_text'] = comments_df['text'].apply(clean_text)

In [9]:
# Spot-check five random rows, showing the raw text next to its cleaned form
comments_df[['text', 'cleaned_text']].sample(5)

Unnamed: 0,text,cleaned_text
0,we are now working on the new 2024 mix for eve...,we are now working on the new 2024 mix for eve...
62,H,h
41,good,good
22,“Who is here in 2025”\n\n\nEveryoneeeeeee,who is here in 2025 everyoneeeeeee
34,M,m


In [10]:
analyzer = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of VADER's polarity scores for one text."""
    return analyzer.polarity_scores(text)['compound']


# cleaned_text can hold missing values, so cast to str before scoring
cleaned_as_str = comments_df['cleaned_text'].astype(str)
comments_df['sentiment_score'] = cleaned_as_str.apply(_compound_score)

In [11]:
def label_sentiment(score):
    """
    Turn a numeric sentiment score into a categorical label.

    Returns "positive" for scores of 0.05 or more, "negative" for scores of
    -0.05 or less, and "neutral" for everything else.
    """
    if score >= 0.05:
        return "positive"
    return "negative" if score <= -0.05 else "neutral"

# Derive a categorical label from each comment's sentiment score
comments_df["sentiment_label"] = comments_df["sentiment_score"].apply(label_sentiment)


In [12]:
# Preview the frame with the added cleaned_text, sentiment_score and sentiment_label columns
comments_df.head()

Unnamed: 0,author,text,like_count,published_at,cleaned_text,sentiment_score,sentiment_label
0,@jrollthedj,we are now working on the new 2024 mix for eve...,6753,2024-01-08T23:30:51Z,we are now working on the new 2024 mix for eve...,0.3612,positive
1,@eliete3149,"Toda vez que ouço essa música, meu coração bat...",0,2025-03-31T21:14:18Z,toda vez que ouo essa msica meu corao bate mai...,0.0,neutral
2,@danieljohnson7020,2025 afrobeattt is now out hear the beats on e...,0,2025-03-31T19:37:59Z,2025 afrobeattt is now out hear the beats on e...,0.6269,positive
3,@Vibesfirst,When we are happy that’s when we enjoy the son...,2,2025-03-31T14:56:10Z,when we are happy thats when we enjoy the song...,-0.1779,negative
4,@AyanshCharan-ui9mw,I'm here 2025 who else❤,5,2025-03-31T04:12:31Z,im here 2025 who else,0.0,neutral


In [13]:
# Create the output folder if it is not there yet (no error when it already exists)
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)

# Write the scored comments without the DataFrame index, then confirm
processed_csv = "../data/processed/comments_clean.csv"
comments_df.to_csv(processed_csv, index=False)
print("Saved to data/processed/comments_clean.csv ✅")

Saved to data/processed/comments_clean.csv ✅
