In [1]:
import nltk
# Fetch the NLTK 'stopwords' corpus; when it is already cached locally the
# call only reports that the package is up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sankalp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

# Read the raw comment export into a DataFrame
df = pd.read_csv("youtube_song_comments_data_1.csv")

# Build the tokenizer, stemmer and English stop-word set used during cleaning
tokenizer, stemmer = TweetTokenizer(), PorterStemmer()
stop_words = {*stopwords.words('english')}

# Helper function
def process_text(text):
    """Normalise one raw comment into a space-joined string of stemmed tokens.

    Steps: lowercase, drop URLs, turn every non-letter into a space, tokenize
    with the module-level ``tokenizer``, discard tokens found in
    ``stop_words``, and stem the survivors with ``stemmer``.

    Args:
        text: The raw comment. Any value that is not a ``str`` (presumably a
            missing cell in the loaded table) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` for non-string
        input or when no token survives filtering.
    """
    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()

    # Remove URLs. The scheme / "www." prefix must begin at a word boundary so
    # that ordinary words which merely contain "www" or "http" (for example
    # "awwww") are not partially eaten.
    text = re.sub(r'\b(?:https?://|www\.)\S+', ' ', text)

    # Replace special characters and numbers with a space rather than nothing,
    # so words separated only by punctuation ("great!!!song", "don't") are
    # split apart instead of being fused into a single bogus token.
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Tokenize
    tokens = tokenizer.tokenize(text)

    # Remove stopwords and stem
    clean_tokens = [
        stemmer.stem(word)
        for word in tokens
        if word not in stop_words and word.isalpha()
    ]

    return " ".join(clean_tokens)

# Clean every comment and store the result next to the original text
raw_comments = df['comment']
df['processed_comment'] = raw_comments.apply(process_text)

# Write the augmented table to disk without the row index
df.to_csv("youtube_comments_processed.csv", index=False)

print("Preprocessing complete. File saved as 'youtube_comments_processed.csv'")


Preprocessing complete. File saved as 'youtube_comments_processed.csv'


In [4]:
import pandas as pd
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk import download

# Download required resources
download('stopwords')
download('wordnet')
# The POS tagger model is published under two names: NLTK releases before 3.9
# load 'averaged_perceptron_tagger', while 3.9+ look for
# 'averaged_perceptron_tagger_eng'. Request both so pos_tag() works on a fresh
# kernel regardless of the installed NLTK version; a package that is already
# present is simply reported as up to date.
download('averaged_perceptron_tagger')
download('averaged_perceptron_tagger_eng')

# Read the raw comment export into a DataFrame
df = pd.read_csv("youtube_song_comments_data_1.csv")

# Build the tokenizer, lemmatizer and English stop-word set used during cleaning
tokenizer, lemmatizer = TweetTokenizer(), WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

# Helper function to map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    """Return the WordNet POS constant matching the first letter of ``tag``.

    Tags starting with 'J', 'V' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB`` and ``wordnet.ADV`` respectively. Tags starting with 'N'
    and every other tag (including the empty string) map to ``wordnet.NOUN``.

    Args:
        tag: POS tag string as produced by ``pos_tag``.

    Returns:
        One of the ``wordnet`` POS constants.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if tag.startswith(prefix):
            return wn_pos
    # Noun tags and anything unrecognised share the same fallback.
    return wordnet.NOUN

# Lemmatization helper function
def process_text(text):
    """Normalise one raw comment into a space-joined string of lemmas.

    Steps: lowercase, drop URLs, turn every non-letter into a space, tokenize
    with the module-level ``tokenizer``, POS-tag the full token sequence, then
    discard tokens found in ``stop_words`` and lemmatize the survivors using
    the WordNet POS returned by ``get_wordnet_pos``.

    Args:
        text: The raw comment. Any value that is not a ``str`` (presumably a
            missing cell in the loaded table) is treated as empty.

    Returns:
        The lemmas joined by single spaces; ``""`` for non-string input or
        when no token survives filtering.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Remove URLs. The scheme / "www." prefix must begin at a word boundary so
    # that ordinary words which merely contain "www" or "http" (for example
    # "awwww") are not partially eaten.
    text = re.sub(r'\b(?:https?://|www\.)\S+', ' ', text)

    # Replace non-letters with a space rather than nothing, so words separated
    # only by punctuation ("great!!!song", "don't") are split apart instead of
    # being fused into a single bogus token.
    text = re.sub(r'[^a-z\s]', ' ', text)

    tokens = tokenizer.tokenize(text)
    # Tag before filtering so the tagger sees stop words as context.
    tagged_tokens = pos_tag(tokens)

    # Remove stopwords and lemmatize
    clean_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in tagged_tokens
        if word not in stop_words and word.isalpha()
    ]

    return " ".join(clean_tokens)

# Clean every comment and store the result next to the original text
raw_comments = df['comment']
df['processed_comment'] = raw_comments.apply(process_text)

# Write the augmented table to disk without the row index
df.to_csv("youtube_comments_processed.csv", index=False)

print("Lemmatization complete. File saved as 'youtube_comments_processed.csv'")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sankalp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sankalp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sankalp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Lemmatization complete. File saved as 'youtube_comments_processed.csv'
