In [4]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Ensure the stopwords, punkt and wordnet NLTK resources are downloaded
# (uncomment the lines below on the first run):
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

# Load the raw export and clean it in a single pass:
#   1. keep the first row for every tweet id,
#   2. drop rows that have no tweet text or no username,
#   3. keep only the first remaining row for every username.
data_new = (
    pd.read_csv('IKN-08.csv')
    .drop_duplicates(subset=['id_str'], keep='first')
    .dropna(subset=['full_text', 'username'])
    .drop_duplicates(subset=['username'], keep='first')
)

# Text preprocessing functions
# Lazily filled cache so the stopword corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalise one tweet into a space-separated string of lemmatized tokens.

    Steps, in order: lower-case the text, strip URLs / @mentions / #hashtags,
    replace every non-word character with a space, tokenize with
    ``word_tokenize``, drop English stopwords and lemmatize each remaining
    token with ``WordNetLemmatizer``.

    Args:
        text: The raw tweet text. Must be a ``str`` (the caller drops rows
            with a missing ``full_text`` before applying this function).

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives the filtering.

    NOTE(review): the sample rows shown later in this notebook do not look
    like English, while the stopword list used here is English -- confirm
    that this language choice is intended.
    """
    # Case folding
    text = text.lower()
    # Remove URLs, mentions, and hashtags
    text = re.sub(r'http\S+|www\S+|https\S+|@\S+|#\S+', '', text, flags=re.MULTILINE)
    # Remove punctuation and special characters (\W keeps letters, digits and '_')
    text = re.sub(r'\W', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords; the set is fetched once per call instead of re-reading
    # the corpus for every single token.
    stop_words = _english_stopwords()
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Clean every tweet and store the result next to the raw text
data_new['processed_text'] = data_new['full_text'].apply(preprocess_text)

# Score the cleaned text with VADER, keeping only the 'compound' value
analyzer = SentimentIntensityAnalyzer()


def _compound_score(document):
    """Return the 'compound' entry of VADER's polarity scores for one document."""
    return analyzer.polarity_scores(document)['compound']


data_new['Sentiment'] = data_new['processed_text'].apply(_compound_score)

# Assign sentiment labels
def sentiment_label(score):
    """Translate a numeric sentiment score into a class label.

    Args:
        score: Numeric sentiment score.

    Returns:
        'Positive' when ``score >= 0.05``, 'Negative' when ``score <= -0.05``,
        and 'Neutral' for everything else (including values for which both
        comparisons are false).
    """
    if score >= 0.05:
        return 'Positive'
    if score <= -0.05:
        return 'Negative'
    return 'Neutral'

# Map every compound score to its Positive / Negative / Neutral label
data_new['Sentiment_Label'] = data_new['Sentiment'].apply(sentiment_label)

# Only the last expression of a cell is rendered automatically, so a bare
# `data_new.head()` in the middle of the cell is silently discarded. Display
# the preview explicitly (`display` is provided by the IPython kernel) and
# keep the shape as the cell's result.
display(data_new.head())
data_new.shape

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bryant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bryant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Bryant\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


(1757, 18)

In [5]:
# Count how many rows received each sentiment label
label_column = data_new['Sentiment_Label']
sentiment_counts = label_column.value_counts()
sentiment_counts

Sentiment_Label
Neutral     1572
Positive     136
Negative      49
Name: count, dtype: int64

In [6]:
# Columns kept for the exported dataset.
# NOTE(review): the Sentiment / Sentiment_Label columns computed earlier are
# not part of this selection -- confirm that is intended.
export_columns = ['username', 'created_at', 'full_text']
final_data = data_new[export_columns]
final_data.head()

Unnamed: 0,username,created_at,full_text
0,aetherienll,Wed May 15 23:54:40 +0000 2024,@tanyarlfes WKWKWKWK MIMPI APA LU PADA sekaran...
1,kompasiana,Wed May 15 23:53:01 +0000 2024,Terjebak Macet selama Tiga Jam di Kawasan IKN ...
2,lokastiti,Wed May 15 23:52:43 +0000 2024,Jadi presidennya mau. Giliran disuruh pindah k...
3,Kanoysim,Wed May 15 23:52:30 +0000 2024,Kebutuhan primer harus di utamakan bahkan di u...
5,PartaiPonsel,Wed May 15 23:48:58 +0000 2024,@Andria75777 Mampus kelen...wkwkwkkw . Makan t...


In [7]:
# Write the selected columns to disk without the DataFrame index column
output_path = 'Final-IKN.csv'
final_data.to_csv(output_path, index=False)