In [None]:
# 1. Data Loading and Initial Setup

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from IPython.display import display

# Read the raw tweet export; every later step works off this DataFrame.
data = pd.read_csv("tweets-data.csv")

# Confirm the load, then render the top five rows (head() defaults to 5)
# as a formatted table.
print("Loaded data:")
display(data.head())

# Stopword vocabulary: NLTK's Indonesian and English lists plus hand-picked
# noise tokens (common function words, markup fragments, URL leftovers,
# stray symbols).
stop_words_indo_nltk = stopwords.words('indonesian')
stop_words_eng_nltk = stopwords.words('english')
additional_stopwords = [
    'yang', 'di', 'ke', 'dari', 'ini', 'itu', 'pada', 'untuk', 'dan',
    'dengan', 'adalah', 'saya', 'kamu', 'dia', 'kita', 'mereka', 'akan',
    'atau', 'seperti', 'FFFF00', 't co', 'FFFF00 ', 'https', 'segyongstar',
    'ipi', 'ye', 'ha', 'a', 't', 'co', 'i', 'font', 'fontcolor',
    'fontcolor=', 'mkkkkkkkkkkk', '=', '#', '"', 'FFFF00', 'ffff',
]

# Merge the three sources through a set to drop duplicates; the final
# container is still a list.
stop_words_id = list(
    {*stop_words_indo_nltk, *stop_words_eng_nltk, *additional_stopwords}
)

# Build the Sastrawi stemmer through its factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
# 2. Text Cleaning and Stemming

# Pull out the tweet body column; this Series is what gets cleaned below.
text = data["full_text"]

print("Selected text column:")

# Lift pandas' display limits so previews show every row and the full
# content of each cell.
pd.set_option("display.max_rows", None)
pd.set_option('display.max_colwidth', None)

# Wrap the Series in a single-column DataFrame named "full_text" and
# preview its first 10 rows.
text_df = text.to_frame(name="full_text")
display(text_df.head(10))

# Function to clean and stem the text data
# Patterns are compiled once because the function is applied to every row.
_URL_PATTERN = re.compile(r'https?://\S+')
_UNICODE_ESCAPE_PATTERN = re.compile(r'\\u[0-9a-f]{4}')
_NON_LETTER_PATTERN = re.compile(r'[^a-z]+')


def clean_and_stem_text(text):
    """Normalize one tweet into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, drop URLs, drop literal ``\\uXXXX`` escape
    sequences, replace every run of non-letter characters with one space,
    tokenize with NLTK's ``word_tokenize``, remove tokens found in
    ``stop_words_id``, and stem what remains with the Sastrawi ``stemmer``.

    Args:
        text: The raw tweet text. Any non-string value (for example the
            float NaN that pandas uses for a missing cell) is treated as
            empty.

    Returns:
        The cleaned, stemmed tokens joined by single spaces; ``""`` when
        nothing survives cleaning or the input is not a string.
    """
    # Missing cells arrive as non-strings and would crash on .lower().
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # URLs and escape sequences must be removed BEFORE punctuation is
    # stripped: once ':', '/' and '\' are gone these patterns can no longer
    # match and their fragments would leak through as junk tokens.
    text = _URL_PATTERN.sub(' ', text)
    text = _UNICODE_ESCAPE_PATTERN.sub(' ', text)

    # One pass turns digits, punctuation, non-ASCII characters and any kind
    # of whitespace (tabs, newlines, non-breaking spaces) into a single
    # separator, leaving only ASCII letters and spaces.
    text = _NON_LETTER_PATTERN.sub(' ', text).strip()

    # Set lookup is O(1) per token; it is rebuilt on each call so later
    # changes to stop_words_id are still honored.
    stop_words = set(stop_words_id)

    tokens = word_tokenize(text)
    stemmed_tokens = [
        stemmer.stem(word) for word in tokens if word not in stop_words
    ]
    return " ".join(stemmed_tokens)
                                                          
# Run every tweet through the cleaning/stemming function.
text = text.apply(clean_and_stem_text)

# Keep previews untruncated: no row limit and full cell contents.
pd.set_option("display.max_rows", None)
pd.set_option('display.max_colwidth', None)

# Collect the processed text in a DataFrame with one 'cleaned_text' column.
cleaned_data = text.to_frame(name='cleaned_text')

print("Cleaned and stemmed text data:")
display(cleaned_data.head(20))

# Write the processed text to disk, leaving out the index column.
cleaned_data.to_csv("cleaned_data.csv", index=False)