### 1. Import Dependencies

In [13]:
import string
import re
import nltk
import pandas as pd
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem.porter import PorterStemmer
from textblob import TextBlob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yonti's\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yonti's\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
text = ('G0OgL3e!')

In [14]:
df = pd.read_csv('data/small.csv')
df

Unnamed: 0,text
0,"hari ini aku masih libur, masih ada waktu semi..."
1,yg ptm hari ini semangaattt
2,dom cilacap udh ptm 100% apa blm?
3,yg dom jabodetabek kalian ptm nya udah 100% kah?
4,"yg ptm yg genap seru bgt, yg ganjil kek nya ba..."
5,wtp hr ini gue libur tp BESOK PTM 100%


### 2. Preprocessing

In [17]:
df['lower_text']=df['text'].str.lower()  # Convert to lowercase
df['remove_url']=df['lower_text'].apply(lambda x: re.sub(r"http\S+", "", x))    # remove url
df['remove_num']=df['remove_url'].apply(lambda x: re.sub(r'\d+', '', x))    # Remove number
df['punctuation']=df['remove_num'].apply(lambda x: re.sub(r'[^\w\s]', '', x))    # Remove punctuation
df['tokenized_text']=df['punctuation'].apply(nltk.word_tokenize) # Tokenize the text

# Get the Indonesian stopwords
indonesian_stopwords = set(nltk.corpus.stopwords.words('indonesian'))

# Remove the stopwords from the tokenized texts
df['stopwords']=df['tokenized_text'].apply(lambda x: [w for w in x if not w in indonesian_stopwords])

# Initialize the Porter stemmer
stemmer = PorterStemmer()

# Stem the tokenized texts in the 'stopwords' column of the dataframe
df['stemmed']=df['stopwords'].apply(lambda x: [stemmer.stem(w) for w in x])

df['normalized']=df['stemmed'].apply(lambda x: ' '.join(x)) # Join the stemmed words into a single string

df.head()

Unnamed: 0,text,lower_text,remove_url,remove_num,punctuation,tokenized_text,stopwords,stemmed,normalized
0,"hari ini aku masih libur, masih ada waktu semi...","hari ini aku masih libur, masih ada waktu semi...","hari ini aku masih libur, masih ada waktu semi...","hari ini aku masih libur, masih ada waktu semi...",hari ini aku masih libur masih ada waktu semin...,"[hari, ini, aku, masih, libur, masih, ada, wak...","[libur, seminggu, rumah, manfaatin, libur, sem...","[libur, seminggu, rumah, manfaatin, libur, sem...",libur seminggu rumah manfaatin libur seminggu ...
1,yg ptm hari ini semangaattt,yg ptm hari ini semangaattt,yg ptm hari ini semangaattt,yg ptm hari ini semangaattt,yg ptm hari ini semangaattt,"[yg, ptm, hari, ini, semangaattt]","[yg, ptm, semangaattt]","[yg, ptm, semangaattt]",yg ptm semangaattt
2,dom cilacap udh ptm 100% apa blm?,dom cilacap udh ptm 100% apa blm?,dom cilacap udh ptm 100% apa blm?,dom cilacap udh ptm % apa blm?,dom cilacap udh ptm apa blm,"[dom, cilacap, udh, ptm, apa, blm]","[dom, cilacap, udh, ptm, blm]","[dom, cilacap, udh, ptm, blm]",dom cilacap udh ptm blm
3,yg dom jabodetabek kalian ptm nya udah 100% kah?,yg dom jabodetabek kalian ptm nya udah 100% kah?,yg dom jabodetabek kalian ptm nya udah 100% kah?,yg dom jabodetabek kalian ptm nya udah % kah?,yg dom jabodetabek kalian ptm nya udah kah,"[yg, dom, jabodetabek, kalian, ptm, nya, udah,...","[yg, dom, jabodetabek, ptm, nya, udah, kah]","[yg, dom, jabodetabek, ptm, nya, udah, kah]",yg dom jabodetabek ptm nya udah kah
4,"yg ptm yg genap seru bgt, yg ganjil kek nya ba...","yg ptm yg genap seru bgt, yg ganjil kek nya ba...","yg ptm yg genap seru bgt, yg ganjil kek nya ba...","yg ptm yg genap seru bgt, yg ganjil kek nya ba...",yg ptm yg genap seru bgt yg ganjil kek nya bak...,"[yg, ptm, yg, genap, seru, bgt, yg, ganjil, ke...","[yg, ptm, yg, genap, seru, bgt, yg, ganjil, ke...","[yg, ptm, yg, genap, seru, bgt, yg, ganjil, ke...",yg ptm yg genap seru bgt yg ganjil kek nya sep...


In [None]:
lower_text = text.lower()   # Convert text to lower

token_text = re.sub(r"\d+", "", lower_text)    # Remove number
token_text = re.sub('\s+',' ', token_text) # Remove multiple whitespace into single whitespace
token_text = ''.join(c for c in token_text if c not in string.punctuation)      # Remove punctuation
word_tokens = nltk.tokenize.word_tokenize(token_text)    # Tokenize the text
freq_tokens = nltk.FreqDist(word_tokens)    # Frequency word token

stop_words = [w for w in x if not w in indonesian_stopwords]   # Implement stopwords

stemmed_words = [stemmer.stem(w) for w in x]   # Stem the words

normalized_text = ''.join(stemmed_words)  # Join the words

In [19]:
print(f'Case Folding Result : {lower_text}')
print(f'Remove punctuation, number, multiple whitespace : {token_text}')
print('Tokenizing Result : ',word_tokens)
print('Frequency Token : ',freq_tokens.most_common())
print('Stopword : ',stop_words)
print('Stemmer : ',stemmed_words)
print('Normalized : ',normalized_text)

Case Folding Result : g0ogl3e!
Remove punctuation, number, multiple whitespace : gogle
Tokenizing Result :  ['gogle']
Frequency Token :  [('gogle', 1)]
Stopword :  ['gogle']
Stemmer :  ['gogl']
Normalized :  gogl


### 3. Sentiment Analysis and labelling

In [24]:
df['textblob'] = df['normalized'].apply(lambda x: TextBlob(x))

# Create a new column in the DataFrame with sentiment analysis scores
df['sentiment'] = df['textblob'].apply(lambda x: x.sentiment.polarity)

# Normalize the sentiment scores to range from -1 (negative) to 1 (positive)
df['sentiment'] = df['sentiment'].apply(lambda x: np.interp(x, [df['sentiment'].min(), df['sentiment'].max()], [-1, 1]))

print(df['sentiment'])

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
5    1.0
Name: sentiment, dtype: float64


In [23]:
analysis = TextBlob(normalized_text)

# Get sentiment polarity (-1 to 1: negative to positive)
sentiment = analysis.sentiment.polarity
if sentiment > 0: 
    print("Positive sentiment")
elif sentiment < 0: 
    print("Negative sentiment")
else: 
    print("Neutral sentiment")

Neutral sentiment
