In [1]:
import pandas as pd
import re, string, unicodedata
import nltk
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

df = pd.read_csv("@IndiHome.csv", names=['time','full_text','label'])
df.head()

Unnamed: 0,time,full_text,label
0,2021-01-02 13:25:25,b'@andrerizqonm @IndiHome @TelkomCare @KemenBU...,netral;;
1,2021-01-02 13:23:32,b'Niat mo kerja drmh. Buka laptop. Sambil nont...,negatif;;
2,2021-01-02 13:23:17,b'@IndiHome TOLONG DM DIJAWAB',netral;;
3,2021-01-02 13:22:41,b'Ini @IndiHome lg knpa ya. Knpa kayak keong \...,negatif;;
4,2021-01-02 13:20:51,b'@IndiHome Belum ada beb',netral;;


In [2]:
#lowercase
df['full_text'] = df['full_text'].str.lower()
df['full_text'].head()

0    b'@andrerizqonm @indihome @telkomcare @kemenbu...
1    b'niat mo kerja drmh. buka laptop. sambil nont...
2                       b'@indihome tolong dm dijawab'
3    b'ini @indihome lg knpa ya. knpa kayak keong \...
4                           b'@indihome belum ada beb'
Name: full_text, dtype: object

In [3]:
def remove_tweet_special(text):
    # remove tab, new line, ans back slice
    text = str(text).replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
df['full_text'] = df['full_text'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

df['full_text'] = df['full_text'].apply(remove_number)

In [4]:
#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

df['full_text'] = df['full_text'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

df['full_text'] = df['full_text'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

df['full_text'] = df['full_text'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

df['full_text'] = df['full_text'].apply(remove_singl_char)

# word tokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

df['full_text'] = df['full_text'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(df['full_text'].head())

Tokenizing Result : 

0              [segera, berganti, ke, myrepublic, pak]
1    [bniat, mo, kerja, drmh, buka, laptop, sambil,...
2                                [tolong, dm, dijawab]
3    [bini, lg, knpa, ya, knpa, kayak, keong, xfxfx...
4                                    [belum, ada, beb]
Name: full_text, dtype: object


In [5]:
#stopwords removal

# get stopword indonesia
list_stopwords = stopwords.words('indonesian')

# append additional stopword
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])

# read txt stopword using pandas
txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

# convert stopword string to list & append additional stopword
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# convert list to dictionary
list_stopwords = set(list_stopwords)

#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

df['full_text'] = df['full_text'].apply(stopwords_removal)

print(df['full_text'].head())

0                               [berganti, myrepublic]
1    [bniat, mo, kerja, drmh, buka, laptop, nonton,...
2                                         [tolong, dm]
3    [bini, lg, knpa, knpa, kayak, keong, xfxfxxadx...
4                                                [beb]
Name: full_text, dtype: object


In [6]:
#lemmatization

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

df['full_text'] = df['full_text'].apply(lemmatize_verbs)

print(df['full_text'].head())

0                               [berganti, myrepublic]
1    [bniat, mo, kerja, drmh, buka, laptop, nonton,...
2                                         [tolong, dm]
3    [bini, lg, knpa, knpa, kayak, keong, xfxfxxadx...
4                                                [beb]
Name: full_text, dtype: object


In [None]:
#save to csv
df.to_csv("dataset2.csv")

In [None]:
#Split the data into 80% training (X_train & y_train) and 20% testing (X_test & y_test) data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, df['label'], test_size = 0.20, random_state = 0)

# Get the shape of messages_bow
# messages_bow.shape
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

from sklearn.feature_extraction.text import TfidfTransformer
X_train_tf = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

NBNaiveBayes = MultinomialNB()
NBNaiveBayes.fit(X_train_tfidf, y_train)
print(NBNaiveBayes)

In [None]:
X_test_tf = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)

predicted = NBNaiveBayes.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, predicted))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predicted, target_names=['negatif','netral','positif']))