In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime
import pytz
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import re
import pickle

In [2]:
#load Dataset
# data_frame = pd.read_csv('tweets_data.csv')
data_frame = pd.read_csv('Dataset/tweets_data.csv')
# data_frame = data_frame[:5000]
data_frame.head()

Unnamed: 0,created_at,user_id,username,tweet,replies_count,retweets_count,likes_count,retweet,polarity
0,2020-11-18 23:17:14 SE Asia Standard Time,1.17e+18,njmyg_,@sunnova1324 @tanyainrl iya sih.. tapi maksud ...,0,0,0,False,positive
1,2020-11-18 23:07:06 SE Asia Standard Time,1.31e+18,urbbyyyyyyyy,males kuliah online temennya sikit,0,0,0,False,negative
2,2020-11-18 23:05:21 SE Asia Standard Time,1.33e+18,risyaanggun,tumbenan td kuliah online dosennya minta join ...,0,0,0,False,negative
3,2020-11-18 22:58:58 SE Asia Standard Time,1.24e+18,nyctophilexxx,@monsouleil nangis krn kecapean kuliah online ...,1,0,0,False,negative
4,2020-11-18 22:50:53 SE Asia Standard Time,1.09e+18,anisanwl,Apa hanya aku yang merasa semenjak kuliah onli...,0,0,1,False,negative


In [3]:
data_frame.shape

(25000, 9)

# Data Preprocessing

In [4]:
data_frame['polarity'].replace(('neutral', 'positive', 'negative'), (0, 1, 2), inplace=True)
data_frame['polarity'].value_counts()

2.0    12733
1.0     8441
0.0     2525
Name: polarity, dtype: int64

In [5]:
data = data_frame['tweet'].values.tolist()
label = data_frame['polarity'].values.tolist()

# Split Data train dan Data test

In [6]:
train_X, test_X, y_train, y_test = train_test_split(data, label, test_size=0.2, shuffle=True)

print(f'Jumlah data training: {len(train_X)}')
print(f'Jumlah data testing: {len(test_X)}')

Jumlah data training: 20000
Jumlah data testing: 5000


# Text Processing

In [7]:
def cleaningText(text):
    text_clean = re.sub(r'@[A-Za-z0-9]+', '', str(text)) # Hapus Mention
    text_clean = re.sub(r'#[A-Za-z0-9]+', '', text_clean) # Hapus hashtag
    text_clean = re.sub(r'RT[\s]', '', text_clean) # Hapus RT
    text_clean = re.sub(r"http\S+", '', text_clean) # Hapus link
    text_clean = re.sub(r'[0-9]+', '', text_clean) # Hapus angka
    text_clean = text_clean.replace('\n', ' ') # Ganti enter ke spasi
    text_clean = text_clean.translate(str.maketrans('', '', string.punctuation)) # Hapus tanda baca
    text_clean = text_clean.strip(' ') # Hapus spasi tdk jelas
    return text_clean

def casefoldingText(text):
    lwr = text
    map(str.lower, lwr)
    text = lwr
    return text

tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
def tokenizingText(text): 
    text = tokenizer.tokenize(text)                  
    return text

def filteringText(text):
    listStopwords = set(stopwords.words('indonesian'))
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered 
    return text

def stemmingText(text): 
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = [stemmer.stem(word) for word in text]
    return text


In [17]:
def process_tweet(tweet):
    tweet_Cleaning = cleaningText(tweet)
    tweet_CaseFolding = casefoldingText(tweet_Cleaning)
    tweet_Tokenizing = tokenizingText(tweet_CaseFolding)
    tweet_Filtering = filteringText(tweet_Tokenizing)
#     tweet_Stemming = stemmingText(tweet_Filtering)    
    return tweet_Filtering

In [18]:

time_jkt = pytz.timezone('Asia/Jakarta')
print("Mulai:", datetime.now(time_jkt).strftime("%H:%M:%S"))

#preprocess data train
preprocessed_text = []
for i in range(0, len(train_X)):
    preprocessed_text.append(process_tweet(train_X[i]))

X_train = preprocessed_text

# Pkl_Filename = "X_train.pkl"  

# with open(Pkl_Filename, 'wb') as file:  
#     pickle.dump(X_train, file)


#preprocess data test
preprocessed_text = []
for i in range(0, len(test_X)):
    preprocessed_text.append(process_tweet(test_X[i]))

X_test = preprocessed_text

# Pkl_Filename = "y_train.pkl"  

# with open(Pkl_Filename, 'wb') as file:  
#     pickle.dump(y_train, file)
print("Selesai:", datetime.now(time_jkt).strftime("%H:%M:%S"))

Mulai: 20:52:22
Selesai: 20:52:34


# Create Dictionary

In [21]:
def createDictionary(data):
  dictionary = dict()
  for sampel in  data:
    for token in sampel:
      dictionary[token] = dictionary.get(token, 0) + 1
  #sorting dictionary berdasarkan nilainya
  daftar_dict = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
  return dict(daftar_dict)

In [22]:
bow = createDictionary(X_train)

print("Token teratas pada Dictionary:\n")
list(bow.items())[:10]

Token teratas pada Dictionary:



[('kuliah', 22299),
 ('online', 21185),
 ('😭', 3775),
 ('ga', 3229),
 ('yg', 2638),
 ('ya', 2529),
 ('aja', 2250),
 ('kalo', 2028),
 ('tugas', 1864),
 ('udah', 1841)]

In [23]:
print(len(bow))

24665


# NAIVE BAYES CLASIFIER

In [48]:
#Navie Bayes Classifier 
class NBClassifier:

    def __init__(self, X_train, y_train, size):  
      self.X_train = X_train
      self.y_train = y_train
      self.size = size

    def createDictionary(self):
      dictionary = dict()
    
      for sampel in  X_train:
        for token in sampel:
          dictionary[token] = dictionary.get(token, 0) + 1
      daftar_dict = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
      return dict(daftar_dict)
    
    def train(self):
      X_train_dict = self.createDictionary()
      if self.size == 'full':
        self.daftar_kata = list(X_train_dict.keys())
        self.jumlah_kata = dict.fromkeys(self.daftar_kata, None)
        
      else:
        self.daftar_kata = list(X_train_dict.keys())[:int(self.size)]
        self.jumlah_kata = dict.fromkeys(self.daftar_kata, None)
      
      train = pd.DataFrame(columns = ['X_train', 'y_train'])
      train['X_train'] = X_train
      train['y_train'] = y_train

      train_0 = train.copy()[train['y_train'] == 0]
      train_1 = train.copy()[train['y_train'] == 1]
      train_2 = train.copy()[train['y_train'] == 2]

      Prior_0 = train_0.shape[0]/train.shape[0]
      Prior_1 = train_1.shape[0]/train.shape[0]
      Prior_2 = train_2.shape[0]/train.shape[0]
      
      self.Prior = np.array([Prior_0, Prior_1, Prior_2])
        
      def flat(listOfList):
        jadi = []
        for elemen in listOfList:
          jadi.extend(elemen)
        return jadi
  
      X_train_0 = flat(train[train['y_train'] == 0]['X_train'].tolist())
      X_train_1 = flat(train[train['y_train'] == 1]['X_train'].tolist())
      X_train_2 = flat(train[train['y_train'] == 2]['X_train'].tolist())

      self.X_train_len = np.array([len(X_train_0), len(X_train_1), len(X_train_2)])

      for token in self.daftar_kata:
        res = []
        res.insert(0, X_train_0.count(token))
        res.insert(1, X_train_1.count(token))
        res.insert(2, X_train_2.count(token))
        self.jumlah_kata[token] = res
      return self

    def predict(self, X_test):     
      pred = []
      
      for sampel in X_test:
            
        mulai = np.array([1,1,1])
        
        for tokens in sampel:
          jumlah_vocab = len(self.daftar_kata)
          if tokens in self.daftar_kata:
            prob = ((np.array(self.jumlah_kata[tokens])+1) / (self.X_train_len + jumlah_vocab))
          else:
            prob = ((np.array([0,0,0])+1) / (self.X_train_len + jumlah_vocab))
          mulai = mulai * prob
        print(mulai)
        pos = mulai * self.Prior
        pred.append(np.argmax(pos))
      return pred
    
    def score(self, pred, labels):
      correct = (np.array(pred) == np.array(labels)).sum()
      accuracy = correct/len(pred)
      return correct, accuracy

In [50]:
#training classifier     
nb = NBClassifier(X_train, y_train, 5)  
nb.train()

<__main__.NBClassifier at 0x255fe8123a0>

In [51]:
nb.predict(X_test)

[4.33528991e-29 1.00430953e-32 1.03549641e-33]


TypeError: insert expected 2 arguments, got 1