## **Install Dan Import Library**

In [None]:
!pip install textblob

In [None]:
!pip install nltk

In [None]:
import re
import string
import nltk
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob.classifiers import NaiveBayesClassifier

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

## **Menyimpan Dataset**

Source: https://github.com/dD2405/Twitter_Sentiment_Analysis

Label:
- 0 : Non Hatespeech
- 1 : Hatespeech

In [65]:
df = pd.read_csv('https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv')
df.drop('id', axis=1, inplace=True)
df.sample(5)

Unnamed: 0,label,tweet
11765,0,#ty for the #recent #follow @user @user to #...
6520,0,"check out this new trending #funny #gif ! , e..."
17341,0,everymanager: thesecret: be now. #feelgood ...
22664,0,@user we just gotta do what we gotta do...
7379,1,"@user no it doesn't, germans, even fascists ra..."


## **Preprocessing Data**

In [40]:
lemmatizer = nltk.stem.WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def tokenizing(text):
    # Mengubah setiap kata menjadi lowercase
    text =  text.lower()

    # Menghapus Link Dengan Pattern http/https dan www
    text = re.sub(r'http\S+', '', text)
    text = re.sub('(@\w+|#\w+)', '', text)

    # Menghapus Tag HTML
    text = re.sub('<.*?>', '', text)

    # Menghapus Tanda Baca Seperti Titik Dan Koma
    text = text.translate(str.maketrans(' ', ' ', string.punctuation))

    # Menghapus Karakter Selain Huruf a-z dan A-Z
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Mengganti baris baru (enter) dengan spasi
    text = re.sub("\n", " ", text)

    # Menghapus Karakter Berulang (Contoh: Horeeee!!!! menjadi Hore!)
    text = re.sub(r'(\w)(\1{2,})', r"\1", text)

    # Menghapus 1 Karakter Terpisah
    text = re.sub(r"\b[a-zA-Z]\b", "", text)

    # Menghapus Spasi Yang Lebih Dari Satu
    text = re.sub('(s{2,})', ' ', text)

    # Mengembalikan Hasil Tokenizing Text
    return text


# Memisahkan Kata Singkatan (Abbreviaton). Contoh won't -> will not 
def decontracted(text):
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)

    return text


# Lematisasi adalah teknik  untuk mereduksi kata menjadi Lemma atau bentuk dasar dari sebuah kata
# Contoh:  better -> good
def lemmatizing_text(text):
    final_text = ''
    words = word_tokenize(text)

    for w in words:
        final_text = final_text + ' ' + lemmatizer.lemmatize(w)

    return final_text
        

# Menghapus kata yang mengandung judul topik dan kata yang terdapat pada stopwords nltk 
def filtering_text(text):
    temp_text_split = []
    text_split = text.split(' ')

    for i in range(len(text_split)):
        if text_split[i] not in stop_words:
            temp_text_split.append(text_split[i])
    
    temp_text_split = list(set(temp_text_split))
    final_text = ' '.join(temp_text_split)
    
    return final_text.lstrip()

# Proses data cleansing setiap text
df['tweet'] = df['tweet'].apply(decontracted)
print("Done Decontracted")

df['tweet'] = df['tweet'].apply(tokenizing)
print("Done Tokenizing")

df['tweet'] = df['tweet'].apply(lemmatizing_text)
print("Done Lemmatizing")

df['tweet'] = df['tweet'].apply(filtering_text)
print("Done Filtering")

Done Decontracted
Done Tokenizing
Done Lemmatizing
Done Filtering


## **Menghapus Data Kosong**

In [41]:
df = df.replace(r'^\s*$', np.NaN, regex=True)
df.isna().sum()

label      0
tweet    560
dtype: int64

In [42]:
df.dropna(inplace=True)
df.isna().sum()

label    0
tweet    0
dtype: int64

## Split **Data Train Dan Test**

In [43]:
train, test = train_test_split(df, test_size=0.2)

In [44]:
first_column = train.pop('tweet')
train.insert(0, 'tweet', first_column)

df_train = train.to_records(index=False)
df_train = list(df_train)

df_train[50:55]

[('love know', 0),
 ('two owl', 0),
 ('relationship sometimes heritage crushed like feel', 0),
 ('seriously dead turning world nightclub injured shooting', 0),
 ('breakfast kitchen bihday lady corey', 0)]

## **Melatih Model**

In [45]:
cl = NaiveBayesClassifier(df_train)

In [46]:
first_column = test.pop('tweet')
test.insert(0, 'tweet', first_column)

df_test = test.to_records(index=False)
df_test = list(df_test)

df_test[50:55] 

[('stephen rest peace keshi', 0),
 ('challenge thankful', 0),
 ('follow thank twitfox', 0),
 ('culpable ww exists systemic dismantle deeply entrenched feel', 1),
 ('libtard might', 1)]

## **Evaluasi Model**

In [47]:
cl.accuracy(df_test)

0.9410921827734438

In [48]:
cl.classify('i am thankful for coffee')

0

In [51]:
cl.classify("culpable exists systemic dismantle deeply entrenched feel")

1