# TEXT PRE-PROCESSING

## Import Library and Dataset

In [1]:
import pandas as pd
import re
import nltk
from string import punctuation
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stop_words = stopwords.words('Indonesian')
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [2]:
#import dataset
data = pd.read_csv('Twitter_Emotion_Dataset.csv')

In [3]:
#i only use 100 data
#to make it easier to undertand
data = data.head(100)

In [4]:
data.head()

Unnamed: 0,label,tweet
0,anger,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu..."
1,anger,"Sesama cewe lho (kayaknya), harusnya bisa lebi..."
2,happy,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...
3,anger,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng..."
4,happy,"Sharing pengalaman aja, kemarin jam 18.00 bata..."


##Tahap Pre-Processing

1. convert to lowercase
2. removing whitespace

## Convert to lowecase

In [5]:
data['tweet'] = [x.lower() for x in data['tweet']]

In [6]:
data.head()

Unnamed: 0,label,tweet
0,anger,"soal jln jatibaru,polisi tdk bs gertak gubernu..."
1,anger,"sesama cewe lho (kayaknya), harusnya bisa lebi..."
2,happy,kepingin gudeg mbarek bu hj. amad foto dari go...
3,anger,"jln jatibaru,bagian dari wilayah tn abang.peng..."
4,happy,"sharing pengalaman aja, kemarin jam 18.00 bata..."


## Removing Whitespace

In [7]:
data['tweet'] = [x.strip() for x in data['tweet']]

In [8]:
data.head()

Unnamed: 0,label,tweet
0,anger,"soal jln jatibaru,polisi tdk bs gertak gubernu..."
1,anger,"sesama cewe lho (kayaknya), harusnya bisa lebi..."
2,happy,kepingin gudeg mbarek bu hj. amad foto dari go...
3,anger,"jln jatibaru,bagian dari wilayah tn abang.peng..."
4,happy,"sharing pengalaman aja, kemarin jam 18.00 bata..."


## Removing Punctuation

In [9]:
hapus = str.maketrans('', '', punctuation)
non_punc = []

for stc in data['tweet']:
    kata = [x.translate(hapus) for x in stc]
    
    non_punc.append(kata)

In [10]:
data['tweet'] = non_punc
data['tweet'] = [''.join(x) for x in data['tweet']]
data['tweet']

0     soal jln jatibarupolisi tdk bs gertak gubernur...
1     sesama cewe lho kayaknya harusnya bisa lebih r...
2     kepingin gudeg mbarek bu hj amad foto dari goo...
3     jln jatibarubagian dari wilayah tn abangpengat...
4     sharing pengalaman aja kemarin jam 1800 batali...
                            ...                        
95    mudah2an sudah terupload smua sebelum z mudik ...
96    orang pendukung khilafah memang harus di black...
97    jangan sok akrab ye mention mention gue  malin...
98    alhamdulillah prof setelah berbicara semalam d...
99    keren kirakira masih ada nggak yg bilang pak u...
Name: tweet, Length: 100, dtype: object

## Remove Stopwords 

In [11]:
non_stopwords = []

for stc in data['tweet']:
    kata = [x for x in stc if x not in stop_words]
    
    non_stopwords.append(kata)

In [12]:
data['tweet'] = non_stopwords
data['tweet'] = [''.join(x) for x in data['tweet']]
data['tweet']

0     soal jln jatibarupolisi tdk bs gertak gubernur...
1     sesama cewe lho kayaknya harusnya bisa lebih r...
2     kepingin gudeg mbarek bu hj amad foto dari goo...
3     jln jatibarubagian dari wilayah tn abangpengat...
4     sharing pengalaman aja kemarin jam 1800 batali...
                            ...                        
95    mudah2an sudah terupload smua sebelum z mudik ...
96    orang pendukung khilafah memang harus di black...
97    jangan sok akrab ye mention mention gue  malin...
98    alhamdulillah prof setelah berbicara semalam d...
99    keren kirakira masih ada nggak yg bilang pak u...
Name: tweet, Length: 100, dtype: object

## Remove or Convert Number into Text

In [13]:
non_num = []

for stc in data['tweet']:
    kata = [x for x in stc if not x.isdigit()]
    
    non_num.append(kata)

In [14]:
data['tweet'] = non_num
data['tweet'] = [''.join(x) for x in data['tweet']]
data['tweet']

0     soal jln jatibarupolisi tdk bs gertak gubernur...
1     sesama cewe lho kayaknya harusnya bisa lebih r...
2     kepingin gudeg mbarek bu hj amad foto dari goo...
3     jln jatibarubagian dari wilayah tn abangpengat...
4     sharing pengalaman aja kemarin jam  batalin ti...
                            ...                        
95    mudahan sudah terupload smua sebelum z mudik k...
96    orang pendukung khilafah memang harus di black...
97    jangan sok akrab ye mention mention gue  malin...
98    alhamdulillah prof setelah berbicara semalam d...
99    keren kirakira masih ada nggak yg bilang pak u...
Name: tweet, Length: 100, dtype: object

In [15]:
#data yang sudah clean

data

Unnamed: 0,label,tweet
0,anger,soal jln jatibarupolisi tdk bs gertak gubernur...
1,anger,sesama cewe lho kayaknya harusnya bisa lebih r...
2,happy,kepingin gudeg mbarek bu hj amad foto dari goo...
3,anger,jln jatibarubagian dari wilayah tn abangpengat...
4,happy,sharing pengalaman aja kemarin jam batalin ti...
...,...,...
95,fear,mudahan sudah terupload smua sebelum z mudik k...
96,anger,orang pendukung khilafah memang harus di black...
97,anger,jangan sok akrab ye mention mention gue malin...
98,happy,alhamdulillah prof setelah berbicara semalam d...


# Feature Extraction

1. Using Bag Of Words
2. Using Tf-idf

### Using BOW 

In [16]:
bow = CountVectorizer()

In [17]:
X_sentence = bow.fit_transform(data['tweet'])

In [18]:
bow.get_feature_names()

['aalamiin',
 'aamiin',
 'abang',
 'abangpengaturan',
 'abis',
 'abng',
 'acara',
 'action',
 'ada',
 'adalah',
 'adanya',
 'adem',
 'adzan',
 'against',
 'agama',
 'agar',
 'air',
 'aja',
 'aje',
 'akal',
 'akan',
 'akhir',
 'akhirnya',
 'akrab',
 'aku',
 'akun',
 'akutu',
 'alam',
 'alamat',
 'alazhar',
 'alhamdulillah',
 'alhmdllh',
 'alihkan',
 'allah',
 'alm',
 'alur',
 'amad',
 'aman',
 'amatiran',
 'amn',
 'amp',
 'ampe',
 'an',
 'anak',
 'anakanak',
 'anda',
 'aneh',
 'anies',
 'aniespernah',
 'aniessalah',
 'anjing',
 'anniversary',
 'ansor',
 'antara',
 'anteng',
 'antiislam',
 'antrian',
 'apa',
 'apakah',
 'apakayaknya',
 'apalagi',
 'apanya',
 'apapun',
 'apbd',
 'arah',
 'areal',
 'argumen',
 'asal',
 'asing',
 'atau',
 'atulah',
 'awal',
 'ayuting',
 'baca',
 'bacain',
 'bacanya',
 'badak',
 'bae',
 'bagi',
 'bagus',
 'bahagia',
 'bahas',
 'bahaya',
 'bahayanya',
 'bahwa',
 'baik',
 'baiknya',
 'bakal',
 'bakalnya',
 'bales',
 'balik',
 'bandit',
 'banget',
 'bangun',
 '

In [19]:
X_sentence.toarray()

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 2, 0, 0]], dtype=int64)

In [20]:
X_sentence_df = pd.DataFrame(X_sentence.toarray(), columns=bow.get_feature_names())

In [21]:
X_sentence_df

Unnamed: 0,aalamiin,aamiin,abang,abangpengaturan,abis,abng,acara,action,ada,adalah,...,yah,yaitu,yang,yaow,yassalaaam,yatim,ye,yg,ygdisebarkan,youtube
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
98,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Using Tf-Idf

In [22]:
tfidf = TfidfVectorizer()

X2_sentence = tfidf.fit_transform(data['tweet'])

In [23]:
tfidf.get_feature_names()

['aalamiin',
 'aamiin',
 'abang',
 'abangpengaturan',
 'abis',
 'abng',
 'acara',
 'action',
 'ada',
 'adalah',
 'adanya',
 'adem',
 'adzan',
 'against',
 'agama',
 'agar',
 'air',
 'aja',
 'aje',
 'akal',
 'akan',
 'akhir',
 'akhirnya',
 'akrab',
 'aku',
 'akun',
 'akutu',
 'alam',
 'alamat',
 'alazhar',
 'alhamdulillah',
 'alhmdllh',
 'alihkan',
 'allah',
 'alm',
 'alur',
 'amad',
 'aman',
 'amatiran',
 'amn',
 'amp',
 'ampe',
 'an',
 'anak',
 'anakanak',
 'anda',
 'aneh',
 'anies',
 'aniespernah',
 'aniessalah',
 'anjing',
 'anniversary',
 'ansor',
 'antara',
 'anteng',
 'antiislam',
 'antrian',
 'apa',
 'apakah',
 'apakayaknya',
 'apalagi',
 'apanya',
 'apapun',
 'apbd',
 'arah',
 'areal',
 'argumen',
 'asal',
 'asing',
 'atau',
 'atulah',
 'awal',
 'ayuting',
 'baca',
 'bacain',
 'bacanya',
 'badak',
 'bae',
 'bagi',
 'bagus',
 'bahagia',
 'bahas',
 'bahaya',
 'bahayanya',
 'bahwa',
 'baik',
 'baiknya',
 'bakal',
 'bakalnya',
 'bales',
 'balik',
 'bandit',
 'banget',
 'bangun',
 '

In [24]:
X2_sentence.toarray()

array([[0.        , 0.        , 0.1903319 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.08471833, 0.        ,
        0.        ],
       [0.2727578 , 0.2727578 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.16166887, 0.        ,
        0.        ]])

In [25]:
X2_sentence_df = pd.DataFrame(X2_sentence.toarray(), columns=tfidf.get_feature_names())

In [26]:
X2_sentence_df

Unnamed: 0,aalamiin,aamiin,abang,abangpengaturan,abis,abng,acara,action,ada,adalah,...,yah,yaitu,yang,yaow,yassalaaam,yatim,ye,yg,ygdisebarkan,youtube
0,0.000000,0.000000,0.190332,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.000000,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.190294,0.0,0.0,0.0,0.000,0.000000,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.000000,0.0,0.0
3,0.000000,0.000000,0.000000,0.189991,0.0,0.189991,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.082967,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.105192,0.0,0.0,0.0,0.000,0.000000,0.0,0.0
96,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.000000,0.0,0.0
97,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.194,0.084718,0.0,0.0
98,0.272758,0.272758,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.000000,0.0,0.0
