# Import Module

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import os

# Load the data

In [3]:
import os

folder_path = r"D:\Code\py_code\Text-Processing\data\twitter_data"
file_names = os.listdir(folder_path)

print(file_names)

['pajak-1.csv', 'pajak-aufa.csv', 'pajak-evi1.csv', 'pajak-evi2.csv', 'pajak.csv']


In [4]:
def load_data(folder_path, file_name):
    data = pd.read_csv(os.path.join(folder_path, file_name))
    return data

In [5]:
data1 = load_data(folder_path, file_names[0])
data2 = load_data(folder_path, file_names[1])
data3 = load_data(folder_path, file_names[2])
data4 = load_data(folder_path, file_names[3])
data5 = load_data(folder_path, file_names[4])

data = pd.concat([data1, data2, data3, data4, data5], axis=0)
data.shape

(2258, 15)

# Preprocessing

In [6]:
data.head()

Unnamed: 0,conversation_id_str,created_at,favorite_count,full_text,id_str,image_url,in_reply_to_screen_name,lang,location,quote_count,reply_count,retweet_count,tweet_url,user_id_str,username
0,1786040009260445713,Thu May 02 14:28:15 +0000 2024,3263,aku ga pernah punya pengalaman serupa tapi be...,1786040009260445713,https://pbs.twimg.com/media/GMlJ685aAAASA6n.jpg,,in,,300,109,212,https://twitter.com/convomfs/status/1786040009...,1284061445148209154,convomfs
1,1786085322604044347,Thu May 02 17:28:19 +0000 2024,2612,Wujud Revolusi Mental: - Yang dihajar yang min...,1786085322604044347,,,in,"Depok, Indonesia",11,18,1462,https://twitter.com/andikamalreza/status/17860...,499262326,andikamalreza
2,1786182532943479201,Thu May 02 23:54:36 +0000 2024,83,Lagi viral ! Pengusaha empek-empek di Palemban...,1786182532943479201,https://pbs.twimg.com/ext_tw_video_thumb/17861...,,in,"Ponorogo, Indonesia",8,14,41,https://twitter.com/Naandaa27/status/178618253...,1259753320152887296,Naandaa27
3,1785628643139698992,Wed May 01 11:13:38 +0000 2024,3985,Tolak Bayar Pajak Pasangan Ini Pilih Robek Tas...,1785628643139698992,https://pbs.twimg.com/media/GMfTyHyb0AAPHIF.jpg,,in,,49,297,567,https://twitter.com/Artic_monkey12/status/1785...,1388483598,Artic_monkey12
4,1785206063916310954,Tue Apr 30 07:14:27 +0000 2024,771,Hukum bekerja dikantor pajak dan bea cukai htt...,1785206063916310954,https://pbs.twimg.com/ext_tw_video_thumb/17852...,,in,Indonesia,48,95,271,https://twitter.com/fotodakwah/status/17852060...,105428309,fotodakwah


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2258 entries, 0 to 593
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   conversation_id_str      2258 non-null   int64 
 1   created_at               2258 non-null   object
 2   favorite_count           2258 non-null   int64 
 3   full_text                2258 non-null   object
 4   id_str                   2258 non-null   int64 
 5   image_url                857 non-null    object
 6   in_reply_to_screen_name  461 non-null    object
 7   lang                     2258 non-null   object
 8   location                 1532 non-null   object
 9   quote_count              2258 non-null   int64 
 10  reply_count              2258 non-null   int64 
 11  retweet_count            2258 non-null   int64 
 12  tweet_url                2258 non-null   object
 13  user_id_str              2258 non-null   int64 
 14  username                 2258 non-null   objec

In [8]:
data.describe()

Unnamed: 0,conversation_id_str,favorite_count,id_str,quote_count,reply_count,retweet_count,user_id_str
count,2258.0,2258.0,2258.0,2258.0,2258.0,2258.0,2258.0
mean,1.77003e+18,2066.30248,1.770052e+18,71.310895,74.966342,573.108503,6.704591e+17
std,8.328196e+16,6444.182483,8.328505e+16,377.751347,271.0801,1841.943679,6.974322e+17
min,2.498896e+17,0.0,2.498896e+17,0.0,0.0,0.0,744253.0
25%,1.782678e+18,9.0,1.782691e+18,0.0,1.0,2.0,198027100.0
50%,1.783815e+18,46.0,1.78382e+18,1.0,4.0,12.0,7.126836e+17
75%,1.784605e+18,444.0,1.784608e+18,8.0,20.0,98.0,1.368347e+18
max,1.786253e+18,64155.0,1.786253e+18,9325.0,3072.0,32782.0,1.775367e+18


In [9]:
data.isna().sum()

conversation_id_str           0
created_at                    0
favorite_count                0
full_text                     0
id_str                        0
image_url                  1401
in_reply_to_screen_name    1797
lang                          0
location                    726
quote_count                   0
reply_count                   0
retweet_count                 0
tweet_url                     0
user_id_str                   0
username                      0
dtype: int64

## Drop unwanted columns

In [10]:
data = data[['id_str', 'full_text']]
data.head()

Unnamed: 0,id_str,full_text
0,1786040009260445713,aku ga pernah punya pengalaman serupa tapi be...
1,1786085322604044347,Wujud Revolusi Mental: - Yang dihajar yang min...
2,1786182532943479201,Lagi viral ! Pengusaha empek-empek di Palemban...
3,1785628643139698992,Tolak Bayar Pajak Pasangan Ini Pilih Robek Tas...
4,1785206063916310954,Hukum bekerja dikantor pajak dan bea cukai htt...


## Drop missing values & duplicate data

In [11]:
data.isna().sum(), data.shape

(id_str       0
 full_text    0
 dtype: int64,
 (2258, 2))

In [12]:
data.drop_duplicates(inplace=True)
data.shape

(1177, 2)

In [13]:
data.to_csv(r'D:\Code\py_code\Text-Processing\data\labelled\clean-column.csv', index=False)
data.to_excel(r'D:\Code\py_code\Text-Processing\data\labelled\clean-column.xlsx', index=False)


In [14]:
labelled_data = pd.read_excel(r'D:\Code\py_code\Text-Processing\data\labelled\labelled_data.xlsx')
labelled_data.head()

Unnamed: 0,id_str,full_text,label
0,1,aku ga pernah punya pengalaman serupa tapi be...,0
1,2,Wujud Revolusi Mental: - Yang dihajar yang min...,0
2,3,Lagi viral ! Pengusaha empek-empek di Palemban...,2
3,4,Tolak Bayar Pajak Pasangan Ini Pilih Robek Tas...,0
4,5,Hukum bekerja dikantor pajak dan bea cukai htt...,2


# Text Preprocessing

In [15]:
import string 
import re

from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

### Case Folding (Lowercase)

In [16]:
labelled_data['full_text'] = labelled_data['full_text'].str.lower()
labelled_data.head()

Unnamed: 0,id_str,full_text,label
0,1,aku ga pernah punya pengalaman serupa tapi be...,0
1,2,wujud revolusi mental: - yang dihajar yang min...,0
2,3,lagi viral ! pengusaha empek-empek di palemban...,2
3,4,tolak bayar pajak pasangan ini pilih robek tas...,0
4,5,hukum bekerja dikantor pajak dan bea cukai htt...,2


### Remove URL link

In [17]:
def remove_tweet_special(text):
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    text = text.encode('ascii', 'replace').decode('ascii')
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(http://|https://)\S+", " ", text).split())
    return text.replace("http://", " ").replace("https://", " ")

In [18]:
labelled_data['full_text'] = labelled_data['full_text'].apply(remove_tweet_special)
labelled_data.head()

Unnamed: 0,id_str,full_text,label
0,1,aku ga pernah punya pengalaman serupa tapi bea...,0
1,2,wujud revolusi mental: - yang dihajar yang min...,0
2,3,lagi viral ! pengusaha empek-empek di palemban...,2
3,4,tolak bayar pajak pasangan ini pilih robek tas...,0
4,5,hukum bekerja dikantor pajak dan bea cukai,2


### Remove Numbers

In [19]:
def remove_number(text):
    return  re.sub(r"\d+", "", text)

In [20]:
labelled_data['full_text'] = labelled_data['full_text'].apply(remove_number)
labelled_data.head()

Unnamed: 0,id_str,full_text,label
0,1,aku ga pernah punya pengalaman serupa tapi bea...,0
1,2,wujud revolusi mental: - yang dihajar yang min...,0
2,3,lagi viral ! pengusaha empek-empek di palemban...,2
3,4,tolak bayar pajak pasangan ini pilih robek tas...,0
4,5,hukum bekerja dikantor pajak dan bea cukai,2


### Remove Punctuation

In [21]:
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

In [22]:
labelled_data['full_text'] = labelled_data['full_text'].apply(remove_punctuation)
labelled_data.head()

Unnamed: 0,id_str,full_text,label
0,1,aku ga pernah punya pengalaman serupa tapi bea...,0
1,2,wujud revolusi mental yang dihajar yang minta...,0
2,3,lagi viral pengusaha empekempek di palembang ...,2
3,4,tolak bayar pajak pasangan ini pilih robek tas...,0
4,5,hukum bekerja dikantor pajak dan bea cukai,2


### Remove Whitespace

In [23]:
def remove_whitespace_lt(text):
    return text.strip()

def remove_whitespace_multiple(text):
    return re.sub(r'\s+',' ',text)

In [24]:
labelled_data['full_text'] = labelled_data['full_text'].apply(remove_whitespace_lt)
labelled_data['full_text'] = labelled_data['full_text'].apply(remove_whitespace_multiple)
labelled_data.head()

Unnamed: 0,id_str,full_text,label
0,1,aku ga pernah punya pengalaman serupa tapi bea...,0
1,2,wujud revolusi mental yang dihajar yang minta ...,0
2,3,lagi viral pengusaha empekempek di palembang d...,2
3,4,tolak bayar pajak pasangan ini pilih robek tas...,0
4,5,hukum bekerja dikantor pajak dan bea cukai,2


### Remove Single Character

In [25]:
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

In [26]:
labelled_data['full_text'] = labelled_data['full_text'].apply(remove_singl_char)
labelled_data.head()

Unnamed: 0,id_str,full_text,label
0,1,aku ga pernah punya pengalaman serupa tapi bea...,0
1,2,wujud revolusi mental yang dihajar yang minta ...,0
2,3,lagi viral pengusaha empekempek di palembang d...,2
3,4,tolak bayar pajak pasangan ini pilih robek tas...,0
4,5,hukum bekerja dikantor pajak dan bea cukai,2


### Tokenization

In [27]:
def word_tokenize_wrapper(text):
    return word_tokenize(text)

In [28]:
labelled_data['tweet_tokens'] = labelled_data['full_text'].apply(word_tokenize_wrapper)
labelled_data.head()

Unnamed: 0,id_str,full_text,label,tweet_tokens
0,1,aku ga pernah punya pengalaman serupa tapi bea...,0,"[aku, ga, pernah, punya, pengalaman, serupa, t..."
1,2,wujud revolusi mental yang dihajar yang minta ...,0,"[wujud, revolusi, mental, yang, dihajar, yang,..."
2,3,lagi viral pengusaha empekempek di palembang d...,2,"[lagi, viral, pengusaha, empekempek, di, palem..."
3,4,tolak bayar pajak pasangan ini pilih robek tas...,0,"[tolak, bayar, pajak, pasangan, ini, pilih, ro..."
4,5,hukum bekerja dikantor pajak dan bea cukai,2,"[hukum, bekerja, dikantor, pajak, dan, bea, cu..."


### Stopwords

In [29]:
from nltk.corpus import stopwords

In [30]:
list_stopwords = stopwords.words('indonesian')
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])

txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))
list_stopwords = set(list_stopwords)

In [31]:
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

In [32]:
labelled_data['tweet_tokens_wsw'] = labelled_data['tweet_tokens'].apply(stopwords_removal)
labelled_data.head()

Unnamed: 0,id_str,full_text,label,tweet_tokens,tweet_tokens_wsw
0,1,aku ga pernah punya pengalaman serupa tapi bea...,0,"[aku, ga, pernah, punya, pengalaman, serupa, t...","[pengalaman, bea, cukai, kerjaannya, dah, liat..."
1,2,wujud revolusi mental yang dihajar yang minta ...,0,"[wujud, revolusi, mental, yang, dihajar, yang,...","[wujud, revolusi, mental, dihajar, maaf, nyeto..."
2,3,lagi viral pengusaha empekempek di palembang d...,2,"[lagi, viral, pengusaha, empekempek, di, palem...","[viral, pengusaha, empekempek, palembang, dita..."
3,4,tolak bayar pajak pasangan ini pilih robek tas...,0,"[tolak, bayar, pajak, pasangan, ini, pilih, ro...","[tolak, bayar, pajak, pasangan, pilih, robek, ..."
4,5,hukum bekerja dikantor pajak dan bea cukai,2,"[hukum, bekerja, dikantor, pajak, dan, bea, cu...","[hukum, dikantor, pajak, bea, cukai]"


### Stemming

In [33]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [35]:
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in labelled_data['tweet_tokens_wsw']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '

for term in tqdm(term_dict):
    term_dict[term] = stemmed_wrapper(term)

def get_stemmed_term(document):
    return [term_dict[term] for term in document]

100%|██████████| 5996/5996 [05:17<00:00, 18.89it/s]


In [36]:
labelled_data['tweet_tokens_stemmed'] = labelled_data['tweet_tokens_wsw'].swifter.apply(get_stemmed_term)
labelled_data.head()

Pandas Apply: 100%|██████████| 1177/1177 [00:00<00:00, 343517.90it/s]


Unnamed: 0,id_str,full_text,label,tweet_tokens,tweet_tokens_wsw,tweet_tokens_stemmed
0,1,aku ga pernah punya pengalaman serupa tapi bea...,0,"[aku, ga, pernah, punya, pengalaman, serupa, t...","[pengalaman, bea, cukai, kerjaannya, dah, liat...","[alam, bea, cukai, kerja, dah, liat, berita, s..."
1,2,wujud revolusi mental yang dihajar yang minta ...,0,"[wujud, revolusi, mental, yang, dihajar, yang,...","[wujud, revolusi, mental, dihajar, maaf, nyeto...","[wujud, revolusi, mental, hajar, maaf, nyetor,..."
2,3,lagi viral pengusaha empekempek di palembang d...,2,"[lagi, viral, pengusaha, empekempek, di, palem...","[viral, pengusaha, empekempek, palembang, dita...","[viral, usaha, empekempek, palembang, tagih, p..."
3,4,tolak bayar pajak pasangan ini pilih robek tas...,0,"[tolak, bayar, pajak, pasangan, ini, pilih, ro...","[tolak, bayar, pajak, pasangan, pilih, robek, ...","[tolak, bayar, pajak, pasang, pilih, robek, ta..."
4,5,hukum bekerja dikantor pajak dan bea cukai,2,"[hukum, bekerja, dikantor, pajak, dan, bea, cu...","[hukum, dikantor, pajak, bea, cukai]","[hukum, kantor, pajak, bea, cukai]"


In [37]:
labelled_data = labelled_data.dropna()
labelled_data.to_csv(r'D:\Code\py_code\Text-Processing\data\clean\preprocessed_data.csv', index=False)

# TF-IDF

### Prepare Corpus

In [38]:
import ast

In [39]:
processed_data = pd.read_csv(r'D:\Code\py_code\Text-Processing\data\clean\preprocessed_data.csv')
processed_data.head()

Unnamed: 0,id_str,full_text,label,tweet_tokens,tweet_tokens_wsw,tweet_tokens_stemmed
0,1,aku ga pernah punya pengalaman serupa tapi bea...,0,"['aku', 'ga', 'pernah', 'punya', 'pengalaman',...","['pengalaman', 'bea', 'cukai', 'kerjaannya', '...","['alam', 'bea', 'cukai', 'kerja', 'dah', 'liat..."
1,2,wujud revolusi mental yang dihajar yang minta ...,0,"['wujud', 'revolusi', 'mental', 'yang', 'dihaj...","['wujud', 'revolusi', 'mental', 'dihajar', 'ma...","['wujud', 'revolusi', 'mental', 'hajar', 'maaf..."
2,3,lagi viral pengusaha empekempek di palembang d...,2,"['lagi', 'viral', 'pengusaha', 'empekempek', '...","['viral', 'pengusaha', 'empekempek', 'palemban...","['viral', 'usaha', 'empekempek', 'palembang', ..."
3,4,tolak bayar pajak pasangan ini pilih robek tas...,0,"['tolak', 'bayar', 'pajak', 'pasangan', 'ini',...","['tolak', 'bayar', 'pajak', 'pasangan', 'pilih...","['tolak', 'bayar', 'pajak', 'pasang', 'pilih',..."
4,5,hukum bekerja dikantor pajak dan bea cukai,2,"['hukum', 'bekerja', 'dikantor', 'pajak', 'dan...","['hukum', 'dikantor', 'pajak', 'bea', 'cukai']","['hukum', 'kantor', 'pajak', 'bea', 'cukai']"


In [40]:
def join_text_list(texts):
    texts = ast.literal_eval(texts)
    return ' '.join([text for text in texts])

In [41]:
processed_data['tweet_join'] = processed_data['tweet_tokens_stemmed'].apply(join_text_list)
processed_data.head()

Unnamed: 0,id_str,full_text,label,tweet_tokens,tweet_tokens_wsw,tweet_tokens_stemmed,tweet_join
0,1,aku ga pernah punya pengalaman serupa tapi bea...,0,"['aku', 'ga', 'pernah', 'punya', 'pengalaman',...","['pengalaman', 'bea', 'cukai', 'kerjaannya', '...","['alam', 'bea', 'cukai', 'kerja', 'dah', 'liat...",alam bea cukai kerja dah liat berita seliwer k...
1,2,wujud revolusi mental yang dihajar yang minta ...,0,"['wujud', 'revolusi', 'mental', 'yang', 'dihaj...","['wujud', 'revolusi', 'mental', 'dihajar', 'ma...","['wujud', 'revolusi', 'mental', 'hajar', 'maaf...",wujud revolusi mental hajar maaf nyetor pajak ...
2,3,lagi viral pengusaha empekempek di palembang d...,2,"['lagi', 'viral', 'pengusaha', 'empekempek', '...","['viral', 'pengusaha', 'empekempek', 'palemban...","['viral', 'usaha', 'empekempek', 'palembang', ...",viral usaha empekempek palembang tagih pajak n...
3,4,tolak bayar pajak pasangan ini pilih robek tas...,0,"['tolak', 'bayar', 'pajak', 'pasangan', 'ini',...","['tolak', 'bayar', 'pajak', 'pasangan', 'pilih...","['tolak', 'bayar', 'pajak', 'pasang', 'pilih',...",tolak bayar pajak pasang pilih robek tas herme...
4,5,hukum bekerja dikantor pajak dan bea cukai,2,"['hukum', 'bekerja', 'dikantor', 'pajak', 'dan...","['hukum', 'dikantor', 'pajak', 'bea', 'cukai']","['hukum', 'kantor', 'pajak', 'bea', 'cukai']",hukum kantor pajak bea cukai


### TF-IDF Term Rank

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
negative_label = processed_data[processed_data['label'] == 0]
positive_label = processed_data[processed_data['label'] == 1]
neutral_label = processed_data[processed_data['label'] == 2]

In [72]:
def tf_idf_calculator(data):
    tf_idf = TfidfVectorizer(max_features=1000, binary=True)
    tfidf_mat = tf_idf.fit_transform(data["tweet_join"]).toarray()
    terms = tf_idf.get_feature_names_out()

    # Word count calculation
    word_counts = {}
    for doc in data["tweet_join"]:
        for word in doc.split():
            word_counts[word] = word_counts.get(word, 0) + 1

    # Create the DataFrame
    temp = []
    for col, term in enumerate(terms):
        temp.append({
            'term': term,
            'rank': tfidf_mat.sum(axis=0)[col],
            'count': word_counts.get(term, 0)  # Get count from word_counts
        })

    ranking = pd.DataFrame(temp)
    ranking = ranking.sort_values('rank', ascending=False)

    return ranking

In [77]:
ranking = tf_idf_calculator(processed_data)
negative_ranking = tf_idf_calculator(negative_label)
positive_ranking = tf_idf_calculator(positive_label)
neutral_ranking = tf_idf_calculator(neutral_label)

### Visualization of the most frequent words

In [60]:
fig = px.bar(ranking.head(25), x="count", y="term", title='Common Words in Selected Text', orientation='h',  width=700, height=700, color='term')
fig.show()

In [61]:
fig = px.treemap(ranking.head(25), path=['term'], values='count',title='Tree of Most Common Words')
fig.show()

### Count Vectorizer

In [78]:

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

max_features = 1000

# calc TF vector
cvect = CountVectorizer(max_features=max_features)
TF_vector = cvect.fit_transform(processed_data["tweet_join"])

# normalize TF vector
normalized_TF_vector = normalize(TF_vector, norm='l1', axis=1)

# calc IDF
tfidf = TfidfVectorizer(max_features=max_features, smooth_idf=False)
tfs = tfidf.fit_transform(processed_data["tweet_join"])
IDF_vector = tfidf.idf_

# hitung TF x IDF sehingga dihasilkan TFIDF matrix / vector
tfidf_mat = normalized_TF_vector.multiply(IDF_vector).toarray()

In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

max_features = 1000

cvect = CountVectorizer(max_features=max_features, ngram_range=(1,3))
counts = cvect.fit_transform(processed_data["tweet_join"])

normalized_counts = normalize(counts, norm='l1', axis=1)

tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(1,3), smooth_idf=False)
tfs = tfidf.fit_transform(processed_data["tweet_join"])

tfidf_mat = normalized_counts.multiply(tfidf.idf_).toarray()
tfidf.get_feature_names_out()

array(['abis', 'abu', 'acara', 'adil', 'administrasi', 'ah', 'ahmad',
       'ajar', 'aju', 'akal', 'akaryawanampwiraswasta',
       'akaryawanampwiraswasta proses',
       'akaryawanampwiraswasta proses cepatampmudah', 'akibat', 'akun',
       'akuntansi', 'akuntansi akuntansi', 'akuntansi akuntansi sma',
       'akuntansi akuntansi uang', 'akuntansi siklus',
       'akuntansi siklus akuntansi', 'akuntansi sma',
       'akuntansi sma antar', 'akuntansi uang', 'akuntansi uang dll',
       'al', 'alam', 'alat', 'alat sehat', 'alias', 'all', 'alun',
       'alun trisambodo', 'ama', 'aman', 'ambil', 'an', 'anak', 'and',
       'aneh', 'anggap', 'anggar', 'anggota', 'anies', 'anjing', 'antar',
       'antar akuntansi', 'antar akuntansi siklus', 'apa', 'apaapa',
       'apbn', 'apresiasi', 'april', 'artikel', 'as', 'asa', 'asing',
       'asing maksimal', 'asing maksimal juta', 'asli', 'asn', 'asuransi',
       'atas', 'atur', 'ayo', 'ayo guys', 'ayo guys butuh', 'baca',
       'badan', 'ba

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

max_features = 1000


def generate_tfidf_mat(min_gram, max_gram):
    cvect = CountVectorizer(max_features=max_features, ngram_range=(min_gram, max_gram))
    counts = cvect.fit_transform(processed_data["tweet_join"])

    normalized_counts = normalize(counts, norm='l1', axis=1)

    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(min_gram, max_gram), smooth_idf=False)
    tfs = tfidf.fit_transform(processed_data["tweet_join"])

    tfidf_mat = normalized_counts.multiply(tfidf.idf_).toarray()
    
    TF = normalized_counts.toarray()
    IDF = tfidf.idf_
    TF_IDF = tfidf_mat
    return TF, IDF, TF_IDF, tfidf.get_feature_names_out()

# ngram_range (1, 1) to use unigram only
tf_mat_unigram, idf_mat_unigram, tfidf_mat_unigram, terms_unigram = generate_tfidf_mat(1,1)

# ngram_range (2, 2) to use bigram only
tf_mat_bigram, idf_mat_bigram, tfidf_mat_bigram, terms_bigram = generate_tfidf_mat(2,2)

# ngram_range (3, 3) to use trigram only
tf_mat_trigram, idf_mat_trigram, tfidf_mat_trigram, terms_trigram = generate_tfidf_mat(3,3)

# ---------- check sparse data -------------------
idx_sample = 0

print("Show TFIDF sample ke-" + str(idx_sample), "\n")
print(processed_data['tweet_tokens_stemmed'][idx_sample], "\n")

print("\t\t\t", "TF", "\t\t", "IDF", "\t\t", "TF-IDF", "\t", "Term\n")
for i, item in enumerate(zip(tf_mat_unigram[idx_sample], idf_mat_unigram, tfidf_mat_unigram[idx_sample], terms_unigram)):
    if(item[2] != 0.0):
        print ("array position " + str(i) + "\t", 
               "%.6f" % item[0], "\t", 
               "%.6f" % item[1], "\t", 
               "%.6f" % item[2], "\t", 
               item[3])


Show TFIDF sample ke-0 

['alam', 'bea', 'cukai', 'kerja', 'dah', 'liat', 'berita', 'seliwer', 'kaya', 'nguras', 'duit', 'orang', 'anjir', 'liat', 'kasih', 'pajak', 'lipat', 'harga', 'barang'] 

			 TF 		 IDF 		 TF-IDF 	 Term

array position 20	 0.058824 	 6.278965 	 0.369351 	 alam
array position 45	 0.058824 	 6.684430 	 0.393202 	 anjir
array position 94	 0.058824 	 3.416764 	 0.200986 	 barang
array position 108	 0.058824 	 3.352225 	 0.197190 	 bea
array position 125	 0.058824 	 6.461286 	 0.380076 	 berita
array position 195	 0.058824 	 3.506376 	 0.206257 	 cukai
array position 201	 0.058824 	 5.505775 	 0.323869 	 dah
array position 258	 0.058824 	 3.993187 	 0.234893 	 duit
array position 345	 0.058824 	 3.258540 	 0.191679 	 harga
array position 450	 0.058824 	 4.544364 	 0.267316 	 kasih
array position 453	 0.058824 	 4.220577 	 0.248269 	 kaya
array position 472	 0.058824 	 3.701276 	 0.217722 	 kerja
array position 525	 0.117647 	 5.362674 	 0.630903 	 liat
array position 

In [84]:
def get_TF_unigram(row):
    idx = row.name
    return [tf for tf in tf_mat_unigram[idx] if tf != 0.0]

processed_data["TF_UNIGRAM"] = processed_data.apply(get_TF_unigram, axis=1)

def get_IDF_unigram(row):
    idx = row.name
    return [item[1] for item in zip(tf_mat_unigram[idx], idf_mat_unigram) if item[0] != 0.0]

processed_data["IDF_UNIGRAM"] = processed_data.apply(get_IDF_unigram, axis=1)

def get_TFIDF_unigram(row):
    idx = row.name
    return [tfidf for tfidf in tfidf_mat_unigram[idx] if tfidf != 0.0]

processed_data["TFIDF_UNIGRAM"] = processed_data.apply(get_TFIDF_unigram, axis=1)

processed_data[["tweet_tokens_stemmed", "TF_UNIGRAM", "IDF_UNIGRAM", "TFIDF_UNIGRAM"]].head()

# save TFIDF Unigram to Excel

processed_data[["tweet_tokens_stemmed", "TF_UNIGRAM", "IDF_UNIGRAM", "TFIDF_UNIGRAM"]].to_excel(r"D:\Code\py_code\Text-Processing\data\clean\TFIDF_Unigram.xlsx")


In [85]:
def get_TF_bigram(row):
    idx = row.name
    return [tf for tf in tf_mat_bigram[idx] if tf != 0.0]

processed_data["TF_BIGRAM"] = processed_data.apply(get_TF_bigram, axis=1)

def get_IDF_bigram(row):
    idx = row.name
    return [item[1] for item in zip(tf_mat_bigram[idx], idf_mat_bigram) if item[0] != 0.0]

processed_data["IDF_BIGRAM"] = processed_data.apply(get_IDF_bigram, axis=1)

def get_TFIDF_bigram(row):
    idx = row.name
    return [tfidf for tfidf in tfidf_mat_bigram[idx] if tfidf != 0.0]

processed_data["TFIDF_BIGRAM"] = processed_data.apply(get_TFIDF_bigram, axis=1)

def get_Term_bigram(row):
    idx = row.name
    return [item[1] for item in zip(tf_mat_bigram[idx], terms_bigram) if item[0] != 0.0]

processed_data["TWEET_BIGRAM"] = processed_data.apply(get_Term_bigram, axis=1)

processed_data[["TWEET_BIGRAM", "TF_BIGRAM", "IDF_BIGRAM", "TFIDF_BIGRAM"]].head()


# save TFIDF Bigram to Excel

processed_data[["TWEET_BIGRAM", "TF_BIGRAM", "IDF_BIGRAM", "TFIDF_BIGRAM"]].to_excel(r"D:\Code\py_code\Text-Processing\data\clean\TFIDF_Bigram.xlsx")


In [86]:
def get_TF_trigram(row):
    idx = row.name
    return [tf for tf in tf_mat_trigram[idx] if tf != 0.0]

processed_data["TF_trigram"] = processed_data.apply(get_TF_trigram, axis=1)

def get_IDF_trigram(row):
    idx = row.name
    return [item[1] for item in zip(tf_mat_trigram[idx], idf_mat_trigram) if item[0] != 0.0]

processed_data["IDF_trigram"] = processed_data.apply(get_IDF_trigram, axis=1)

def get_TFIDF_trigram(row):
    idx = row.name
    return [tfidf for tfidf in tfidf_mat_trigram[idx] if tfidf != 0.0]

processed_data["TFIDF_trigram"] = processed_data.apply(get_TFIDF_trigram, axis=1)

def get_Term_trigram(row):
    idx = row.name
    return [item[1] for item in zip(tf_mat_trigram[idx], terms_trigram) if item[0] != 0.0]

processed_data["TWEET_TRIGRAM"] = processed_data.apply(get_Term_trigram, axis=1)

processed_data[["TWEET_TRIGRAM", "TF_trigram", "IDF_trigram", "TFIDF_trigram"]].head()


# save TFIDF Trigram to Excel

processed_data[["TWEET_TRIGRAM", "TF_trigram", "IDF_trigram", "TFIDF_trigram"]].to_excel(r"D:\Code\py_code\Text-Processing\data\clean\TFIDF_Trigram.xlsx")
