# Prepocessing Data

In [1]:
import re
import pandas as pd
import numpy as np
import nltk
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

In [2]:
dataset = pd.read_csv('data/Test.csv')
dataset.head(5)

Unnamed: 0,Tanggal&Waktu,Username,Tweet
0,2022-06-14 08:34:52+00:00,IAmMasterBlade,RT @the_sweeper_sol: 🎁 2 WL SPOTS @BakedBerser...
1,2022-06-14 08:34:51+00:00,naehrstff_nft,@Atmonez Some true words here! Where in the wo...
2,2022-06-14 08:34:51+00:00,Mdzahid85418836,RT @runxofficial: 👟RunX AIRDROP: 5 MILLION RNX...
3,2022-06-14 08:34:51+00:00,Annelie00284967,RT @NightmarePrjct: Discord Officially Opening...
4,2022-06-14 08:34:51+00:00,TKirwi,RT @NightmarePrjct: Discord Officially Opening...


In [3]:
print(dataset.shape)
print(dataset.columns)
# print dataframe.

(1000, 3)
Index(['Tanggal&Waktu', 'Username', 'Tweet'], dtype='object')


In [4]:
#Cleaning Text
def cleaning_text(text):
    # replace RT tag
    text = re.sub('RT\s', '', text)
    # replace @_username
    text = re.sub('\B@\w+', '', text)
    # replace URL
    text = re.sub('(http|https):\/\/\S+', '', text)
    # replace #_something_
    text = re.sub('#+', '', text)
    # konversi huruf kapital ke huruf kecil semua
    text = text.lower()
    # replace kata yang berulang-ulang ('oooooo' menjadi '00')
    text = re.sub(r'(.)\1+', r'\1\1', text)
    # replace punctuation repetition dengan single occurance ('!!!!!!' ,enjadi '!')
    text = re.sub(r'[\?\.\!]+(?=[\?.\!])', '', text)
    # menhilangkan angka dan spesial karakter, hanya mengambil alfabet saja
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text

dataset['Clean_text'] = dataset['Tweet'].apply(lambda x: cleaning_text(x))
dataset.head(10)

Unnamed: 0,Tanggal&Waktu,Username,Tweet,Clean_text
0,2022-06-14 08:34:52+00:00,IAmMasterBlade,RT @the_sweeper_sol: 🎁 2 WL SPOTS @BakedBerser...,wl spots giveaway like and fo...
1,2022-06-14 08:34:51+00:00,naehrstff_nft,@Atmonez Some true words here! Where in the wo...,some true words here where in the world are ...
2,2022-06-14 08:34:51+00:00,Mdzahid85418836,RT @runxofficial: 👟RunX AIRDROP: 5 MILLION RNX...,runx airdrop million rnx for k partici...
3,2022-06-14 08:34:51+00:00,Annelie00284967,RT @NightmarePrjct: Discord Officially Opening...,discord officially opening gt gt p...
4,2022-06-14 08:34:51+00:00,TKirwi,RT @NightmarePrjct: Discord Officially Opening...,discord officially opening gt gt p...
5,2022-06-14 08:34:51+00:00,jonas18931520,RT @M0range: It's not too late to join the @nf...,it s not too late to join the for the cultu...
6,2022-06-14 08:34:51+00:00,houseamacentral,RT @TheBinanceNFT: Subscription for @binary_x ...,subscription for binaryx igo rh nox is no...
7,2022-06-14 08:34:51+00:00,cryptoxplora,RT @marcmscs: Came such a long way…\r\nMint is...,came such a long way mint is in less than ...
8,2022-06-14 08:34:50+00:00,animalsfarm_nft,RT @VShapr: Check out my new item on OpenSea! ...,check out my new item on opensea via
9,2022-06-14 08:34:50+00:00,missufehan,RT @DeathFiat: 💥TOP NEWS💥#nft/ #NFTCommunity /...,top news nft nftcommunity openseanft our...


In [5]:
#Tokenization
def tokenization_tweet(text):
    text = re.split('\W+',text)
    return text

dataset['Tweet_tokenization'] = dataset['Clean_text'].apply(lambda x: tokenization_tweet(x.lower()))
dataset.head(5)

Unnamed: 0,Tanggal&Waktu,Username,Tweet,Clean_text,Tweet_tokenization
0,2022-06-14 08:34:52+00:00,IAmMasterBlade,RT @the_sweeper_sol: 🎁 2 WL SPOTS @BakedBerser...,wl spots giveaway like and fo...,"[, wl, spots, giveaway, like, and, follow, and..."
1,2022-06-14 08:34:51+00:00,naehrstff_nft,@Atmonez Some true words here! Where in the wo...,some true words here where in the world are ...,"[, some, true, words, here, where, in, the, wo..."
2,2022-06-14 08:34:51+00:00,Mdzahid85418836,RT @runxofficial: 👟RunX AIRDROP: 5 MILLION RNX...,runx airdrop million rnx for k partici...,"[, runx, airdrop, million, rnx, for, k, partic..."
3,2022-06-14 08:34:51+00:00,Annelie00284967,RT @NightmarePrjct: Discord Officially Opening...,discord officially opening gt gt p...,"[, discord, officially, opening, gt, gt, passw..."
4,2022-06-14 08:34:51+00:00,TKirwi,RT @NightmarePrjct: Discord Officially Opening...,discord officially opening gt gt p...,"[, discord, officially, opening, gt, gt, passw..."


In [6]:
#Stop Removal

stopword = nltk.corpus.stopwords.words('english')
stopword.extend([""])

def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text

dataset['Stop_removal'] = dataset['Tweet_tokenization'].apply(lambda x: remove_stopwords(x))
dataset.head(10)

Unnamed: 0,Tanggal&Waktu,Username,Tweet,Clean_text,Tweet_tokenization,Stop_removal
0,2022-06-14 08:34:52+00:00,IAmMasterBlade,RT @the_sweeper_sol: 🎁 2 WL SPOTS @BakedBerser...,wl spots giveaway like and fo...,"[, wl, spots, giveaway, like, and, follow, and...","[wl, spots, giveaway, like, follow, tag, friends]"
1,2022-06-14 08:34:51+00:00,naehrstff_nft,@Atmonez Some true words here! Where in the wo...,some true words here where in the world are ...,"[, some, true, words, here, where, in, the, wo...","[true, words, world, friend]"
2,2022-06-14 08:34:51+00:00,Mdzahid85418836,RT @runxofficial: 👟RunX AIRDROP: 5 MILLION RNX...,runx airdrop million rnx for k partici...,"[, runx, airdrop, million, rnx, for, k, partic...","[runx, airdrop, million, rnx, k, participants,..."
3,2022-06-14 08:34:51+00:00,Annelie00284967,RT @NightmarePrjct: Discord Officially Opening...,discord officially opening gt gt p...,"[, discord, officially, opening, gt, gt, passw...","[discord, officially, opening, gt, gt, passwor..."
4,2022-06-14 08:34:51+00:00,TKirwi,RT @NightmarePrjct: Discord Officially Opening...,discord officially opening gt gt p...,"[, discord, officially, opening, gt, gt, passw...","[discord, officially, opening, gt, gt, passwor..."
5,2022-06-14 08:34:51+00:00,jonas18931520,RT @M0range: It's not too late to join the @nf...,it s not too late to join the for the cultu...,"[, it, s, not, too, late, to, join, the, for, ...","[late, join, culture, team, nonstop, work, pus..."
6,2022-06-14 08:34:51+00:00,houseamacentral,RT @TheBinanceNFT: Subscription for @binary_x ...,subscription for binaryx igo rh nox is no...,"[, subscription, for, binaryx, igo, rh, nox, i...","[subscription, binaryx, igo, rh, nox, live, pr..."
7,2022-06-14 08:34:51+00:00,cryptoxplora,RT @marcmscs: Came such a long way…\r\nMint is...,came such a long way mint is in less than ...,"[, came, such, a, long, way, mint, is, in, les...","[came, long, way, mint, less, hours, wait, sta..."
8,2022-06-14 08:34:50+00:00,animalsfarm_nft,RT @VShapr: Check out my new item on OpenSea! ...,check out my new item on opensea via,"[, check, out, my, new, item, on, opensea, via, ]","[check, new, item, opensea, via]"
9,2022-06-14 08:34:50+00:00,missufehan,RT @DeathFiat: 💥TOP NEWS💥#nft/ #NFTCommunity /...,top news nft nftcommunity openseanft our...,"[, top, news, nft, nftcommunity, openseanft, o...","[top, news, nft, nftcommunity, openseanft, top..."


In [7]:
#Case Folding
dataset['Username'] = dataset['Username'].str.lower()
dataset.head(10)

Unnamed: 0,Tanggal&Waktu,Username,Tweet,Clean_text,Tweet_tokenization,Stop_removal
0,2022-06-14 08:34:52+00:00,iammasterblade,RT @the_sweeper_sol: 🎁 2 WL SPOTS @BakedBerser...,wl spots giveaway like and fo...,"[, wl, spots, giveaway, like, and, follow, and...","[wl, spots, giveaway, like, follow, tag, friends]"
1,2022-06-14 08:34:51+00:00,naehrstff_nft,@Atmonez Some true words here! Where in the wo...,some true words here where in the world are ...,"[, some, true, words, here, where, in, the, wo...","[true, words, world, friend]"
2,2022-06-14 08:34:51+00:00,mdzahid85418836,RT @runxofficial: 👟RunX AIRDROP: 5 MILLION RNX...,runx airdrop million rnx for k partici...,"[, runx, airdrop, million, rnx, for, k, partic...","[runx, airdrop, million, rnx, k, participants,..."
3,2022-06-14 08:34:51+00:00,annelie00284967,RT @NightmarePrjct: Discord Officially Opening...,discord officially opening gt gt p...,"[, discord, officially, opening, gt, gt, passw...","[discord, officially, opening, gt, gt, passwor..."
4,2022-06-14 08:34:51+00:00,tkirwi,RT @NightmarePrjct: Discord Officially Opening...,discord officially opening gt gt p...,"[, discord, officially, opening, gt, gt, passw...","[discord, officially, opening, gt, gt, passwor..."
5,2022-06-14 08:34:51+00:00,jonas18931520,RT @M0range: It's not too late to join the @nf...,it s not too late to join the for the cultu...,"[, it, s, not, too, late, to, join, the, for, ...","[late, join, culture, team, nonstop, work, pus..."
6,2022-06-14 08:34:51+00:00,houseamacentral,RT @TheBinanceNFT: Subscription for @binary_x ...,subscription for binaryx igo rh nox is no...,"[, subscription, for, binaryx, igo, rh, nox, i...","[subscription, binaryx, igo, rh, nox, live, pr..."
7,2022-06-14 08:34:51+00:00,cryptoxplora,RT @marcmscs: Came such a long way…\r\nMint is...,came such a long way mint is in less than ...,"[, came, such, a, long, way, mint, is, in, les...","[came, long, way, mint, less, hours, wait, sta..."
8,2022-06-14 08:34:50+00:00,animalsfarm_nft,RT @VShapr: Check out my new item on OpenSea! ...,check out my new item on opensea via,"[, check, out, my, new, item, on, opensea, via, ]","[check, new, item, opensea, via]"
9,2022-06-14 08:34:50+00:00,missufehan,RT @DeathFiat: 💥TOP NEWS💥#nft/ #NFTCommunity /...,top news nft nftcommunity openseanft our...,"[, top, news, nft, nftcommunity, openseanft, o...","[top, news, nft, nftcommunity, openseanft, top..."


In [8]:
# STEMMING
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in dataset['Stop_removal']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

dataset['Steaming'] = dataset['Stop_removal'].swifter.apply(get_stemmed_term)
print(dataset['Steaming'])

2044
------------------------
wl : wl
spots : spots
giveaway : giveaway
like : like
follow : follow
tag : tag
friends : friends
true : true
words : words
world : world
friend : friend
runx : runx
airdrop : airdrop
million : million
rnx : rnx
k : k
participants : participants
total : total
reward : reward
user : user
per : per
r : r
discord : discord
officially : officially
opening : opening
gt : gt
password : password
nmp : nmp
win : win
enter : enter
rt : rt
fol : fol
late : late
join : join
culture : culture
team : team
nonstop : nonstop
work : work
pushing : pushing
bounds : bounds
nft : nft
space : space
subscription : subscription
binaryx : binaryx
igo : igo
rh : rh
nox : nox
live : live
prepare : prepare
bnx : bnx
spot : spot
wallet : wallet
subscribe : subscribe
particip : particip
came : came
long : long
way : way
mint : mint
less : less
hours : hours
wait : wait
start : start
building : building
fam : fam
catch : catch
check : check
new : new
item : item
opensea : opensea
via 

oat : oat
campaign : campaign
learn : learn
please : please
lin : lin
yet : yet
ducks : ducks
row : row
quackity : quackity
baby : baby
muse : muse
call : call
fatman : fatman
rasmus : rasmus
dogs : dogs
anericanbullies : anericanbullies
california : california
nigeria : nigeria
auc : auc
sold : sold
favourite : favourite
accounts : accounts
nftsold : nftsold
landscape : landscape
involving : involving
library : library
zfh : zfh
artist : artist
artwork : artwork
tonight : tonight
future : future
blockchain : blockchain
apaches : apaches
dissemination : dissemination
native : native
american : american
tribes : tribes
inspire : inspire
showdown : showdown
consensus : consensus
reignofterror : reignofterror
heading : heading
leading : leading
largest : largest
event : event
ucc : ucc
hacked : hacked
sign : sign
anything : anything
group : group
careful : careful
message : message
avax : avax
giveaways : giveaways
last : last
report : report
value : value
patient : patient
hard : hard
re

truely : truely
historic : historic
document : document
anywhere : anywhere
ugly : ugly
pets : pets
little : little
bout : bout
chicken : chicken
coop : coop
ico : ico
end : end
days : days
gulf : gulf
exchange : exchange
faceless : faceless
ta : ta
ruin : ruin
thread : thread
solananfts : solananfts
lava : lava
screen : screen
non : non
shitbeast : shitbeast
wanting : wanting
nostalgia : nostalgia
nd : nd
scre : scre
gmgm : gmgm
annebel : annebel
well : well
cnftgiveaway : cnftgiveaway
senate : senate
nodes : nodes
galactic : galactic
weeks : weeks
dia : dia
okayy : okayy
rest : rest
woof : woof
among : among
sharks : sharks
service : service
provider : provider
sciencemagic : sciencemagic
studios : studios
raised : raised
pre : pre
seed : seed
round : round
testing : testing
activationcode : activationcode
category : category
finger : finger
hustler : hustler
urban : urban
dragons : dragons
roadmap : roadmap
pleasure : pleasure
phases : phases
proyect : proyect
beautiful : beautiful


wall : wall
players : players
angel : angel
hidden : hidden
magicforest : magicforest
expect : expect
kill : kill
monsters : monsters
seco : seco
inspir : inspir
sleeve : sleeve
eight : eight
posted : posted
prices : prices
minutes : minutes
creation : creation
content : content
gamification : gamification
starts : starts
customization : customization
solananft : solananft
slow : slow
verifications : verifications
everything : everything
though : though
difficult : difficult
surely : surely
omfg : omfg
realized : realized
pushed : pushed
fine : fine
hou : hou
buddies : buddies
donating : donating
awesome : awesome
forest : forest
ap : ap
teamed : teamed
cro : cro
collec : collec
joined : joined
abo : abo
pig : pig
speechless : speechless
grow : grow
enthusiasts : enthusiasts
builders : builders
memorable : memorable
remar : remar
buyer : buyer
im : im
back : back
card : card
packs : packs
rule : rule
dc : dc
tradertersesat : tradertersesat
called : called
bridge : bridge
remaining : re

asupremacy : asupremacy
blocksmith : blocksmith
purchased : purchased
discover : discover
changing : changing
online : online
yuser : yuser
polkadot : polkadot
stay : stay
control : control
virus : virus
lives : lives
brilliant : brilliant
fluffy : fluffy
animeart : animeart
animenft : animenft
karachi : karachi
metagorgeous : metagorgeous
waters : waters
metagorgeo : metagorgeo
satari : satar
studio : studio
wrote : wrote
medium : medium
article : article
explaining : explaining
instructions : instructions
dusktopia : dusktopia
hydrated : hydrated
delightful : delightful
summer : summer
pill : pill
summervibes : summervibes
summertime : summertime
{'wl': 'wl', 'spots': 'spots', 'giveaway': 'giveaway', 'like': 'like', 'follow': 'follow', 'tag': 'tag', 'friends': 'friends', 'true': 'true', 'words': 'words', 'world': 'world', 'friend': 'friend', 'runx': 'runx', 'airdrop': 'airdrop', 'million': 'million', 'rnx': 'rnx', 'k': 'k', 'participants': 'participants', 'total': 'total', 'reward': 

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

0      [wl, spots, giveaway, like, follow, tag, friends]
1                           [true, words, world, friend]
2      [runx, airdrop, million, rnx, k, participants,...
3      [discord, officially, opening, gt, gt, passwor...
4      [discord, officially, opening, gt, gt, passwor...
                             ...                        
995    [okayy, today, good, news, wl, amp, nfts, give...
996    [giving, away, wl, spots, dusktopia, qualify, ...
997    [nft, wizards, free, mint, giveaway, announcin...
998    [stay, hydrated, fun, wish, delightful, summer...
999             [gm, platform, best, crypto, nft, space]
Name: Steaming, Length: 1000, dtype: object


In [9]:
prepocessing = dataset[['Tanggal&Waktu','Username','Clean_text','Tweet_tokenization','Stop_removal', 'Steaming']]

In [10]:
prepocessing

Unnamed: 0,Tanggal&Waktu,Username,Clean_text,Tweet_tokenization,Stop_removal,Steaming
0,2022-06-14 08:34:52+00:00,iammasterblade,wl spots giveaway like and fo...,"[, wl, spots, giveaway, like, and, follow, and...","[wl, spots, giveaway, like, follow, tag, friends]","[wl, spots, giveaway, like, follow, tag, friends]"
1,2022-06-14 08:34:51+00:00,naehrstff_nft,some true words here where in the world are ...,"[, some, true, words, here, where, in, the, wo...","[true, words, world, friend]","[true, words, world, friend]"
2,2022-06-14 08:34:51+00:00,mdzahid85418836,runx airdrop million rnx for k partici...,"[, runx, airdrop, million, rnx, for, k, partic...","[runx, airdrop, million, rnx, k, participants,...","[runx, airdrop, million, rnx, k, participants,..."
3,2022-06-14 08:34:51+00:00,annelie00284967,discord officially opening gt gt p...,"[, discord, officially, opening, gt, gt, passw...","[discord, officially, opening, gt, gt, passwor...","[discord, officially, opening, gt, gt, passwor..."
4,2022-06-14 08:34:51+00:00,tkirwi,discord officially opening gt gt p...,"[, discord, officially, opening, gt, gt, passw...","[discord, officially, opening, gt, gt, passwor...","[discord, officially, opening, gt, gt, passwor..."
...,...,...,...,...,...,...
995,2022-06-14 08:32:51+00:00,mathslab_arts,okayy so today s good news wl amp ...,"[, okayy, so, today, s, good, news, wl, amp, n...","[okayy, today, good, news, wl, amp, nfts, give...","[okayy, today, good, news, wl, amp, nfts, give..."
996,2022-06-14 08:32:51+00:00,oden_nft,giving away wl spots for dusktopia to...,"[, giving, away, wl, spots, for, dusktopia, to...","[giving, away, wl, spots, dusktopia, qualify, ...","[giving, away, wl, spots, dusktopia, qualify, ..."
997,2022-06-14 08:32:51+00:00,amber99631422,nft wizards free mint giveaway announci...,"[, nft, wizards, free, mint, giveaway, announc...","[nft, wizards, free, mint, giveaway, announcin...","[nft, wizards, free, mint, giveaway, announcin..."
998,2022-06-14 08:32:51+00:00,crtr_nft,stay hydrated and have fun we wish you all ...,"[, stay, hydrated, and, have, fun, we, wish, y...","[stay, hydrated, fun, wish, delightful, summer...","[stay, hydrated, fun, wish, delightful, summer..."


In [11]:
prepocessing.to_csv("data/PreprocessingData.csv")