# Import Module

In [53]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import os

# Load the data

In [54]:
import os

folder_path = r"D:\Code\py_code\Text-Processing\data\twitter_data"
file_names = os.listdir(folder_path)

print(file_names)

['pajak (1).csv', 'pajak (2).csv', 'pajak-1.csv', 'pajak-10.csv', 'pajak-aufa.csv', 'pajak-evi1.csv', 'pajak-evi2.csv', 'pajak.csv']


In [55]:
def load_data(folder_path, file_name):
    data = pd.read_csv(os.path.join(folder_path, file_name))
    return data

In [56]:
datas = []

for file_name in file_names:
    data = load_data(folder_path, file_name)
    datas.append(data)

data = pd.concat(datas, axis=0)
data.shape

(2955, 15)

# Preprocessing

In [57]:
data.head()

Unnamed: 0,conversation_id_str,created_at,favorite_count,full_text,id_str,image_url,in_reply_to_screen_name,lang,location,quote_count,reply_count,retweet_count,tweet_url,user_id_str,username
0,1786040009260445713,Thu May 02 14:28:15 +0000 2024,9620,aku ga pernah punya pengalaman serupa tapi be...,1786040009260445713,https://pbs.twimg.com/media/GMlJ685aAAASA6n.jpg,,in,,638,262,627,https://twitter.com/convomfs/status/1786040009...,1284061445148209154,convomfs
1,1786277896056864854,Fri May 03 06:13:32 +0000 2024,210,Wuiiih perpanjang kontrak dgn red spark Megawa...,1786277896056864854,https://pbs.twimg.com/ext_tw_video_thumb/17862...,,in,58%,3,58,81,https://twitter.com/toe_giman/status/178627789...,1336576386987790336,toe_giman
2,1788725387201261835,Fri May 10 00:18:59 +0000 2024,156,Ada 3 kepastian di dunia ini: 1) Kematian 2) P...,1788725387201261835,,,in,"Michuhol-gu, Republic of Korea",1,9,18,https://twitter.com/ardisatriawan/status/17887...,170542374,ardisatriawan
3,1786778563557163490,Sat May 04 15:23:00 +0000 2024,42,Pemerintahan Joe Biden membakar uang pajak war...,1786778563557163490,https://pbs.twimg.com/media/GMvpomIbkAAYrSE.jpg,,in,Indonesia,0,3,8,https://twitter.com/Vendra_Deje/status/1786778...,384176072,Vendra_Deje
4,1788764422900732144,Fri May 10 02:54:06 +0000 2024,37,Indonesia memburuk sangat cepat marilah bangki...,1788764422900732144,https://pbs.twimg.com/media/GNL3w1Ja4AAHY7u.jpg,,in,,2,0,24,https://twitter.com/Raky4tB3rs4tu__/status/178...,1629184074781954048,Raky4tB3rs4tu__


In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2955 entries, 0 to 593
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   conversation_id_str      2955 non-null   int64 
 1   created_at               2955 non-null   object
 2   favorite_count           2955 non-null   int64 
 3   full_text                2955 non-null   object
 4   id_str                   2955 non-null   int64 
 5   image_url                1202 non-null   object
 6   in_reply_to_screen_name  595 non-null    object
 7   lang                     2955 non-null   object
 8   location                 1955 non-null   object
 9   quote_count              2955 non-null   int64 
 10  reply_count              2955 non-null   int64 
 11  retweet_count            2955 non-null   int64 
 12  tweet_url                2955 non-null   object
 13  user_id_str              2955 non-null   int64 
 14  username                 2955 non-null   objec

In [59]:
data.describe()

Unnamed: 0,conversation_id_str,favorite_count,id_str,quote_count,reply_count,retweet_count,user_id_str
count,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0
mean,1.774143e+18,1727.250761,1.774163e+18,57.832487,64.388156,473.622673,7.020478e+17
std,7.31734e+16,5766.266791,7.317592e+16,332.04532,241.746094,1637.588647,7.048841e+17
min,2.498896e+17,0.0,2.498896e+17,0.0,0.0,0.0,744253.0
25%,1.783009e+18,8.0,1.783017e+18,0.0,1.0,2.0,237530300.0
50%,1.784411e+18,45.0,1.784413e+18,1.0,4.0,12.0,7.865173e+17
75%,1.78591e+18,378.0,1.785937e+18,7.0,20.0,90.0,1.391916e+18
max,1.788873e+18,64155.0,1.788873e+18,9325.0,3072.0,32782.0,1.775367e+18


In [60]:
data.isna().sum()

conversation_id_str           0
created_at                    0
favorite_count                0
full_text                     0
id_str                        0
image_url                  1753
in_reply_to_screen_name    2360
lang                          0
location                   1000
quote_count                   0
reply_count                   0
retweet_count                 0
tweet_url                     0
user_id_str                   0
username                      0
dtype: int64

## Drop unwanted columns

In [61]:
data = data[['full_text']]
data.head()

Unnamed: 0,full_text
0,aku ga pernah punya pengalaman serupa tapi be...
1,Wuiiih perpanjang kontrak dgn red spark Megawa...
2,Ada 3 kepastian di dunia ini: 1) Kematian 2) P...
3,Pemerintahan Joe Biden membakar uang pajak war...
4,Indonesia memburuk sangat cepat marilah bangki...


## Drop missing values & duplicate data

In [62]:
data.isna().sum(), data.shape

(full_text    0
 dtype: int64,
 (2955, 1))

In [63]:
data.drop_duplicates(inplace=True)
data.shape

(1533, 1)

In [64]:
data.to_csv(r'D:\Code\py_code\Text-Processing\data\labelled\clean-column.csv', index=False)
data.to_excel(r'D:\Code\py_code\Text-Processing\data\labelled\clean-column.xlsx', index=False)


In [65]:
labelled_data = pd.read_excel(r'D:\Code\py_code\Text-Processing\data\labelled\labelled_data.xlsx')
labelled_data.head()

Unnamed: 0,id_str,full_text,label
0,1,aku ga pernah punya pengalaman serupa tapi be...,0
1,2,Wujud Revolusi Mental: - Yang dihajar yang min...,0
2,3,Lagi viral ! Pengusaha empek-empek di Palemban...,1
3,4,Tolak Bayar Pajak Pasangan Ini Pilih Robek Tas...,0
4,5,Hukum bekerja dikantor pajak dan bea cukai htt...,1
