# Scrape data from Twitter

In [1]:
import pandas as pd
import snscrape.modules.twitter as sntwitter

pd.set_option("max_columns", None)
pd.set_option("max_colwidth", None)
pd.set_option("max_rows", None)

## How to use snscrape 
https://betterprogramming.pub/how-to-scrape-tweets-with-snscrape-90124ed006af

### Scrape from specific user.

In [None]:
# Creating list to append tweet data 
tweets_list1 = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('from:jack').get_items()): #declare a username 
    if i>1000: #number of tweets you want to scrape
        break
    tweets_list1.append([tweet.date, tweet.id, tweet.content, tweet.user.username]) #declare the attributes to be returned
    
# Creating a dataframe from the tweets list above 
tweets_df1 = pd.DataFrame(tweets_list1, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])


In [None]:
tweets_df1 = pd.DataFrame(tweets_list1, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])

### Scrape by searching for keywords.

In [None]:
# Creating list to append tweet data to
tweets_list2 = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('its the elephant since:2020-06-01 until:2020-07-31').get_items()):
    if i>500:
        break
    tweets_list2.append([tweet.date, tweet.id, tweet.content, tweet.user.username])
    
# Creating a dataframe from the tweets list above
tweets_df2 = pd.DataFrame(tweets_list2, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])

### 1) Data Collection by keywords

**Import set of keywords from csv file**

In [196]:
keywords = pd.read_csv('keywords_hate.csv')
keywords

Unnamed: 0,keywords
0,‡∏ó‡∏∏‡πÄ‡∏£‡∏®
1,‡πÄ‡∏õ‡∏£‡∏ï
2,‡πÄ‡∏™‡∏ô‡∏µ‡∏¢‡∏î
3,‡∏™‡∏±‡∏ô‡∏î‡∏≤‡∏ô
4,‡∏™‡∏ß‡∏∞
5,‡∏õ‡∏±‡∏ç‡∏ç‡∏≤‡∏≠‡πà‡∏≠‡∏ô
6,‡πÑ‡∏û‡∏£‡πà
7,‡∏£‡πà‡∏≤‡∏ô
8,‡∏à‡∏±‡∏ç‡πÑ‡∏£
9,‡∏ï‡∏≠‡πÅ‡∏´‡∏•


In [4]:
keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   keywords  24 non-null     object
dtypes: object(1)
memory usage: 320.0+ bytes


### 2) Scrape 100 tweets per keyword

In [197]:
# Creating list to append tweet data to
tweets_list = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for key in keywords['keywords']:
    for i,tweet in enumerate(sntwitter.TwitterSearchScraper(f'{key} since:2021-01-01 until:2021-10-31').get_items()):
        if i>100:
            break
        tweets_list.append([tweet.date, tweet.id, tweet.content, tweet.user.username])

In [198]:
print(f'Total sweets: {len(tweets_list)}')

Total sweets: 4646


In [199]:
# Creating a dataframe from the tweets list above
tweets_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])

**Save scraped tweets to csv file**

In [200]:
tweets_df.to_csv('tweet_from_keywords_topic.csv')

---
# Clean ThaiToxicityTweetCorpus
**How to get data to look into the file /ThaiToxicityTweetCorpus_load.ipynb**  
source: https://huggingface.co/datasets/thai_toxicity_tweet

In [28]:
tweets_df = pd.read_csv('ThaiToxicityTweetCorpus.csv')

In [None]:
tweets_df.iloc[lambda x: range(1,1000,10)]

In [29]:
tweets_df.head(50)

Unnamed: 0.1,Unnamed: 0,tweet_id,Text,toxic_votes,nontoxic_votes,is_toxic
0,0,898576382384418817,‡∏ß‡∏±‡∏ô‡πÜ ‡∏ô‡∏µ‡πà‡∏Ñ‡∏∏‡∏¢‡∏Å‡∏∞‡∏´‡∏°‡∏≤ ‡πÅ‡∏°‡∏ß ‡∏´‡∏°‡∏π ‡πÑ‡∏Å‡πà ‡∏°‡πâ‡∏≤ ‡∏Ñ‡∏ß‡∏≤‡∏¢ ‡∏°‡∏≤‡∏Å‡∏Å‡∏ß‡πà‡∏≤‡∏Ñ‡∏∏‡∏¢‡∏Å‡∏±‡∏ö‡∏Ñ‡∏ô‡πÑ‡∏õ‡∏•‡∏∞,0,3,0
1,2,899587505493692417,‡∏´‡∏•‡πà‡∏≠‡∏°‡∏≤‡∏Å‡∏Å‡∏Å ‡∏´‡∏•‡πà‡∏≠‡∏ß‡∏±‡∏ß‡∏ï‡∏≤‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏¢‡∏•‡πâ‡∏°‡∏Å‡∏±‡∏ô‡πÄ‡∏•‡∏¢‡∏ó‡∏µ‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡∏ß‡∏ß‡∏ß,0,3,0
2,3,898920493763280897,‡∏™‡∏¥‡∏ß‡πÄ‡∏´‡∏µ‡πâ‡∏¢‡πÑ‡∏£‡∏Ç‡∏∂‡πâ‡∏ô‡∏´‡∏•‡∏±‡∏á‡∏´‡∏π ‡πÄ‡∏™‡∏µ‡∏¢‡∏ä‡∏≤‡∏ï‡∏¥‡πÄ‡∏Å‡∏¥‡∏î‡∏°‡∏±‡πâ‡∏¢ ‡πÄ‡∏Å‡∏¥‡∏î‡∏°‡∏≤‡∏°‡∏±‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡πâ‡∏≠‡∏á‡πÇ‡∏î‡∏î‡πÄ‡∏î‡πà‡∏ô‡πÄ‡∏î‡πâ‡∏á‡∏î‡∏∂‡πã‡∏á‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏ä‡πà‡∏ô‡∏Ç‡∏∂‡πâ‡∏ô‡∏ó‡∏µ‡πà‡∏´‡∏ô‡πâ‡∏≤‡πÑ‡∏£‡∏á‡∏µ‡πâ ‡∏≠‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏¢,0,3,0
3,4,896808964326694912,‡∏≠‡πà‡∏∞ ‡∏õ‡πà‡∏ß‡∏¢‡∏Å‡πá‡∏õ‡πà‡∏ß‡∏¢ ‡∏á‡∏≤‡∏ô‡∏à‡πâ‡∏≤‡∏á‡∏Å‡πá‡∏ï‡πâ‡∏≠‡∏á‡∏ó‡∏≥ ‡∏á‡∏≤‡∏ô‡πÄ‡∏£‡∏µ‡∏¢‡∏ô‡∏Å‡πá‡∏ï‡πâ‡∏≠‡∏á‡∏ó‡∏≥ ‡∏ß‡∏¥‡∏ä‡∏≤‡∏î‡∏µ‡πÑ‡∏ã‡∏ô‡πå‡∏ï‡∏±‡∏ß‡∏£‡πâ‡∏≤‡∏¢‡∏Å‡∏±‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏¢‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏Å‡∏π‡πÄ‡∏≠‡∏á,0,3,0
4,6,896808093949976576,‡∏ô‡∏µ‡πà‡∏Å‡πá‡πÄ‡∏û‡∏¥‡πà‡∏á‡∏£‡∏π‡πâ‡∏ß‡πà‡∏≤ ‡πÄ‡∏Å‡∏¥‡∏î‡∏ä‡∏≤‡∏ï‡∏¥‡∏ô‡∏µ‡πâ‡∏ä‡∏≤‡∏ï‡∏¥‡πÄ‡∏î‡∏µ‡∏¢‡∏ß ‡πÄ‡∏õ‡πá‡∏ô‡∏ó‡∏±‡πâ‡∏á ‡πÄ‡∏´‡∏µ‡πâ‡∏¢ ‡πÄ‡∏õ‡πá‡∏ô‡∏ó‡∏±‡πâ‡∏á ‡∏Ñ‡∏ß‡∏≤‡∏¢ ‡πÄ‡∏•‡∏¢ ‡∏Ñ‡∏∏‡πâ‡∏°‡πÅ‡∏ó‡πâ‡πÜ üòÜ,2,1,1
5,7,898919574413783041,‡πÇ‡∏ß‡πâ‡∏¢‡∏≠‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏¢ ‡∏´‡∏ô‡∏±‡∏Å‡∏Å‡∏ß‡πà‡∏≤‡∏Å‡∏π‡∏Å‡πá‡∏°‡∏∂‡∏á‡∏≠‡∏∞,2,1,1
6,8,899584228056281088,‡∏Ç‡∏≠‡πÇ‡∏ó‡∏© ‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏≠‡∏¢‡∏≤‡∏Å‡πÑ‡∏î‡πâ‡∏Ñ‡∏ß‡∏≤‡∏¢‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏Ñ‡∏ö‡∏Å‡∏±‡∏ö‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏ô‡∏∞\n‡∏û‡∏≠‡∏î‡∏µ‡∏Å‡∏π‡∏Ñ‡∏ô‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà‡∏Ñ‡∏ß‡∏≤‡∏¢ ‚òª,3,0,1
7,9,898919452036653056,‡∏õ‡∏•‡∏±‡πä‡∏Å‡πÑ‡∏ü‡∏≠‡∏¢‡∏π‡πà‡∏û‡∏∑‡πâ‡∏ô ‡πÄ‡∏•‡∏¢‡∏•‡∏á‡∏°‡∏≤‡∏ô‡∏≠‡∏ô‡∏ö‡∏ô‡∏û‡∏∑‡πâ‡∏ô ‡∏≠‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏¢ ‡πÑ‡∏ü‡∏à‡∏∞‡∏î‡∏π‡∏î‡∏Å‡∏π‡∏°‡∏±‡πâ‡∏¢ ‡πÄ‡∏™‡∏µ‡∏¢‡∏ö‡∏ä‡∏≤‡∏£‡πå‡∏à‡πÑ‡∏ü‡∏Å‡πá‡πÅ‡∏•‡∏ö‡∏≠‡∏≠‡∏Å‡∏°‡∏≤ ‡∏≠‡∏µ‡πÄ‡∏´‡∏µ‡πâ‡∏¢‡∏¢‡∏¢,0,3,0
8,11,899583996383776769,‡∏Å‡∏π‡∏°‡∏±‡∏ô‡πÇ‡∏á‡πà‡πÄ‡∏≠‡∏á‡∏ó‡∏µ‡πà‡∏¢‡∏±‡∏á‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏Ñ‡∏≥‡∏™‡∏±‡∏ç‡∏ç‡∏≤‡∏ö‡πâ‡∏≤‡∏ö‡∏≠‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏ô‡∏Ñ‡∏ô‡∏ô‡∏∂‡∏á‡∏≠‡∏¢‡∏π‡πà...\n...‡∏Å‡∏π‡∏ô‡∏µ‡πâ‡∏°‡∏±‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏¢‡∏à‡∏£‡∏¥‡∏á‡πÜ #üòè,0,3,0
9,12,898919256288501762,‡∏Ñ‡∏ß‡∏≤‡∏¢‡∏°‡∏±‡∏ô‡∏≠‡∏¢‡∏π‡πà‡∏ö‡∏ô‡∏î‡∏¥‡∏ô‡πÑ‡∏á ‡∏°‡∏±‡∏ô‡∏ö‡∏¥‡∏ô‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ,0,3,0


In [30]:
tweets_df['Text'] = tweets_df['Text'].str.replace('@([^ ]+)|#([^ ]+)|https.*|[a-zA-Z./0-9]|[^‡∏Å-‡πè\s.]','', regex=True)
tweets_df['Text'] = tweets_df['Text'].str.replace('\n',' ', regex=True)
tweets_df['Text'] = tweets_df['Text'].str.strip()

In [31]:
tweets_df.head(40)

Unnamed: 0.1,Unnamed: 0,tweet_id,Text,toxic_votes,nontoxic_votes,is_toxic
0,0,898576382384418817,‡∏ß‡∏±‡∏ô‡πÜ ‡∏ô‡∏µ‡πà‡∏Ñ‡∏∏‡∏¢‡∏Å‡∏∞‡∏´‡∏°‡∏≤ ‡πÅ‡∏°‡∏ß ‡∏´‡∏°‡∏π ‡πÑ‡∏Å‡πà ‡∏°‡πâ‡∏≤ ‡∏Ñ‡∏ß‡∏≤‡∏¢ ‡∏°‡∏≤‡∏Å‡∏Å‡∏ß‡πà‡∏≤‡∏Ñ‡∏∏‡∏¢‡∏Å‡∏±‡∏ö‡∏Ñ‡∏ô‡πÑ‡∏õ‡∏•‡∏∞,0,3,0
1,2,899587505493692417,‡∏´‡∏•‡πà‡∏≠‡∏°‡∏≤‡∏Å‡∏Å‡∏Å ‡∏´‡∏•‡πà‡∏≠‡∏ß‡∏±‡∏ß‡∏ï‡∏≤‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏¢‡∏•‡πâ‡∏°‡∏Å‡∏±‡∏ô‡πÄ‡∏•‡∏¢‡∏ó‡∏µ‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡∏ß‡∏ß‡∏ß,0,3,0
2,3,898920493763280897,‡∏™‡∏¥‡∏ß‡πÄ‡∏´‡∏µ‡πâ‡∏¢‡πÑ‡∏£‡∏Ç‡∏∂‡πâ‡∏ô‡∏´‡∏•‡∏±‡∏á‡∏´‡∏π ‡πÄ‡∏™‡∏µ‡∏¢‡∏ä‡∏≤‡∏ï‡∏¥‡πÄ‡∏Å‡∏¥‡∏î‡∏°‡∏±‡πâ‡∏¢ ‡πÄ‡∏Å‡∏¥‡∏î‡∏°‡∏≤‡∏°‡∏±‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡πâ‡∏≠‡∏á‡πÇ‡∏î‡∏î‡πÄ‡∏î‡πà‡∏ô‡πÄ‡∏î‡πâ‡∏á‡∏î‡∏∂‡πã‡∏á‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏ä‡πà‡∏ô‡∏Ç‡∏∂‡πâ‡∏ô‡∏ó‡∏µ‡πà‡∏´‡∏ô‡πâ‡∏≤‡πÑ‡∏£‡∏á‡∏µ‡πâ ‡∏≠‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏¢,0,3,0
3,4,896808964326694912,‡∏≠‡πà‡∏∞ ‡∏õ‡πà‡∏ß‡∏¢‡∏Å‡πá‡∏õ‡πà‡∏ß‡∏¢ ‡∏á‡∏≤‡∏ô‡∏à‡πâ‡∏≤‡∏á‡∏Å‡πá‡∏ï‡πâ‡∏≠‡∏á‡∏ó‡∏≥ ‡∏á‡∏≤‡∏ô‡πÄ‡∏£‡∏µ‡∏¢‡∏ô‡∏Å‡πá‡∏ï‡πâ‡∏≠‡∏á‡∏ó‡∏≥ ‡∏ß‡∏¥‡∏ä‡∏≤‡∏î‡∏µ‡πÑ‡∏ã‡∏ô‡πå‡∏ï‡∏±‡∏ß‡∏£‡πâ‡∏≤‡∏¢‡∏Å‡∏±‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏¢‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏Å‡∏π‡πÄ‡∏≠‡∏á,0,3,0
4,6,896808093949976576,‡∏ô‡∏µ‡πà‡∏Å‡πá‡πÄ‡∏û‡∏¥‡πà‡∏á‡∏£‡∏π‡πâ‡∏ß‡πà‡∏≤ ‡πÄ‡∏Å‡∏¥‡∏î‡∏ä‡∏≤‡∏ï‡∏¥‡∏ô‡∏µ‡πâ‡∏ä‡∏≤‡∏ï‡∏¥‡πÄ‡∏î‡∏µ‡∏¢‡∏ß ‡πÄ‡∏õ‡πá‡∏ô‡∏ó‡∏±‡πâ‡∏á ‡πÄ‡∏´‡∏µ‡πâ‡∏¢ ‡πÄ‡∏õ‡πá‡∏ô‡∏ó‡∏±‡πâ‡∏á ‡∏Ñ‡∏ß‡∏≤‡∏¢ ‡πÄ‡∏•‡∏¢ ‡∏Ñ‡∏∏‡πâ‡∏°‡πÅ‡∏ó‡πâ‡πÜ,2,1,1
5,7,898919574413783041,‡πÇ‡∏ß‡πâ‡∏¢‡∏≠‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏¢ ‡∏´‡∏ô‡∏±‡∏Å‡∏Å‡∏ß‡πà‡∏≤‡∏Å‡∏π‡∏Å‡πá‡∏°‡∏∂‡∏á‡∏≠‡∏∞,2,1,1
6,8,899584228056281088,‡∏Ç‡∏≠‡πÇ‡∏ó‡∏© ‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏≠‡∏¢‡∏≤‡∏Å‡πÑ‡∏î‡πâ‡∏Ñ‡∏ß‡∏≤‡∏¢‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏Ñ‡∏ö‡∏Å‡∏±‡∏ö‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏ô‡∏∞ ‡∏û‡∏≠‡∏î‡∏µ‡∏Å‡∏π‡∏Ñ‡∏ô‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà‡∏Ñ‡∏ß‡∏≤‡∏¢,3,0,1
7,9,898919452036653056,‡∏õ‡∏•‡∏±‡πä‡∏Å‡πÑ‡∏ü‡∏≠‡∏¢‡∏π‡πà‡∏û‡∏∑‡πâ‡∏ô ‡πÄ‡∏•‡∏¢‡∏•‡∏á‡∏°‡∏≤‡∏ô‡∏≠‡∏ô‡∏ö‡∏ô‡∏û‡∏∑‡πâ‡∏ô ‡∏≠‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏¢ ‡πÑ‡∏ü‡∏à‡∏∞‡∏î‡∏π‡∏î‡∏Å‡∏π‡∏°‡∏±‡πâ‡∏¢ ‡πÄ‡∏™‡∏µ‡∏¢‡∏ö‡∏ä‡∏≤‡∏£‡πå‡∏à‡πÑ‡∏ü‡∏Å‡πá‡πÅ‡∏•‡∏ö‡∏≠‡∏≠‡∏Å‡∏°‡∏≤ ‡∏≠‡∏µ‡πÄ‡∏´‡∏µ‡πâ‡∏¢‡∏¢‡∏¢,0,3,0
8,11,899583996383776769,‡∏Å‡∏π‡∏°‡∏±‡∏ô‡πÇ‡∏á‡πà‡πÄ‡∏≠‡∏á‡∏ó‡∏µ‡πà‡∏¢‡∏±‡∏á‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏Ñ‡∏≥‡∏™‡∏±‡∏ç‡∏ç‡∏≤‡∏ö‡πâ‡∏≤‡∏ö‡∏≠‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏ô‡∏Ñ‡∏ô‡∏ô‡∏∂‡∏á‡∏≠‡∏¢‡∏π‡πà ‡∏Å‡∏π‡∏ô‡∏µ‡πâ‡∏°‡∏±‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏¢‡∏à‡∏£‡∏¥‡∏á‡πÜ,0,3,0
9,12,898919256288501762,‡∏Ñ‡∏ß‡∏≤‡∏¢‡∏°‡∏±‡∏ô‡∏≠‡∏¢‡∏π‡πà‡∏ö‡∏ô‡∏î‡∏¥‡∏ô‡πÑ‡∏á ‡∏°‡∏±‡∏ô‡∏ö‡∏¥‡∏ô‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ,0,3,0


In [262]:
tweets_df = tweets_df[tweets_df['Text'].str.match('.+[a-zA-Z]') == False]

In [263]:
tweets_df = tweets_df[tweets_df['Text'].str.match('^[‡∏Å-‡πè\s.]+$') == True]

In [264]:
tweets_df.head(50)

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username
0,0,2021-10-30 23:49:15+00:00,1454596289916641281,‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡∏ô‡∏≠‡∏ô ‡πÄ‡∏ß‡∏£‡∏ó‡∏∏‡πÄ‡∏£‡∏®‡∏ó‡∏∏‡∏£‡∏±‡∏á,koyteera
1,1,2021-10-30 23:46:38+00:00,1454595632946049033,‡∏ó‡∏∏‡πÄ‡∏£‡∏®‡∏°‡∏≤‡∏Å ‡∏≠‡∏µ‡∏™‡∏±‡∏î ‡∏£‡∏û.‡πÄ‡∏≠‡∏Å‡∏ä‡∏ô‡πÅ‡∏ö‡πà‡∏á‡πÉ‡∏´‡πâ‡∏õ‡∏£‡∏∞‡∏ä‡∏≤‡∏ä‡∏ô‡∏à‡∏≠‡∏á‡∏Å‡∏µ‡πà‡∏£‡∏û ‡πÅ‡∏•‡πâ‡∏ß‡∏°‡∏∂‡∏á‡∏Å‡πá‡πÄ‡∏™‡∏ô‡∏≠‡∏´‡∏ô‡πâ‡∏≤‡∏°‡∏≤‡πÄ‡∏™‡∏ô‡∏≠‡∏à‡∏≠‡∏á‡∏ï‡∏±‡∏î‡∏´‡∏ô‡πâ‡∏≤‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏≠‡∏∞,meowhunhun94
2,2,2021-10-30 23:45:52+00:00,1454595437923426304,‡∏ô‡πà‡∏≤‡∏Å‡∏•‡∏±‡∏ß ‡∏ó‡∏∏‡πÄ‡∏£‡∏®‡∏°‡∏≤‡∏Å‡∏Å‡∏Å‡∏Å,269zann
3,3,2021-10-30 23:43:40+00:00,1454594883813998593,‡πÇ‡∏´‡πâ‡πÄ‡∏´‡πá‡∏ô‡πÅ‡∏•‡πâ‡∏ß‡∏à‡∏∞‡∏≠‡πâ‡∏ß‡∏Å‡∏Å‡∏ä‡∏±‡πâ‡∏ô‡πÑ‡∏°‡πà‡∏≠‡∏¢‡∏≤‡∏Å‡πÉ‡∏´‡πâ‡∏û‡∏¥‡πÄ‡∏à‡∏ô‡∏°‡∏≤‡πÄ‡∏´‡πá‡∏ô‡∏≠‡∏∞‡πÑ‡∏£‡∏ó‡∏∏‡πÄ‡∏£‡∏®‡πÜ‡πÅ‡∏ö‡∏ö‡∏ô‡∏µ‡πâ‡∏≠‡πà‡∏∞,krysxpp2
4,4,2021-10-30 23:35:22+00:00,1454592797529440256,‡∏ó‡∏∏‡πÄ‡∏£‡∏®,bpsms_
6,6,2021-10-30 23:16:05+00:00,1454587943738228737,‡∏ó‡∏∏‡πÄ‡∏£‡∏®‡∏ï‡∏±‡πâ‡∏á‡πÅ‡∏ï‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏à‡πà‡∏≤‡∏¢‡∏ï‡∏±‡∏á‡∏â‡∏µ‡∏î‡∏ß‡∏±‡∏Ñ‡∏ã‡∏µ‡∏ô‡∏•‡∏∞‡∏≠‡∏´ ‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡∏´‡∏±‡∏ß‡∏Ñ‡∏¢,MaDMSU_603
7,7,2021-10-30 23:13:36+00:00,1454587317486718977,‡πÑ‡∏ó‡∏°‡πå‡πÇ‡∏ã‡∏ô‡∏ó‡∏∏‡πÄ‡∏£‡∏®,yinhasdin
8,8,2021-10-30 23:11:15+00:00,1454586727595532290,‡∏ó‡∏∏‡πÄ‡∏£‡∏®‡∏°‡∏≤‡∏Å‡∏≠‡∏µ‡∏™‡∏±‡∏™,na_jaemin_10
9,9,2021-10-30 23:06:02+00:00,1454585415239823363,‡∏ó‡∏∏‡πÄ‡∏£‡∏®‡πÄ‡∏Å‡∏¥‡∏ô‡πÑ‡∏õ‡∏õ‡πà‡∏∞‡∏Ñ‡∏∞,khimTF
13,13,2021-10-30 22:39:41+00:00,1454578781855748098,‡∏ó‡∏∏‡πÄ‡∏£‡∏®‡∏°‡∏≤‡∏Å‡∏Å‡∏Å ‡∏Ñ‡∏¥‡∏î‡πÑ‡∏î‡πâ‡πÑ‡∏á‡∏Å‡πà‡∏≠‡∏ô ‡∏≠‡∏¥‡∏ß‡∏≤‡∏¢‡∏à‡∏µ‡∏≠‡∏¢‡πà‡∏≤‡∏õ‡∏•‡πà‡∏≠‡∏¢‡πÄ‡∏ö‡∏•‡∏≠‡∏ô‡∏∞‡∏≠‡∏¥‡∏™‡∏±‡∏™‡∏°‡∏∂‡∏á‡∏ï‡πâ‡∏≠‡∏á‡πÄ‡∏≠‡∏≤‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡πÉ‡∏´‡πâ‡∏ñ‡∏∂‡∏á‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î,poohhsiri


In [265]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2796 entries, 0 to 4645
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2796 non-null   int64 
 1   Datetime    2796 non-null   object
 2   Tweet Id    2796 non-null   int64 
 3   Text        2796 non-null   object
 4   Username    2796 non-null   object
dtypes: int64(2), object(3)
memory usage: 131.1+ KB


In [32]:
tweets_df.to_csv('Thai_Toxicity_Tweet_Corpus_clean.csv', encoding='utf-8-sig')