# Data Cleaning & Split Date

In [45]:
import pandas as pd
from bs4 import BeautifulSoup as bs
from datetime import datetime,timedelta
from pytz import timezone
import os

In [46]:
# Names of the news agents to process. Each name is used both as the raw
# input file name ('../assets/Data/<name>.csv') and as the per-agent output
# sub-folder name.
all_agent = [
    'ThaiPublica',
    'PracharChat',
    'Sanook',
    'Manager',
    'NaewNa',
    'Matichon',
    'VoaThai',
    'BangkokToday',
    'Infoquest',
    'PrachaTai',
    'SiamBlockchain',
    'PostToday',
    'ThaiRath'
]

# Unassigned string literal used as a note; it has no runtime effect.
# NOTE(review): presumably these are agents deliberately left out of
# `all_agent` (e.g. unsupported feed formats) — confirm with the author.
'''
Thaiware
ThaiPBS
Mono29
Mcot
KhaoSod
Investing
Goethe
'''

# Root folder holding the cleaned per-agent CSVs; passed to splitDate() as
# the input location.
clean_path = '../assets/CleanedData'

In [47]:
def createFolder(directory):
    """Create *directory* (including missing parents) if it does not exist.

    Best-effort helper: a failure is reported on stdout instead of being
    raised, so callers are not interrupted.

    Args:
        directory: Path of the folder to create.
    """
    try:
        # exist_ok=True makes the call idempotent without a separate
        # os.path.exists() check, which is racy (the folder could appear
        # between the check and the creation).
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print ('Error: Creating directory. ' +  directory)

# Cleaned Data

In [48]:
def html_parser(news):
    """Return the plain text of *news* with whitespace normalised.

    The value is stringified, parsed with BeautifulSoup's 'html.parser'
    backend to drop the markup, and every run of whitespace (including
    newlines) is collapsed to a single space.
    """
    soup = bs(str(news), 'html.parser')
    words = soup.get_text().split()
    return ' '.join(words)

In [49]:
def local_time(time):
    date = []
    if time[-1] == 'T':
        date = datetime.strptime(time,'%a, %d %b %Y %H:%M:%S %Z')
        date = date.astimezone(timezone('Asia/Bangkok'))
    elif time[-3] == ':' and time[-6] == ' ':
        date = datetime.strptime(time,'%B %d, %Y %H:%M')
    elif time[-3] == ':':
        date = datetime.strptime(time,'%Y-%m-%d %H:%M:%S')
    else:
        date = datetime.strptime(time,'%a, %d %b %Y %H:%M:%S %z')
        date = date.astimezone(timezone('Asia/Bangkok'))
    return date.strftime('%Y_%m_%d')

In [50]:
def clean_csv(df_news):
    """Return a cleaned copy of a raw news DataFrame.

    Steps applied:
      1. ``summary`` is passed through ``html_parser``.
      2. ``published`` is passed through ``local_time``.
      3. Rows with a duplicate ``summary`` are dropped (first one kept).
      4. The index is rebuilt as 0..n-1.

    Args:
        df_news: DataFrame with at least ``summary`` and ``published``
            columns. It is not modified.

    Returns:
        A new DataFrame with the cleaned rows.
    """
    # Work on a copy: assigning the columns on the caller's frame would
    # rewrite it in place, and a second call on the same frame would then
    # feed already-converted dates back into local_time.
    cleaned = df_news.copy()
    cleaned['summary'] = cleaned['summary'].apply(html_parser)
    cleaned['published'] = cleaned['published'].apply(local_time)
    cleaned = cleaned.drop_duplicates(subset=['summary'])
    return cleaned.reset_index(drop=True)

In [51]:
def split_time(df_news, agent, out_dir='../assets/CleanedData'):
    """Write one CSV per distinct ``published`` value of *df_news*.

    Files are written to ``<out_dir>/<agent>/<published>.csv`` without the
    index column. Rows keep their original relative order. The agent folder
    is created if missing, even when *df_news* has no rows.

    Args:
        df_news: DataFrame containing a ``published`` column.
        agent: Agent name, used as the sub-folder name.
        out_dir: Root output folder. Defaults to the cleaned-data folder.

    Note:
        Rows whose ``published`` value is missing (NaN) are not written,
        because ``groupby`` drops missing keys by default.
    """
    agent_dir = out_dir + '/' + agent
    # Create the folder once, not once per date.
    os.makedirs(agent_dir, exist_ok=True)

    # A single groupby pass replaces one full-frame boolean scan per date;
    # sort=False keeps the dates in order of first appearance.
    for day, rows in df_news.groupby('published', sort=False):
        rows.to_csv(agent_dir + '/' + str(day) + '.csv', index=False)

In [52]:
def to_csv(agent):
    """Clean and split the raw CSV of every agent name in *agent*.

    For each name, '../assets/Data/<name>.csv' is loaded, cleaned with
    clean_csv() and written out per date with split_time(). A progress
    line is printed before each agent and a separator line after it.
    """
    for name in agent:
        print('Process : ' + name)
        source_file = '../assets/Data/' + name + '.csv'
        cleaned = clean_csv(pd.read_csv(source_file))
        split_time(cleaned, name)
        print('==========')

In [53]:
# Run the clean-and-split step for every agent listed in `all_agent`.
to_csv(all_agent)

Process : ThaiPublica
Process : PracharChat
Process : Sanook
Process : Manager
Process : NaewNa
Process : Matichon
Process : VoaThai
Process : BangkokToday
Process : Infoquest
Process : PrachaTai
Process : SiamBlockchain
Process : PostToday
Process : ThaiRath


# Split Date

In [54]:
def splitDate(data_path, news_agent, out_path='../assets/SplitData'):
    """Merge the per-agent, per-date CSVs into one CSV per date.

    For every agent folder ``<data_path>/<agent>``, each ``<date>.csv`` is
    appended to ``<out_path>/<date>.csv``, which is created on first use.

    Args:
        data_path: Root folder containing one sub-folder per agent.
        news_agent: Iterable of agent (sub-folder) names to merge.
        out_path: Folder receiving the merged per-date files.

    Note:
        Rows are appended to whatever is already in ``out_path``, so running
        this twice over the same input duplicates the rows.
    """
    # Column order used when a per-date file is first created.
    base_columns = ['title', 'summary', 'link', 'published']
    os.makedirs(out_path, exist_ok=True)

    ## Loop over every news agent
    for agent in news_agent:
        agent_dir = data_path + '/' + agent
        for f in os.listdir(agent_dir):
            # Skip stray entries (checkpoint folders, OS metadata files).
            if not f.endswith('.csv'):
                continue

            incoming = pd.read_csv(agent_dir + '/' + f)
            target = out_path + '/' + f

            if os.path.isfile(target):
                ## Concat onto the rows already collected for this date.
                # DataFrame.append was removed in pandas 2.0; pd.concat is
                # the supported equivalent.
                merged = pd.concat([pd.read_csv(target), incoming])
            else:
                ## First file for this date: base columns first, then any
                ## extra columns the agent provides.
                extras = [c for c in incoming.columns
                          if c not in base_columns]
                merged = incoming.reindex(columns=base_columns + extras)

            merged.to_csv(target, index=False)

            

In [55]:
# Merge the cleaned per-agent files under `clean_path` into per-date files.
splitDate(clean_path,all_agent)