In [57]:
import pandas as pd
import json
import glob
from datetime import datetime
import re


In [58]:
files = glob.glob("sample_data/*.json")
data = []
for file in files:
    with open(file) as f:
        try:
            doc = json.load(f)
            doc['file_name'] = file
            data.append(doc)
        except Exception as e:
            print('Invalid JSON file format - %s'%file)

df = pd.DataFrame(data, dtype='str')

In [59]:
df.head()

Unnamed: 0,title,content,url,date,category,file_name
0,Police probing two suspicious deaths in Uragas...,COLOMBO (News 1st); Sri Lanka Police launched ...,https://www.newsfirst.lk/2021/10/22/police-pro...,2021-10-22,crime,sample_data/3_en.json
1,"No fuel shortage, long queues due to rumours: ...",There is no shortage of fuel in the country an...,https://www.dailymirror.lk/latest_news/No-fuel...,2021-10-21,,sample_data/10_en.json
2,16 மற்றும் 17 வயதான பாடசாலை மாணவர்களுக்கு தடுப...,Colombo (News 1st) 16 மற்றும் 17 வயதான பாடசாலை...,https://www.newsfirst.lk/tamil/2021/10/22/16-%...,202110-22,"health, covid",sample_data/2_ta.json
3,"CID ය සිය WhatsApp ඇමතුම්වලට සවන් දී ඇතැයි, පා...",අධිකරණයේ අවසරයකින් තොරව අපරාධ පරීක්ෂණ දෙපාර්තම...,https://www.newsfirst.lk/sinhala/2021/10/21/ci...,2021-10-15,,sample_data/7_si.json
4,Mahela to leave SL team after today’s game,Former national cricket captain Mahela Jayawar...,https://www.dailymirror.lk/breaking_news/Mahel...,2021-10-22,sports,sample_data/2_en.json


In [60]:
nan_file_name = df[pd.isna(df['file_name'])]
if len(nan_file_name.index):
    print('File name should not be empty/null')

In [61]:
def is_valid_column(content):
    if isinstance(content, float):
        return False
    if isinstance(content, str) and not content or content.isspace():
        return False
    

In [62]:
valid_categories = ['general', 'crime', 'political', 'business', 'economic', 'sports', 'arts', 'entertainment', 'education', 'tech', 'auto', 'legal', 'lifestyle', 'health', 'covid', 'weather']

In [63]:
invalid_title = []
invalid_content = []
invalid_url = []
invalid_date = []
invalid_category = []
for index, row in df.iterrows():
    if is_valid_column(row['title']) == False:
        invalid_title.append(row['file_name'])
    if is_valid_column(row['content']) == False:
        invalid_content.append(row['file_name'])
    if is_valid_column(row['url']) == False:
        invalid_url.append(row['file_name'])
        
    try:
        if is_valid_column(row['date'])  == False:
            invalid_date.append(row['file_name'])
        else:
            datetime_object = datetime.strptime(row['date'], '%Y-%m-%d')
    except Exception as e:
        print(e, row['file_name'])
        invalid_date.append(row['file_name'])
    
    if is_valid_column(row['category']) == False:
        invalid_category.append(row['file_name'])
    else:
        categories = re.sub(r"\s+", "", row['category'], flags=re.UNICODE)
        categories_arr = categories.split(',')
        cat_check = []
        for category in categories_arr:
            if category not in valid_categories:
                print('invalid category - %s -> %s'%(category, row['file_name']))
                cat_check.append(category)
        if cat_check:
            invalid_category.append(row['file_name'])

duplicate_title = df[df.duplicated('title')]
duplicate_content = df[df.duplicated('content')]

time data '202110-22' does not match format '%Y-%m-%d' sample_data/2_ta.json
time data 'YYYY-MM-DD' does not match format '%Y-%m-%d' sample_data/6_si.json


In [64]:
print('<-------- Validation results ---------->\n')
is_invalid_found = False
if invalid_title:
    is_invalid_found = True
    print('Invalid title field ')
    print(invalid_title)
    print('\n')
if invalid_content:
    is_invalid_found = True
    print('Invalid content field ')
    print(invalid_content)
    print('\n')

if invalid_url:
    is_invalid_found = True
    print('Invalid url field ')
    print(invalid_url)
    print('\n')

if invalid_date:
    is_invalid_found = True
    print('Invalid date field ')
    print(invalid_date)
    print('\n')

if invalid_category:
    is_invalid_found = True
    print('Invalid category field ')
    print(invalid_category)
    print('\n')

if len(duplicate_title.index):
    is_invalid_found = True
    print('Duplicate title found')
    print('\n')
if len(duplicate_content.index):
    is_invalid_found = True
    print('Duplicate content found')
    print('\n')

if is_invalid_found == False:
    print('Good job! No validation error found')

<-------- Validation results ---------->

Invalid date field 
['sample_data/2_ta.json', 'sample_data/6_si.json']


Invalid category field 
['sample_data/10_en.json', 'sample_data/7_si.json', 'sample_data/6_si.json', 'sample_data/10_ta.json', 'sample_data/9_en.json', 'sample_data/8_ta.json', 'sample_data/4_si.json', 'sample_data/5_si.json', 'sample_data/8_en.json', 'sample_data/9_ta.json', 'sample_data/9_si.json', 'sample_data/5_ta.json', 'sample_data/4_en.json', 'sample_data/4_ta.json', 'sample_data/5_en.json', 'sample_data/8_si.json', 'sample_data/6_ta.json', 'sample_data/10_si.json', 'sample_data/3_si.json', 'sample_data/7_en.json', 'sample_data/7_ta.json', 'sample_data/6_en.json']


