In [82]:
import json
import re
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [83]:
# Read each news source's text CSV (comma-separated) into its own DataFrame.
src_1_df = pd.read_csv("./Gujarat_samachar/gujaratsamachar_text1.csv", sep=",")
src_2_df = pd.read_csv("./IndianExpress/indianexpress_text.csv", sep=",")
src_3_df = pd.read_csv("./Oneindia/oneindia_text.csv", sep=",")
# Source 4 (Corpus) is currently disabled:
# src_4_df = pd.read_csv("./Corpus/Text_data.csv", sep=",")
src_5_df = pd.read_csv("./Westerntimes_news/westerntimesnews_text.csv", sep=",")

In [84]:
def detect_personal_info(text):
    """Report whether ``text`` contains something shaped like a phone number or e-mail.

    Args:
        text: String to scan.

    Returns:
        bool: True if the phone pattern or the e-mail pattern matches anywhere
        in ``text``, otherwise False.
    """
    # Optional country code, optional (possibly parenthesised) area code, then two
    # groups of 3-4 digits; '-', '.' or whitespace may separate the groups.
    # Note that a bare run of 6-8 digits also satisfies this pattern.
    phone_pattern = r'\b(\+?\d{1,3}[-.\s]?)?(\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}\b'
    # The TLD class is plain [A-Za-z]: inside a character class '|' is a literal
    # pipe, not alternation, so it must not appear there.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'

    # Only existence matters, so stop at the first hit instead of collecting all matches.
    return bool(re.search(phone_pattern, text) or re.search(email_pattern, text))
    
def detect_gujarati_slangs(text):
    """Report whether ``text`` contains any listed Gujarati slang term as a whole word.

    Python's ``\\b`` is defined in terms of ``\\w``, and ``\\w`` does not match
    Gujarati vowel signs or the anusvara (they are combining marks, not
    alphanumerics). With ``\\b`` a term ending in such a sign is never matched
    when followed by a space or end of text, and a term is matched when it is
    merely the prefix of a longer word. Explicit lookarounds that treat the
    whole Gujarati block as word characters avoid both problems.

    Args:
        text: String to scan.

    Returns:
        bool: True if at least one term occurs as a standalone word, else False.
    """
    # NOTE(review): "જણાતું" looks like an ordinary word rather than profanity;
    # now that whole-word matching works it will be flagged - confirm it belongs here.
    slangs = ["માધારછોડ", "ગાંડુ", "રંડી", "બેટીચોડ", "ભોસડીકે", "ચુતીયા", "જણાતું", "ચૂત", "લૉડ"]

    # Anything \w matches, plus every code point in the Gujarati block
    # (letters, vowel signs, anusvara, virama, digits).
    word_char = r'[\w\u0A80-\u0AFF]'
    alternatives = '|'.join(map(re.escape, slangs))
    slang_pattern = r'(?<!' + word_char + r')(?:' + alternatives + r')(?!' + word_char + r')'

    return re.search(slang_pattern, text) is not None
    
def remove_special_char(text):
    """Delete ASCII punctuation/symbol characters from ``text``.

    The removed set is: backslash, ``/ ! @ # $ % ^ & * ( ) , . - ? " : { } | < >``.
    The hyphen is escaped inside the character class; left bare between ``,``
    and ``?`` it forms a range (U+002C-U+003F) that also deletes the digits
    0-9, ``;`` and ``=``.

    Args:
        text: String to clean.

    Returns:
        str: ``text`` with every listed character removed; all other
        characters (including digits) are left untouched.
    """
    return re.sub(r'[\\/!@#$%^&*(),.\-?":{}|<>]', '', text)

def remove_eng_char(text):
    """Return ``text`` with every ASCII Latin letter (A-Z, a-z) removed."""
    latin_letter = re.compile(r'[A-Za-z]')
    return latin_letter.sub('', text)

def remove_space(text):
    """Collapse each whitespace run in ``text`` to one space and trim both ends."""
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()

def guj_eng_ratio(text):
    """Compute the ratio of ASCII Latin letters to Gujarati-block characters.

    Args:
        text: String to measure.

    Returns:
        float | None: (count of A-Z/a-z) / (count of U+0A80-U+0AFF characters),
        or None when ``text`` contains no Gujarati-block character.
    """
    latin_total = sum(1 for _ in re.finditer(r'[A-Za-z]', text))
    gujarati_total = sum(1 for _ in re.finditer(r'[\u0A80-\u0AFF]', text))

    # No denominator: signal "undefined" rather than dividing by zero.
    if gujarati_total == 0:
        return None
    return latin_total / gujarati_total

def gujarati_special_ratio(text):
    """Compute the ratio of special characters to Gujarati-block characters.

    Args:
        text: String to measure.

    Returns:
        float | None: (count of characters matched by the special-character
        class) / (count of U+0A80-U+0AFF characters), or None when ``text``
        contains no Gujarati-block character.
    """
    gujarati_total = len(re.findall(r'[\u0A80-\u0AFF]', text))
    special_total = len(re.findall(r'[\\//!@#$%^&*(),.?":{}|<>]', text))

    # Undefined without any Gujarati characters to divide by.
    return special_total / gujarati_total if gujarati_total > 0 else None

In [86]:
import pandas as pd
import numpy as np

def preprocess_text(data_frame, batch_size=1000):
    """Filter and clean the ``Text`` column of ``data_frame`` in batches.

    Each ``Text`` value (missing values are treated as '') is split on
    newlines. A line is dropped when either ratio helper returns None, when
    the slang or personal-info detector fires, or when the raw line is shorter
    than 10 characters. Surviving lines have Latin letters and special
    characters removed, whitespace collapsed, and are re-joined with newlines.
    Rows whose cleaned text is blank are removed.

    Args:
        data_frame: DataFrame with a ``Text`` column. It is not modified; each
            batch is processed on a copy.
        batch_size: Number of rows per batch; must be at least 1.

    Returns:
        pandas.DataFrame: The surviving rows with a fresh 0..n-1 index. An
        empty input yields an empty frame with the same columns.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    def process_sentence(sent):
        # The ratio values are only checked for None here, not thresholded.
        if guj_eng_ratio(sent) is None or gujarati_special_ratio(sent) is None:
            return None
        if detect_gujarati_slangs(sent) or detect_personal_info(sent):
            return None
        if len(sent) < 10:
            return None

        # Collapse whitespace last: stripping letters/punctuation first would
        # otherwise leave double or leading spaces behind in the result.
        return remove_space(remove_special_char(remove_eng_char(sent)))

    def clean_paragraph(lines):
        # filter(None, ...) discards rejected lines (None) and lines cleaned to ''.
        return '\n'.join(filter(None, (process_sentence(line) for line in lines)))

    def process_batch(batch):
        batch['Text'] = batch['Text'].fillna('').str.split("\n").apply(clean_paragraph)
        # Drop rows that have no usable text left.
        return batch[batch['Text'].str.strip() != '']

    n = len(data_frame)
    if n == 0:
        # pd.concat raises ValueError on an empty list, so return early.
        return data_frame.iloc[0:0].reset_index(drop=True)

    results = [
        process_batch(data_frame.iloc[start:start + batch_size].copy())
        for start in range(0, n, batch_size)
    ]
    return pd.concat(results, ignore_index=True)

In [87]:
# Run every loaded source through the same cleaning pipeline, 5000 rows per batch.
# src_4_df is currently disabled, so it is not processed.
src_1_df, src_2_df, src_3_df, src_5_df = (
    preprocess_text(frame, batch_size=5000)
    for frame in (src_1_df, src_2_df, src_3_df, src_5_df)
)

In [90]:
# Write each processed frame to CSV without the index column.
# NOTE: these are the same paths the frames were read from, so the input
# files are overwritten with the cleaned text.
src_1_df.to_csv(path_or_buf="./Gujarat_samachar/gujaratsamachar_text1.csv", index=False)
src_2_df.to_csv(path_or_buf="./IndianExpress/indianexpress_text.csv", index=False)
src_3_df.to_csv(path_or_buf="./Oneindia/oneindia_text.csv", index=False)
# Source 4 (Corpus) is currently disabled:
# src_4_df.to_csv(path_or_buf="./Corpus/corpus_text.csv", index=False)
src_5_df.to_csv(path_or_buf="./Westerntimes_news/westerntimesnews_text.csv", index=False)