In [1]:
import pandas as pd
import re
import string

In [2]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.11.11


In [3]:
# Write the output of `pip freeze` (installed packages and their versions) to a
# requirements file under /kaggle/working.
!pip freeze > '/kaggle/working/requirements.txt'

In [4]:
# Load the raw summarization set and remember its row count before any filtering.
df = pd.read_csv('/kaggle/input/nepali-summarization-set-raw/summarization_set_raw.csv')
original_dataset = df.shape[0]

# Drop rows missing either text field, then drop exact (title, news) duplicates.
df = (
    df
    .dropna(subset=['news', 'title'])
    .drop_duplicates(subset=['title', 'news'])
)

In [5]:
# Character-level rewrites done in a single pass with str.translate:
# line breaks/tabs become spaces and curly quotes become straight quotes.
_WHITESPACE_AND_QUOTES = str.maketrans({
    "\n": " ",
    "\r": " ",
    "\t": " ",
    "“": '"',
    "”": '"',
    "‘": "'",
    "’": "'",
})

# ASCII digits -> Devanagari digits.
_ASCII_TO_DEVANAGARI_DIGITS = str.maketrans("0123456789", "०१२३४५६७८९")

# Emoji / pictograph / symbol ranges to strip. Several ranges overlap; the
# character class is kept exactly as it was so the set of removed code points
# does not change.
# NOTE(review): "\u200d" (zero width joiner) is stripped too; confirm this is
# acceptable for Devanagari text that relies on it.
_EMOJI_RE = re.compile(
    pattern="["
    "\U0001f600-\U0001f64f"  # emoticons
    "\U0001f300-\U0001f5ff"  # symbols & pictographs
    "\U0001f680-\U0001f6ff"  # transport & map symbols
    "\U0001f1e0-\U0001f1ff"  # regional indicators (flags)
    "\U00002500-\U00002bef"  # box drawing .. misc symbols and arrows
    "\U00002702-\U000027b0"  # dingbats
    "\U000024c2-\U0001f251"  # broad catch-all range (also covers CJK)
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2b55"
    "\u200d"  # zero width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"  # wavy dash
    "]+",
    flags=re.UNICODE,
)
_HTML_TAG_RE = re.compile(r'<.*?>')
_EMAIL_RE = re.compile(r'([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})')
_LATIN_LETTER_RE = re.compile('[a-zA-Z]')


def perform_preprocessing(text):
    """Clean one raw text string and return the normalized result.

    Steps, in order:
      1. Replace newlines, carriage returns and tabs with spaces, and curly
         quotes with straight quotes.
      2. Remove emoji / symbol characters.
      3. Remove HTML tags (anything between ``<`` and the next ``>``).
      4. Remove e-mail addresses.
      5. Convert ASCII digits to Devanagari digits.
      6. Remove Latin letters ``a-z`` / ``A-Z``.
      7. Collapse runs of whitespace to one space and strip both ends.

    E-mail removal runs *before* digit conversion on purpose: the e-mail
    pattern matches ASCII digits only, so converting first would leave
    addresses such as ``abc123@gmail.com`` in the text.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; it may be empty.
    """
    text = text.translate(_WHITESPACE_AND_QUOTES)
    text = _EMOJI_RE.sub("", text)
    text = _HTML_TAG_RE.sub("", text)
    # Must happen while digits are still ASCII (see docstring).
    text = _EMAIL_RE.sub("", text)
    text = text.translate(_ASCII_TO_DEVANAGARI_DIGITS)
    text = _LATIN_LETTER_RE.sub("", text)

    # split()/join collapses every whitespace run and trims the ends.
    return " ".join(text.split())

In [6]:
# Quick check on a sample that mixes Nepali text with an HTML tag, an e-mail
# address and Latin letters.
perform_preprocessing("निखिल उप्रेतीको भैरव फिल्म'ले अमेरिकामा<html>sak@ghmail.comw ah.")

"निखिल उप्रेतीको भैरव फिल्म'ले अमेरिकामा ."

In [7]:
# Run the cleaning routine over both text columns; the raw columns are kept as-is.
df['title_cleaned'] = df['title'].map(perform_preprocessing)
df['news_cleaned'] = df['news'].map(perform_preprocessing)

In [8]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


# Word counts of the cleaned text, used below for length-based filtering.
df['title_length'] = df['title_cleaned'].map(_count_words)
df['news_length'] = df['news_cleaned'].map(_count_words)

In [9]:
# Summary statistics of the word counts: news first, then titles.
tuple(df[length_column].describe() for length_column in ('news_length', 'title_length'))

(count    365798.000000
 mean        238.887512
 std         207.697606
 min           0.000000
 25%         117.000000
 50%         177.000000
 75%         287.000000
 max       11931.000000
 Name: news_length, dtype: float64,
 count    365798.000000
 mean          7.449270
 std           2.660058
 min           0.000000
 25%           5.000000
 50%           7.000000
 75%           9.000000
 max          22.000000
 Name: title_length, dtype: float64)

In [16]:
# Total rows, followed by how many rows each length rule would remove.
(
    len(df),
    int((df['news_length'] <= 30).sum()),
    int((df['title_length'] <= 4).sum()),
    int((df['title_length'] > 15).sum()),
)

(365798, 513, 47961, 1300)

In [17]:
# Length thresholds, in whitespace-separated words of the cleaned text.
NEWS_MIN_WORDS_EXCLUSIVE = 30   # news must have MORE than this many words
TITLE_MIN_WORDS_EXCLUSIVE = 4   # titles must have MORE than this many words
TITLE_MAX_WORDS = 15            # titles may have AT MOST this many words

# Remove very short news (<= 30 words) and titles that are too short
# (<= 4 words) or too long (> 15 words). One combined mask keeps exactly the
# rows that pass all three conditions.
df = df[
    (df['news_length'] > NEWS_MIN_WORDS_EXCLUSIVE)
    & (df['title_length'] > TITLE_MIN_WORDS_EXCLUSIVE)
    & (df['title_length'] <= TITLE_MAX_WORDS)
]

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 10% of the rows (stratified by category), then split that holdout
# 75/25 into test and validation sets, again stratified by category.
train, holdout = train_test_split(
    df,
    test_size=0.10,
    shuffle=True,
    stratify=df['category'],
    random_state=22,
)
test, val = train_test_split(
    holdout,
    test_size=0.25,
    shuffle=True,
    stratify=holdout['category'],
    random_state=29,
)

In [20]:
# Row counts of the train, test and validation splits, in that order.
tuple(map(len, (train, test, val)))

(284562, 23714, 7905)

In [21]:
# Show the first three rows of the filtered frame, including the cleaned text
# and word-count columns added above.
df.head(3)

Unnamed: 0,title,news,category,published_date,title_cleaned,news_cleaned,title_length,news_length
0,निखिल उप्रेतीको भैरव फिल्मले अमेरिकामा रहेका न...,सनफ्रान्सिस्को-अमेरिकाका नेपालीको प्रतिक्रिया ...,मनोरञ्जन,"विहीबार, २८ माघ २०७२, ११ : ०५",निखिल उप्रेतीको भैरव फिल्मले अमेरिकामा रहेका न...,सनफ्रान्सिस्को-अमेरिकाका नेपालीको प्रतिक्रिया ...,9,281
1,सुशील कोइरालाको निधनपछि चौरासी बाले खोले यस्ता...,झण्डै ४ बर्षअघि सुशील कोइरालाले प्रधानमन्त्रीक...,मनोरञ्जन,"बुधबार, २७ माघ २०७२, १५ : २५",सुशील कोइरालाको निधनपछि चौरासी बाले खोले यस्ता...,झण्डै ४ बर्षअघि सुशील कोइरालाले प्रधानमन्त्रीक...,8,444
2,लिटल प्रिन्स एण्ड प्रिन्सेसको ग्रान्ड फिनाले,सुरुङ– ग्ल्यामरस नेपालले सुरुङ्गामा लिटल प्रिन...,मनोरञ्जन,"बुधबार, २७ माघ २०७२, १० : ५४",लिटल प्रिन्स एण्ड प्रिन्सेसको ग्रान्ड फिनाले,सुरुङ– ग्ल्यामरस नेपालले सुरुङ्गामा लिटल प्रिन...,6,137


In [22]:
# Compare the raw row count with the number of rows that ended up in the splits.
cleaned_total = sum(len(split_frame) for split_frame in (train, test, val))
print(f'Original dataset size: {original_dataset:,}')
print(f'After cleaning: {cleaned_total:,}')

Original dataset size: 371,640
After cleaning: 316,181


In [23]:
# Write each split to its own CSV, keeping only the cleaned text columns and
# exposing them under the plain names `news` and `title`.
split_outputs = {
    'llm_summarization_set_cleaned_train.csv': train,
    'llm_summarization_set_cleaned_test.csv': test,
    'llm_summarization_set_cleaned_val.csv': val,
}
exported_names = {'news_cleaned': 'news', 'title_cleaned': 'title'}

for csv_path, split_frame in split_outputs.items():
    (
        split_frame[['news_cleaned', 'title_cleaned']]
        .rename(columns=exported_names)
        .reset_index(drop=True)
        .to_csv(csv_path, index=None)
    )