In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ankit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# df_train['length'] = df_train['text'].apply(lambda x: len(x))
# df_test['length'] = df_test['text'].apply(lambda x: len(x))
# df_train = df_train[df_train['length']<=256]
# df_test = df_test[df_test['length']<=256]
# df_train.drop(columns=['length'], inplace=True,axis=1)
# df_test.drop(columns=['length'], inplace=True,axis=1)

#### Function to remove punctuation and other special characters, including repeated whitespace

In [5]:
def clean_text(text):
    """Replace punctuation and the danda with spaces, then normalise whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each run of the listed ASCII special characters and each
        '।' replaced by a space, whitespace runs collapsed to a single space,
        and leading/trailing whitespace removed.
    """
    space_out = (
        r'[+!@#$%^&*(),.?":{}|<>~`;/\\[\]\'|-]+',  # runs of ASCII special characters
        r'।',                                      # Devanagari danda
        r'\s+',                                    # any run of whitespace
    )
    for pattern in space_out:
        text = re.sub(pattern, ' ', text)
    return text.strip()

#### Deal with stopwords

In [6]:
# Start from NLTK's Nepali stopword list.
stopwords_list = set(nltk.corpus.stopwords.words('nepali'))
import codecs  # no longer used below; kept in case other cells reference it

# Extend it with the extra stopwords file (one word per line).
# 'utf-8-sig' reads plain UTF-8 as well as files that start with a BOM, so a
# BOM can never end up glued to the first word. Blank lines are skipped so
# that no empty-string entry is added.
with open('nepali_stopwords.txt', 'r', encoding='utf-8-sig') as file:
    extra_stop_words = [word for word in (line.strip() for line in file) if word]
stopwords_list.update(extra_stop_words)

# Keep 'राम्रो' in the text (presumably because it is meaningful for the
# downstream task - confirm). discard() does not raise if the word is absent
# from both sources, unlike remove().
stopwords_list.discard('राम्रो')


In [7]:
def remove_stop_words(text):
    """Lower-case ``text`` and drop every token found in ``stopwords_list``.

    Args:
        text: The string to filter; it is split on whitespace.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in text.lower().split():
        if token in stopwords_list:
            continue
        kept.append(token)
    return ' '.join(kept)

In [8]:
import demoji
# Function to replace emojis with descriptions and add spaces
def replace_emojis_with_space(text):
    """Swap emojis for their textual descriptions, using '+' as the separator.

    ``demoji.replace_with_desc`` is called with ``sep="+"``; afterwards every
    '-' and ':' in the result is turned into a space.

    Args:
        text: The string that may contain emojis.

    Returns:
        The rewritten string.
    """
    described = demoji.replace_with_desc(text, sep="+")
    for unwanted in ('-', ':'):
        described = described.replace(unwanted, ' ')
    return described

#### Helper Functions

In [11]:
def find_words_between_plus(text):
    """Return the text enclosed by each non-overlapping pair of '+' signs.

    Matching is non-greedy, so each result is the shortest span between one
    '+' and the next.
    """
    return [match.group(1) for match in re.finditer(r'\+(.*?)\+', text)]
def contains_english_letters(text):
    """Return True when ``text`` has at least one ASCII letter (a-z or A-Z)."""
    first_letter = re.search(r'[a-zA-Z]', text)
    return first_letter is not None
def find_english_words(text):
    """Return every maximal run of ASCII letters and/or '+' characters in ``text``.

    '+' is part of the character class, so '+' markers attached to a word stay
    attached to it in the result.
    """
    return [match.group(0) for match in re.finditer(r'[a-zA-Z\+]+', text)]

In [12]:
from deep_translator import GoogleTranslator

def translate_english_to_nepali(batch):
    """Translate a list of English strings to Nepali with GoogleTranslator.

    Args:
        batch: List of English strings.

    Returns:
        The result of ``translate_batch`` for ``batch``, or an empty list when
        ``batch`` is empty.
    """
    # deep_translator's translate_batch raises on an empty list; there is
    # nothing to translate in that case, so skip the call entirely.
    if len(batch) == 0:
        return []
    return GoogleTranslator(source='en', target='ne').translate_batch(batch)

## Functions to convert emojis to english, extract them, map them to nepali words and replace them

In [13]:
def emoji_extractor(df):
    """Collect the distinct '+'-delimited phrases found in ``df['text']``.

    For each row, the runs returned by ``find_english_words`` are joined with
    spaces and the phrases between '+' pairs are pulled out of that string.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        A set of the extracted phrases across all rows.
    """
    phrases = set()
    for row_text in df['text']:
        english_only = ' '.join(find_english_words(row_text))
        phrases.update(find_words_between_plus(english_only))
    return phrases
def emoji_handler(df_train,df_test):
    """Translate '+'-delimited emoji descriptions to Nepali in both frames.

    The ``text`` column of each frame is overwritten in place: '+' markers
    become spaces and every extracted emoji description is replaced by its
    translation from ``translate_english_to_nepali``.

    Args:
        df_train: DataFrame with a ``text`` column of strings.
        df_test: DataFrame with a ``text`` column of strings.

    Returns:
        None.
    """
    # Collapse whitespace inside each extracted description and drop blank
    # captures: a blank key would otherwise rewrite every space in the text.
    names = {' '.join(raw.split())
             for raw in emoji_extractor(df_train) | emoji_extractor(df_test)}
    names.discard('')
    # Longest first, so a short description (e.g. "heart") cannot be replaced
    # inside a longer one (e.g. "red heart") before the longer one is tried.
    # The secondary sort key makes the order deterministic.
    total_words_list = sorted(names, key=lambda name: (-len(name), name))

    emoji_dict = {}
    if total_words_list:  # nothing to translate when no emoji was found
        translated_emojis = translate_english_to_nepali(total_words_list)
        for name, translation in zip(total_words_list, translated_emojis):
            if not isinstance(translation, str):
                continue  # no usable translation; leave the description as is
            # Allow any amount of whitespace between the words (the text may
            # hold double spaces where ':' or '-' was blanked out) and refuse
            # to match inside a longer ASCII word.
            pattern = (r'(?<![a-zA-Z])'
                       + r'\s+'.join(re.escape(token) for token in name.split())
                       + r'(?![a-zA-Z])')
            # The value is used as a regex replacement template, so a literal
            # backslash has to be doubled.
            emoji_dict[pattern] = translation.replace('\\', r'\\')

    for frame in (df_train, df_test):
        text = frame['text'].replace({r'\+': ' '}, regex=True)
        if emoji_dict:
            text = text.replace(emoji_dict, regex=True)
        frame['text'] = text

## Functions to process English words after emojis have been dealt with

In [14]:
# Define the function to extract English words and separate sentences based on Nepali characters
def extract_english_words_inbetween(text):
    """Return the English phrases of ``text``, split wherever Devanagari occurs.

    ``text`` is cut at every run of characters in U+0900-U+097F. Within each
    remaining piece, the ASCII-letter words are joined with single spaces;
    pieces without any such word are skipped.

    Args:
        text: The string to scan.

    Returns:
        A list of phrases, in order of appearance.
    """
    nepali_run = r'[\u0900-\u097F]+'
    english_word = r'[a-zA-Z]+'

    phrases = (
        " ".join(re.findall(english_word, piece))
        for piece in re.split(nepali_run, text)
    )
    # An empty join result means the piece had no English word at all.
    return [phrase for phrase in phrases if phrase]
# Extracts all english words. If they are separated by Nepali characters they are taken as a different sentence
def misc_english_extractor(df):
    """Collect the distinct English phrases found in ``df['text']``.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        A set of every phrase returned by ``extract_english_words_inbetween``
        across all rows.
    """
    phrases = set()
    for row_text in df['text']:
        phrases.update(extract_english_words_inbetween(row_text))
    return phrases
def misc_english_handler(df_train,df_test):
    """Translate the remaining English phrases in both frames to Nepali.

    The ``text`` column of each frame is overwritten in place with every
    extracted phrase replaced by its translation from
    ``translate_english_to_nepali``.

    Args:
        df_train: DataFrame with a ``text`` column of strings.
        df_test: DataFrame with a ``text`` column of strings.

    Returns:
        None.
    """
    phrases = misc_english_extractor(df_train) | misc_english_extractor(df_test)
    # Longest first, so a short phrase (e.g. "good") cannot be replaced inside
    # a longer one (e.g. "good morning") before the longer one is tried. The
    # secondary sort key makes the order deterministic.
    total_misc_list = sorted(phrases, key=lambda phrase: (-len(phrase), phrase))
    if not total_misc_list:
        return  # no English text left; nothing to translate

    translated_misc = translate_english_to_nepali(total_misc_list)
    misc_dict = {}
    for phrase, translation in zip(total_misc_list, translated_misc):
        if not isinstance(translation, str):
            continue  # no usable translation; leave the phrase untouched
        # The extractor delimits words by ASCII letters only, so the match is
        # bounded the same way. r'\b' would not fire between a Devanagari
        # letter or a digit and an ASCII letter.
        pattern = (r'(?<![a-zA-Z])'
                   + r'\s+'.join(re.escape(word) for word in phrase.split())
                   + r'(?![a-zA-Z])')
        # The value is used as a regex replacement template, so a literal
        # backslash has to be doubled.
        misc_dict[pattern] = translation.replace('\\', r'\\')
    if not misc_dict:
        return

    df_train['text'] = df_train['text'].replace(misc_dict,regex=True)
    df_test['text'] = df_test['text'].replace(misc_dict,regex=True)

In [21]:
def final_clean(text):
    """Strip special characters and all ASCII letters, then normalise whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each run of the listed special characters and each ASCII
        letter replaced by a space, whitespace runs collapsed to a single
        space, and leading/trailing whitespace removed.
    """
    space_out = (
        r'[+!@#$%^&*(),.?":{}|<>~`;/\\[\]\'|-]+',  # runs of ASCII special characters
        r'[a-zA-Z]',                               # any remaining ASCII letter
        r'\s+',                                    # any run of whitespace
    )
    for pattern in space_out:
        text = re.sub(pattern, ' ', text)
    return text.strip()

## Running starts here

In [None]:
# Load both splits and drop rows that have a missing value in any column.
df_train = pd.read_csv('train.csv',encoding='utf-8').dropna()
df_test = pd.read_csv('test.csv',encoding='utf-8').dropna()

# Drop rows whose label is exactly one of: '-', '20', '11', 'o', '--'.
# astype(str) is applied to the mask only, so the check also works when
# pandas parsed the label column as numbers; the stored labels are unchanged.
# .copy() makes each result an independent frame, so the column assignments
# in later cells do not trigger SettingWithCopyWarning.
df_train = df_train[~df_train['label'].astype(str).str.match(r'^(-|20|11|o|--)$')].copy()
df_test = df_test[~df_test['label'].astype(str).str.match(r'^(-|20|11|o|--)$')].copy()

In [9]:
# Strip punctuation and normalise whitespace in both splits.
for _frame in (df_train, df_test):
    _frame['text'] = _frame['text'].apply(clean_text)

In [10]:
# Replace every emoji with its '+'-delimited description in both splits.
for _frame in (df_train, df_test):
    _frame['text'] = _frame['text'].apply(replace_emojis_with_space)

In [16]:
# Translate the emoji descriptions to Nepali (updates both frames in place).
emoji_handler(df_train=df_train, df_test=df_test)

In [17]:
# Translate the remaining English phrases to Nepali (updates both frames in place).
misc_english_handler(df_train=df_train, df_test=df_test)

In [15]:
# Keep only the rows whose text is not the empty string.
df_train = df_train.loc[df_train['text'].ne('')]
df_test = df_test.loc[df_test['text'].ne('')]

In [19]:
# Lower-case the text and drop stopwords in both splits.
for _frame in (df_train, df_test):
    _frame['text'] = _frame['text'].apply(remove_stop_words)
    

In [22]:
# Final pass: remove special characters and any leftover ASCII letters.
for _frame in (df_train, df_test):
    _frame['text'] = _frame['text'].apply(final_clean)

In [24]:
# Drop the rows whose text became empty during cleaning.
df_train = df_train.loc[df_train['text'].ne('')]
df_test = df_test.loc[df_test['text'].ne('')]

In [25]:
# Write the cleaned splits to disk without the index column.
for _frame, _path in ((df_train, 'train_clean.csv'), (df_test, 'test_clean.csv')):
    _frame.to_csv(_path, index=False)