## Word Tokenizer

In [1]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' package through NLTK's downloader before tokenizing
# (presumably the data word_tokenize relies on — confirm for the installed NLTK version).
nltk.download('punkt')

def tokenize_text(text):
    """Split ``text`` into word-level tokens.

    Thin wrapper: ``text`` is handed to NLTK's ``word_tokenize`` and the
    result of that call is returned unchanged.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for ``text``.
    """
    return word_tokenize(text)


# Demo: tokenize a multi-sentence English paragraph and print the tokens.
# The guard keeps the demo from running if this code is imported as a module.
if __name__ == "__main__":
    # Sample paragraph; note it contains typographic apostrophes (’), not ASCII ones.
    text = "Hello, world! This is a test sentence for word tokenization. English grammar has four sentence structures: simple, compound, complex and compound-complex. In this lesson, you’ll learn about simple sentences, but first, think of your favourite food. Now imagine eating that food every night for a year. Would it still be your favourite food at the end of that time? Probably not. By then you’d be tired or bored with it, right? The same logic applies to writing. If you read the same type of sentence over and over again, you’d become tired of it, just like you’d become tired of eating the same food over and over again. That’s why good English writers use all four types of sentences, not just one. That’s also why it’s important for you to be able to write each type correctly."
    tokens = tokenize_text(text)
    print("Tokens:", tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Tokens: ['Hello', ',', 'world', '!', 'This', 'is', 'a', 'test', 'sentence', 'for', 'word', 'tokenization', '.', 'English', 'grammar', 'has', 'four', 'sentence', 'structures', ':', 'simple', ',', 'compound', ',', 'complex', 'and', 'compound-complex', '.', 'In', 'this', 'lesson', ',', 'you', '’', 'll', 'learn', 'about', 'simple', 'sentences', ',', 'but', 'first', ',', 'think', 'of', 'your', 'favourite', 'food', '.', 'Now', 'imagine', 'eating', 'that', 'food', 'every', 'night', 'for', 'a', 'year', '.', 'Would', 'it', 'still', 'be', 'your', 'favourite', 'food', 'at', 'the', 'end', 'of', 'that', 'time', '?', 'Probably', 'not', '.', 'By', 'then', 'you', '’', 'd', 'be', 'tired', 'or', 'bored', 'with', 'it', ',', 'right', '?', 'The', 'same', 'logic', 'applies', 'to', 'writing', '.', 'If', 'you', 'read', 'the', 'same', 'type', 'of', 'sentence', 'over', 'and', 'over', 'again', ',', 'you', '’', 'd', 'become', 'tired', 'of', 'it', ',', 'just', 'like', 'you', '’', 'd', 'become', 'tired', 'of', 'eat

## Regional Language Filtration

In [2]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993221 sha256=d539e17531adb7b5234bd473e6e98db8f6beda234f82046ea4f18947184cabbc
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [4]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Fix langdetect's seed before the first detect() call.
# NOTE(review): intended to make detection results repeatable between runs — confirm against the langdetect README.
DetectorFactory.seed = 0

def is_marathi(word):
    """Report whether langdetect classifies ``word`` as Marathi.

    Args:
        word: The string passed to ``detect``.

    Returns:
        True when ``detect`` returns the code ``'mr'``; False for any other
        code, or when ``detect`` raises ``LangDetectException``.
    """
    try:
        detected_code = detect(word)
    except LangDetectException:
        # Detection failed for this input; treat it as "not Marathi".
        return False
    return detected_code == 'mr'

def filter_marathi_words(text):
    """Return the whitespace-separated chunks of ``text`` that ``is_marathi`` accepts.

    The text is split with ``str.split()`` only, so punctuation attached to a
    word stays attached, and every chunk is classified on its own.

    Args:
        text: The string to scan.

    Returns:
        A list of the accepted chunks, in their original order.
    """
    accepted = []
    for chunk in text.split():
        if is_marathi(chunk):
            accepted.append(chunk)
    return accepted

# Demo: pull the Marathi words out of a mixed Marathi/English paragraph.
# NOTE(review): the recorded output for this cell omits 'खूप' and keeps trailing
# full stops (e.g. 'आहे.'); each whitespace-separated chunk is classified on its own,
# so short words are presumably easy to misclassify — verify before relying on it.
if __name__ == "__main__":
    text = "आजचा दिवस खूप सुंदर आहे. The weather is perfect for a walk. आपण बाहेर जाऊन गार्डनमध्ये वेळ घालवूया. I hope you are enjoying the day as much as I am."
    marathi_words = filter_marathi_words(text)
    print("Marathi Words:", marathi_words)


Marathi Words: ['आजचा', 'दिवस', 'सुंदर', 'आहे.', 'आपण', 'बाहेर', 'जाऊन', 'गार्डनमध्ये', 'वेळ', 'घालवूया.']


# Stopwords Filtration

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK data used in this cell: the 'stopwords' corpus (read through
# stopwords.words below) and the 'punkt' package (presumably what word_tokenize
# needs — confirm for the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

def filter_stop_words(text):
    """Drop English stop words from ``text`` and return the rest as one string.

    ``text`` is tokenized with ``word_tokenize``; a token is discarded when its
    lower-cased form appears in NLTK's English stop-word list. The surviving
    tokens are joined with single spaces, so the result does not reproduce the
    original spacing.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined by ``' '``.
    """
    tokens = word_tokenize(text)
    english_stops = set(stopwords.words('english'))

    survivors = []
    for token in tokens:
        # Compare case-insensitively, but keep the token's original casing.
        if token.lower() in english_stops:
            continue
        survivors.append(token)

    return ' '.join(survivors)

# Demo: remove English stop words from one sentence and print what is left.
# The guard keeps the demo from running if this code is imported as a module.
if __name__ == "__main__":
    text = "This is a sample sentence demonstrating stop word filtration in Python."
    filtered_text = filter_stop_words(text)
    print("Filtered Text:", filtered_text)


Filtered Text: sample sentence demonstrating stop word filtration Python .


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
import string

def remove_punctuation(text, include_unicode=False):
    """Return ``text`` with punctuation characters deleted.

    By default only the ASCII characters in ``string.punctuation`` are
    removed. Typographic quotes (e.g. ``’``) and script-specific marks such as
    the Devanagari danda are left in place unless ``include_unicode`` is set.

    Args:
        text: The string to clean.
        include_unicode: When True, additionally delete every character whose
            Unicode general category starts with ``P`` (punctuation).

    Returns:
        A new string without the removed characters; all other characters,
        including whitespace, are kept as-is.
    """
    # The third argument of str.maketrans lists characters to delete outright
    # (they map to None), rather than characters to be replaced.
    deletion_table = str.maketrans('', '', string.punctuation)
    cleaned = text.translate(deletion_table)

    if include_unicode:
        import unicodedata  # local import: only needed on this opt-in path

        cleaned = ''.join(
            ch for ch in cleaned
            if not unicodedata.category(ch).startswith('P')
        )
    return cleaned

# Demo: strip punctuation from a short sentence and print the result.
# The guard keeps the demo from running if this code is imported as a module.
if __name__ == "__main__":
    text = "Hello, world! This is a test sentence with punctuation marks."
    text_without_punctuation = remove_punctuation(text)
    print("Text without punctuation:", text_without_punctuation)


Text without punctuation: Hello world This is a test sentence with punctuation marks


# Phone, Email, Name Validation

In [17]:
import re

def validate_phone_number(phone_number):
    """Check that ``phone_number`` is an optional ``+`` followed by 2-15 ASCII digits.

    The first digit must be 1-9. No spaces, dashes or parentheses are allowed.

    Args:
        phone_number: The string to check.

    Returns:
        True when the whole string has the expected shape, otherwise False.
    """
    # [0-9] instead of \d: in a str pattern \d also matches non-ASCII decimal
    # digits (e.g. Devanagari), which should not count as a phone number here.
    pattern = re.compile(r'\+?[1-9][0-9]{1,14}')
    # fullmatch instead of match + '$': '$' also matches just before a
    # trailing newline, so "+123...\n" used to be accepted.
    return bool(pattern.fullmatch(phone_number))

def validate_email(email):
    """Check that ``email`` looks like ``local@label.label.tld``.

    This is a pragmatic shape check, not a full address parser:
      * local part: one or more of ASCII letters, digits and ``._%+-``;
      * domain: one or more non-empty labels (ASCII letters, digits, ``-``),
        each followed by a dot;
      * final label: at least two ASCII letters.

    Args:
        email: The string to check.

    Returns:
        True when the whole string has the expected shape, otherwise False.
    """
    # Labels are matched one at a time so empty labels ("a@b..com",
    # "a@.b.com") are rejected; a single character class containing '.'
    # would let them through.
    pattern = re.compile(r'[a-zA-Z0-9._%+-]+@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}')
    # fullmatch instead of match + '$': '$' also matches just before a
    # trailing newline, so "user@example.com\n" used to be accepted.
    return bool(pattern.fullmatch(email))

def validate_name(name):
    """Check that ``name`` consists of Latin letters with optional separators.

    The name must start and end with a letter (ASCII A-Z/a-z or an accented
    Latin-1 letter) and may contain letters, apostrophes, spaces and hyphens
    in between. As a consequence it must be at least two characters long.

    Args:
        name: The string to check.

    Returns:
        True when the whole string has the expected shape, otherwise False.
    """
    # The Latin-1 range is split around U+00D7 (×) and U+00F7 (÷): both sit
    # inside À-ÿ but are math signs, not letters.
    letter = "A-Za-zÀ-ÖØ-öø-ÿ"
    pattern = re.compile(rf"[{letter}][{letter}' -]*[{letter}]")
    # fullmatch instead of match + '$': '$' also matches just before a
    # trailing newline, so "John Doe\n" used to be accepted.
    return bool(pattern.fullmatch(name))

def validate_inputs(phone_number, email, name):
    """Run the phone, email and name validators and collect their verdicts.

    Args:
        phone_number: Value passed to ``validate_phone_number``.
        email: Value passed to ``validate_email``.
        name: Value passed to ``validate_name``.

    Returns:
        A dict with the keys ``'phone_number_valid'``, ``'email_valid'`` and
        ``'name_valid'``, each holding the corresponding validator's result.
    """
    report = {}
    report['phone_number_valid'] = validate_phone_number(phone_number)
    report['email_valid'] = validate_email(email)
    report['name_valid'] = validate_name(name)
    return report

# Demo: validate one sample phone number, email address and name.
# The guard keeps the demo from running if this code is imported as a module.
if __name__ == "__main__":
    phone_number = "+1234567890"
    # The doubled ".com" still fits the email pattern (recorded result: email_valid True).
    email = "jaybhayerutik2@gmail.com.com"
    # '%' is not an allowed name character (recorded result: name_valid False).
    name = "John%Doe"

    results = validate_inputs(phone_number, email, name)
    print("Validation Results:", results)

Validation Results: {'phone_number_valid': True, 'email_valid': True, 'name_valid': False}
