## Import Libraries

In [6]:
import csv
import nltk
from nltk.tokenize import word_tokenize , sent_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

## Load Dataset

In [18]:
csvfile = open('documents.csv')
stemmedfile = open('stemmed.csv')
dataset = csv.DictReader(csvfile, delimiter=',')

hi


In [24]:
for row in dataset:
    words = word_tokenize(row['text'])
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]
    csv.DictWriter(stemmedfile,[row['id'],stemmed_words])

['among', 'these', 'are', 'the', 'management', 'of', 'approximately', '1200', 'coast', 'guard', 'owned', 'housing', 'units', 'warehousing', 'health', 'care', 'services', 'work', 'life', 'services', 'transportation', 'of', 'household', 'goods', 'and', 'personnel', 'support', 'services', 'facilities', 'maintenance', 'is', 'provided', 'to', 'all', 'tenant', 'commands', 'on', 'coast', 'guard', 'island', 'and', 'industrial', 'support', 'is', 'provided', 'throughout', 'the', 'west', 'coast', 'isc', 'alameda', 'is', 'home', 'to', 'a', 'variety', 'of', 'tenant', 'commands', 'including', 'when', 'opened', 'in', '1913', 'was', 'know', 'as', 'government', 'island', 'the', 'island', 'was', 'an', 'artificially', 'made', 'by', 'dredging', 'the', 'oakland', 'estuary', 'and', 'place', 'the', 'mud', 'into', 'san', 'leandro', 'bay', 'the', 'coast', 'guard', 'opened', 'the', 'base', 'in', '1926', 'in', '1933', 'major', 'improvements', 'were', 'add', 'to', 'the', 'base', 'in', '1939', 'a', 'lighthouse', '

KeyboardInterrupt: 

## Stemming

Stemming is the process of reducing a word to its base or root form, such as converting "running" to "run". NLTK provides several stemmers, including the Porter stemmer, which is based on the Porter stemming algorithm.

In [4]:

# Tokenize into words
words = word_tokenize(text)

# Stemming
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in words]

print(stemmed_words)

['the', 'boy', 'are', 'run', 'and', 'the', 'leav', 'are', 'fall', 'and', 'i', 'love', 'turtl', '.']


For stemming Arabic text, it’s recommended to use stemmers specifically designed for Arabic, such as the ISRI Stemmer or the Snowball Stemmer with Arabic support.

In [61]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("arabic")

# Stem a word
print(stemmer.stem("العاملون"))

عامل


## Part-of-speech Tagging

Part-of-speech (POS) tagging is the process of labeling the words in a sentence with their corresponding part of speech, such as noun, verb, adjective, etc

In [62]:
from nltk import pos_tag

text = "This is an example sentence, showing off the part-of-speech tagging process."

tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)

tagged_tokens

[('This', 'DT'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('example', 'NN'),
 ('sentence', 'NN'),
 (',', ','),
 ('showing', 'VBG'),
 ('off', 'RP'),
 ('the', 'DT'),
 ('part-of-speech', 'JJ'),
 ('tagging', 'NN'),
 ('process', 'NN'),
 ('.', '.')]

## Lemmatization

Lemmatization is similar to stemming, but it produces a valid word form, known as the lemma, rather than just a root form. For example, "running" would be lemmatized to "run", but "better" would remain unchanged. NLTK provides a lemmatizer, which can be used with the WordNetLemmatizer class which takes into account the part-of-speech tags of the words.

In [65]:
def get_wordnet_pos(tag_parameter):

    tag = tag_parameter[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    
    return tag_dict.get(tag, wordnet.NOUN)

In [98]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet

text = "The boys are running and the leaves are falling."

# Tokenize into words
words = word_tokenize(text)

# POS tagging
pos_tags = pos_tag(words)

# Lemmatization
lemmatizer = WordNetLemmatizer()

lemmatized_words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in pos_tags]

lemmatized_words

['The', 'boy', 'be', 'run', 'and', 'the', 'leaf', 'be', 'fall', '.']

## Spell checking

In [11]:
!pip install pyspellchecker

Defaulting to user installation because normal site-packages is not writeable


In [68]:
from spellchecker import SpellChecker

In [69]:
from nltk.tokenize import word_tokenize
from typing import List  # Import the List type from the typing module

def correct_sentence_spelling(tokens: List[str]) -> List[str]:
    spell = SpellChecker()
    misspelled = spell.unknown(tokens)
    for i, token in enumerate(tokens):
        if token in misspelled:
            corrected = spell.correction(token)
            if corrected is not None:
                tokens[i] = corrected
    return tokens

text = "This is a sampli sentinse withh speling erors."
words = word_tokenize(text)
correct_sentence_spelling(words)

['This', 'is', 'a', 'sample', 'sentence', 'with', 'spelling', 'errors', '.']

## Stopwords

In [70]:
from nltk.corpus import stopwords

In [71]:
stopwords.words('English')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Remove stopwords from the text

In [76]:
filtered_text = []

text = "We are going to the ancient city of Damascus."


for word in word_tokenize(text):
    if word not in stopwords.words('English'):
        filtered_text.append(word)
        
filtered_text

['We', 'going', 'ancient', 'city', 'Damascus', '.']

## Remove Punctuation

In [80]:
import string

text = "Hello, world! This is some sample' text."

new_tokens = []
for token in text.split():
    new_tokens.append(token.translate(str.maketrans('', '', string.punctuation)))

print(new_tokens)

['Hello', 'world', 'This', 'is', 'some', 'sample', 'text']
