<a href="https://colab.research.google.com/github/Richish/NLP-Projects/blob/main/BasicTextPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic Text Preprocessing

### Tokenizers Using nltk

In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
texts = """This is great. This is a sentence. Colab google: uploading images in multiple subdirectories: If you would like to upload images (or files) in multiples subdirectories by using Colab google, please follow the following steps: - I'll suppose that your images(files) are split into 3 subdirectories (train, validate, test) in the main directory called (dataDir): 1- Zip the folder (dataDir) to (dataDir.zip) 2- Write this code in a Colab cell:"""

for sentence in sent_tokenize(texts):
    print(sentence)
    print(word_tokenize(sentence))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
This is great.
['This', 'is', 'great', '.']
This is a sentence.
['This', 'is', 'a', 'sentence', '.']
Colab google: uploading images in multiple subdirectories: If you would like to upload images (or files) in multiples subdirectories by using Colab google, please follow the following steps: - I'll suppose that your images(files) are split into 3 subdirectories (train, validate, test) in the main directory called (dataDir): 1- Zip the folder (dataDir) to (dataDir.zip) 2- Write this code in a Colab cell:
['Colab', 'google', ':', 'uploading', 'images', 'in', 'multiple', 'subdirectories', ':', 'If', 'you', 'would', 'like', 'to', 'upload', 'images', '(', 'or', 'files', ')', 'in', 'multiples', 'subdirectories', 'by', 'using', 'Colab', 'google', ',', 'please', 'follow', 'the', 'following', 'steps', ':', '-', 'I', "'ll", 'suppose', 'that', 'your', 'images', '(', 'files', ')', 'are', '

### Removing stop words, punctuations and digits

In [3]:
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')
texts = [texts]
def preproces_corpus(texts):
    my_stopwords = set(stopwords.words('english'))
    def remove_stops_digits(tokens):
        return [token.lower() for token in tokens if token not in my_stopwords and token not in punctuation and not token.isdigit()]
    return [remove_stops_digits(word_tokenize(text)) for text in texts]

print(preproces_corpus(texts))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[['this', 'great', 'this', 'sentence', 'colab', 'google', 'uploading', 'images', 'multiple', 'subdirectories', 'if', 'would', 'like', 'upload', 'images', 'files', 'multiples', 'subdirectories', 'using', 'colab', 'google', 'please', 'follow', 'following', 'steps', 'i', "'ll", 'suppose', 'images', 'files', 'split', 'subdirectories', 'train', 'validate', 'test', 'main', 'directory', 'called', 'datadir', '1-', 'zip', 'folder', 'datadir', 'datadir.zip', '2-', 'write', 'code', 'colab', 'cell']]


### Stemmming and Lemmatization

In [7]:
# stemming using nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
word1, word2 = "footballs", "authorization"
print(stemmer.stem(word1), stemmer.stem(word2))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
footbal author


In [9]:
# lemmatization using nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("authorization", pos="a"), lemmatizer.lemmatize("better", pos="a"))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
authorization good


In [12]:
# lemmatization using spacy
import spacy
sp = spacy.load("en_core_web_sm")
token = sp("better")
for word in token:
    print(word.text, word.lemma_)

better well


## some advanced preprocessing like pos tagging

In [15]:
# using spacy
import spacy
sp = spacy.load("en_core_web_sm")
doc = sp("Kimaya was born to Arshdeep kaur and harvinder singh (village khurd) in 2001 during winter month of december")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.shape_, token.is_alpha, token.is_stop)

Kimaya Kimaya PROPN Xxxxx True False
was be AUX xxx True True
born bear VERB xxxx True False
to to ADP xx True True
Arshdeep Arshdeep PROPN Xxxxx True False
kaur kaur PROPN xxxx True False
and and CCONJ xxx True True
harvinder harvinder NOUN xxxx True False
singh singh NOUN xxxx True False
( ( PUNCT ( False False
village village NOUN xxxx True False
khurd khurd PROPN xxxx True False
) ) PUNCT ) False False
in in ADP xx True True
2001 2001 NUM dddd False False
during during ADP xxxx True True
winter winter NOUN xxxx True False
month month NOUN xxxx True False
of of ADP xx True True
december december PROPN xxxx True False
