## Install

In [None]:
!pip install -U spacy
!pip install -U spacy-lookups-data
!python -m spacy download en_core_web_sm #en_core_web_md #en_core_web_lg

!pip install nltk

In [None]:
# Imported here so this cell also works on a fresh kernel (it runs before the
# notebook's main import cell).
import nltk

# NLTK resources needed by the features listed below. Newer NLTK releases look
# up the "*_tab" / "*_eng" variants; the older names are kept so the cell also
# works with older installations.
NLTK_RESOURCES = [
    # To use tokenizers
    'punkt',
    'punkt_tab',
    # To use stopwords
    'stopwords',
    # To use Lemmatizer
    'wordnet',
    # To use POS
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    # To use NER
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
]

for resource in NLTK_RESOURCES:
    nltk.download(resource)

# To download all the packages, set this flag to True.
# nltk.download('all') is used instead of a bare nltk.download(): the bare call
# opens the interactive downloader, which stalls a "Run All" execution.
DOWNLOAD_ALL_NLTK_DATA = False
if DOWNLOAD_ALL_NLTK_DATA:
    nltk.download('all')

# Import Libraries

In [None]:
import spacy
import nltk
from nltk.tokenize import sent_tokenize, wordpunct_tokenize, word_tokenize

# Stemming

In [None]:
# Import only the class that is used; a wildcard import hides where names come
# from and pollutes the notebook namespace.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
tokens = ['compute', 'computer', 'computed', 'computing']
for token in tokens:
    # Show each word next to its Porter stem
    print(f'{token} --> {stemmer.stem(token)}')

# Lemmatization

In [None]:
# NLTK: lemmatize whitespace-separated tokens with the WordNet lemmatizer
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()  # created once and reused for every token
your_text = "Your text goes here"
# Printed explicitly: a notebook cell only auto-displays its last expression,
# so a bare list here would be computed and silently discarded.
print([lemmatizer.lemmatize(token) for token in your_text.split()])

# spaCy: pair every token's original text with its lemma
nlp = spacy.load("en_core_web_sm")
doc = nlp("Was Google founded in early 1990?")
[(token.orth_, token.lemma_) for token in doc]

## Text Cleaning Function

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, reading the corpus only once."""
    from nltk.corpus import stopwords

    return frozenset(stopwords.words('english'))


def text_cleaning(data, stop_words=None):
    """Clean a raw text string and return its lemmas joined by single spaces.

    Disclaimer: this is a sample text cleaning function. You might need to
    modify it for your own data.

    Steps, in order: remove URLs, hashtags and @mentions; drop apostrophes and
    lowercase; tokenize; keep alphabetic tokens that are not stopwords;
    lemmatize with WordNet.

    Parameters
    ----------
    data : str
        The raw text to clean.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stopword list is used
        (requires ``nltk.download('stopwords')``). A module-level
        ``stop_words`` variable is no longer read implicitly; pass it here.

    Returns
    -------
    str
        The cleaned, lemmatized tokens joined with a single space.
    """
    import re

    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize

    if stop_words is None:
        stop_words = _english_stop_words()
    else:
        # A set gives constant-time membership checks in the filter below
        stop_words = frozenset(stop_words)

    # 1-3. Remove URLs, hashtags and mentions (applied in this order)
    for pattern in (r'http\S+', r'#\w+', r'@\w+'):
        data = re.sub(pattern, '', data)

    # 4. Drop apostrophes, lowercase and tokenize.
    # Alternative: expand contractions instead of dropping apostrophes, e.g.
    # word_tokenize(contractions.fix(data.lower())) with the third-party
    # `contractions` package.
    text_tokens = word_tokenize(data.replace("'", '').lower())

    # 5-7. Keep alphabetic, non-stopword tokens. isalpha() already rejects
    # punctuation, numbers and any leftover token starting with '@', so no
    # separate mention filter is needed.
    kept_tokens = [t for t in text_tokens if t.isalpha() and t not in stop_words]

    # 8. Lemmatize with a single lemmatizer instance, then join
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(t) for t in kept_tokens)

# Train-Test Split

In [None]:
# X --> your text column in your data frame
# y --> your target column in your data frame

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set, stratified on the target y,
# with a fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=4299,
)

## TF-IDF Vectorizer

***Resource:*** https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Note: text_cleaning is your custom cleaning function; the vectorizer applies
# it to every raw document before tokenizing.
# X is defined in the train-test split section as the text column itself, so
# X_train / X_test are passed directly rather than indexed by a column name.
tf_idf_vectorizer = TfidfVectorizer(preprocessor=text_cleaning, min_df=2, ngram_range=(1, 2))
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)  # learn the vocabulary on train only
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)        # reuse the fitted vocabulary

## CountVectorizer

***Resource:*** https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# text_cleaning is applied to every raw document before tokenizing.
# X is defined in the train-test split section as the text column itself, so
# X_train / X_test are passed directly rather than indexed by a column name.
vectorizer = CountVectorizer(preprocessor=text_cleaning, min_df=2, ngram_range=(1, 2))
X_train_count = vectorizer.fit_transform(X_train)  # learn the vocabulary on train only
X_test_count = vectorizer.transform(X_test)        # reuse the fitted vocabulary

# Spacy

In [None]:
nlp = spacy.load("en_core_web_sm")

doc = nlp('Your text goes here')
# One line per token: text attributes are padded to a minimum width of 8,
# the is_alpha flag to a minimum width of 2.
for token in doc:
    print(
        f"Token -> {token.text:8} Lemma -> {token.lemma_:8} "
        f"POS -> {token.pos_:8} TAG -> {token.tag_:8} "
        f"Shape -> {token.shape_:8} Is_Alpha -> {token.is_alpha:2}"
    )

### NER

Entities supported by spaCy:

**PERSON** People, including fictional.

**NORP** Nationalities or religious or political groups.

**FAC** Buildings, airports, highways, bridges, etc.

**ORG** Companies, agencies, institutions, etc.

**GPE** Countries, cities, states.

**LOC** Non-GPE locations, mountain ranges, bodies of water.

**PRODUCT** Objects, vehicles, foods, etc. (Not services.)

**EVENT** Named hurricanes, battles, wars, sports events, etc.

**WORK_OF_ART** Titles of books, songs, etc.

**LAW** Named documents made into laws.

**LANGUAGE** Any named language.

**DATE** Absolute or relative dates or periods.

**TIME** Times smaller than a day.

**PERCENT** Percentage, including "%".

**MONEY** Monetary values, including unit.

**QUANTITY** Measurements, as of weight or distance.

**ORDINAL** “first”, “second”, etc.

**CARDINAL** Numerals that do not fall under another type.

In [None]:
# Print each named entity of `doc` (text padded to 15 characters) with its label
for ent in doc.ents:
    print('{:15} --> {}'.format(ent.text, ent.label_))

### Spacy Visualization

In [None]:
from spacy import displacy

doc = nlp('Apple is looking for buying a UK startup for $1 billion in 2020')

# Render the entity highlighting inline in the notebook output
displacy.render(
    doc,
    style='ent',
    jupyter=True,
    options={'distance': 90},
)

### Spacy Sentence Segmentation

In [None]:
# Two sentences; the first one contains an abbreviation with periods ("U.K.")
text = (
    'Apple is looking for buying a U.K. startup. '
    'Government has given permission.'
)
doc = nlp(text)

# doc.sents yields one span per detected sentence
for sent in doc.sents:
    print(sent)

# Similarity

In [None]:
# To use similarity, spacy model should be at least en_core_web_md
!python -m spacy download en_core_web_md

In [None]:
# Compare 2 docs: similarity of two whole documents
doc1, doc2 = nlp('I like fast food'), nlp('I like pizza')

print(doc1.similarity(doc2))

In [None]:
# Compare tokens: similarity between two individual tokens of one doc

doc = nlp('I like pizza and pasta')
token1, token2 = doc[2], doc[4]  # "pizza" and "pasta"

print(token1.similarity(token2))

In [None]:
# Getting vector: display the vector of the third token of `doc`

third_token = doc[2]
third_token.vector

# Spell Checker

In [None]:
!pip install textblob
!pip install autocorrect
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()
docs = ['calandar', 'lighteinig', 'misspel', 'booq', 'undrstand', 'receive', 'adress']

# Correction: the single most likely spelling for each word
for word in docs:
    # correction() can return None when no suggestion is found (recent
    # pyspellchecker releases); formatting None with a width spec raises
    # TypeError, so fall back to the word itself.
    best_guess = spell.correction(word) or word
    print(f'{word:10} --> {best_guess:20}')

# Candidates: every suggestion the checker considers for each word
for word in docs:
    print(f'{word:10} --> {spell.candidates(word)}')

In [None]:
# TEXTBLOB: word-by-word spelling correction

from textblob import TextBlob, Word

ex = TextBlob('He was veri happy in hisn neww locotion.')

# Show every word of the blob next to its suggested correction
for word in ex.words:
    print(f'{word} : {word.correct()}')

# Sentiment Analysis

In [None]:
# Print the sentiment that TextBlob reports for a short review sentence
testimonial = TextBlob("The food was great!")
sentiment = testimonial.sentiment
print(sentiment)