# Beginner: Natural Language Processor

## 1. Text Preprocessing

In [None]:
# Text preprocessing: normalise the raw string `text` (expected to be defined
# before this cell runs) into a list of cleaned tokens, then derive stemmed
# and lemmatized variants of those tokens.
import string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Lowercase
text = text.lower()

# Dealing with numbers: iterate character by character and drop every digit
text = ''.join(char for char in text if not char.isdigit())

# Dealing with punctuation: a single translate() pass removes every symbol in
# string.punctuation instead of scanning the text once per symbol
text = text.translate(str.maketrans('', '', string.punctuation))

# Tokenizing
word_tokens = word_tokenize(text)

# Removing "stopwords" (a set gives O(1) membership tests)
stop_words = set(stopwords.words('english'))

# From here on `text` is a list of tokens, no longer a string
text = [w for w in word_tokens if w not in stop_words]

# Stemming or Lemmatizing
stemmer = PorterStemmer()

stemmed = [stemmer.stem(word) for word in text]

lemmatizer = WordNetLemmatizer()

lemmatized = [lemmatizer.lemmatize(word) for word in text]

## 2. Vectorizing

> __Tf-idf__<br>
disadvantage: does not capture context

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Toy corpus: three short documents that all mention "football".
texts = ['i love football',
         'football is a game i love',
         'football football football']

tf_idf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and idf weights, and build the sparse document-term matrix.
X = tf_idf_vectorizer.fit_transform(texts)

# Show the dense matrix with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
pd.DataFrame(X.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())

Unnamed: 0,football,game,is,love
0,0.613356,0.0,0.0,0.789807
1,0.345205,0.584483,0.584483,0.444514
2,1.0,0.0,0.0,0.0


> key params
> 1. __max_df__: Used to exclude "corpus specific stopwords", i.e. words that are very frequent in the dataset. The vectorizer will ignore the words whose document frequency (the share of documents they appear in) is higher than the specified threshold.
> 2. __min_df__: Used to exclude words that are very infrequent in the dataset. The vectorizer will ignore the words whose document frequency is lower than the specified threshold.
> 3. __max_features__: Used to specify the number of features to keep when vectorizing. It will retain the top features ordered by term frequency across the corpus.

In [4]:
# max_df=0.8: ignore terms that appear in more than 80% of the documents.
tf_idf_vectorizer = TfidfVectorizer(max_df=0.8)

X = tf_idf_vectorizer.fit_transform(texts)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
pd.DataFrame(X.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())

Unnamed: 0,game,is,love
0,0.0,0.0,1.0
1,0.622766,0.622766,0.47363
2,0.0,0.0,0.0


In [5]:
# min_df=0.5: ignore terms that appear in fewer than 50% of the documents.
tf_idf_vectorizer = TfidfVectorizer(min_df=0.5)

X = tf_idf_vectorizer.fit_transform(texts)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
pd.DataFrame(X.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())

Unnamed: 0,football,love
0,0.613356,0.789807
1,0.613356,0.789807
2,1.0,0.0


In [6]:
# max_features=2: keep only a two-term vocabulary.
tf_idf_vectorizer = TfidfVectorizer(max_features=2)

X = tf_idf_vectorizer.fit_transform(texts)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
pd.DataFrame(X.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())

Unnamed: 0,football,love
0,0.613356,0.789807
1,0.613356,0.789807
2,1.0,0.0


## 2.1 Capturing Context: __N-Gram representation__
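> - N is the number of consecutive words to be considered as one unit (one feature).
> - __ngram_range__: A parameter of the vectorizers that specifies the minimum and maximum length of the word sequences to be considered, e.g. `(2, 2)` keeps only two-word sequences.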

In [7]:
# Two short sentences with overlapping vocabulary, used for the n-gram demo.
texts = [
    'i do not love football',
    'i love football not basketball',
]

In [8]:
# ngram_range=(2, 2): build features from two-word sequences only.
tf_idf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))

X = tf_idf_vectorizer.fit_transform(texts)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
pd.DataFrame(X.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())

Unnamed: 0,do not,football not,love football,not basketball,not love
0,0.631667,0.0,0.449436,0.0,0.631667
1,0.0,0.631667,0.449436,0.631667,0.0


## 2.2 Feature Engineering
> Sometimes, you may want to extract your own features from the texts. Some common features are: <br>
>- Vocabulary Richness<br>
>- Average number of words per line<br>
>- Digit/Character ratio<br>
>- Anything you can think of that relates to the task!

In [None]:
def vocab_richness(text):
    """Return the share of distinct tokens among all tokens of ``text``.

    The text is split with ``word_tokenize``; the result is
    ``len(unique tokens) / len(tokens)``.

    Args:
        text: The string to analyse.

    Returns:
        A float in ``(0, 1]``, or ``0.0`` when the text yields no tokens
        (e.g. an empty string), instead of raising ``ZeroDivisionError``.
    """
    tokens = word_tokenize(text)
    if not tokens:
        # No tokens means there is no vocabulary to measure.
        return 0.0
    return len(set(tokens)) / len(tokens)

# Compute the richness score for every row's text and store it as a new column.
data["vocab richness"] = data["text"].apply(vocab_richness)


## 3. (Multinomial) Naive Bayes Algorithm
> disadvantages:
> - Assumes feature independence, rarely the case in real life datasets

## 4. Modelling Implementation

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Features: TF-IDF representation of the text column; target: the 'spam' column.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['text'])
y = data['spam']

# Train the classifier (fit() returns the fitted estimator itself).
nb_model = MultinomialNB().fit(X, y)

# NOTE: the model is scored on the same X, y it was fitted on, so this is a
# training score, not an estimate of performance on unseen data.
nb_model.score(X, y)

## 4.1 Pipelining and GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Chain the vectorizer and the classifier so both are tuned together.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Candidate values, keyed as <step name>__<parameter name>.
parameters = {
    'tfidf__ngram_range': ((1, 1), (2, 2)),
    'nb__alpha': (0.1, 1),
}

# Try every combination with 5-fold cross-validation, ranking by accuracy;
# refit=True retrains the best combination on the full data afterwards.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(data.text, y)

In [None]:
# A notebook cell only displays its last bare expression, so the best
# parameters were silently dropped; print both results explicitly.
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated accuracy:", grid_search.best_score_)

## 5. Latent Dirichlet Allocation
> Latent Dirichlet Allocation is an unsupervised learning algorithm for text data. It is based on co-occurrences of words in texts and is used to find topics in a corpus of documents.

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Learn the vocabulary and vectorize the corpus in a single step.
vectorizer = TfidfVectorizer()
data_vectorized = vectorizer.fit_transform(data['text'])

# Fit a two-topic LDA model on the vectorized documents.
lda_model = LatentDirichletAllocation(n_components=2)
lda_model.fit(data_vectorized)

def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted words of every topic of a fitted model.

    Args:
        model: Fitted topic model exposing ``components_``, iterated as one
            array of per-word weights per topic.
        vectorizer: Fitted vectorizer providing the vocabulary through
            ``get_feature_names_out()`` (scikit-learn >= 1.0) or the legacy
            ``get_feature_names()``.
        n_top_words: How many words to print per topic (default 10, the
            value that used to be hard-coded).
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # accessor and fall back to the old one for older installations.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Resolve the vocabulary once instead of once per printed word.
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending: reverse it and keep the first n_top_words.
        top_indices = topic.argsort()[::-1][:n_top_words]
        # str() keeps the printed words plain even when names are numpy strings.
        print([(str(feature_names[i]), topic[i]) for i in top_indices])
        

# Show the top words of each topic found by the fitted LDA model.
print_topics(model=lda_model, vectorizer=vectorizer)

In [None]:
# A single unseen document, wrapped in a list because the vectorizer expects
# an iterable of documents.
example = ["rice var congratulations save upenn"]

# Vectorize with the already-fitted vocabulary, then get its topic mixture.
example_vectorized = vectorizer.transform(example)
lda_vectors = lda_model.transform(example_vectorized)

# Row 0 holds the topic weights of the only document passed in.
print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

------------------------

# NLP with RNN
> __language model__: A language model is a model which attempts to predict the next word or character given an input list of words or characters.