# 2. Write a program that extracts the words (features) used in a sentence.


---


In [1]:
import pandas as pd
import numpy as np
import re
import nltk

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!'    
]
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 
                          'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,The brown fox is quick and the blue dog is lazy!,animals
4,The sky is very blue and the sky is very beaut...,weather
5,The dog is lazy but the brown fox is quick!,animals


## Removing stopwords and Lemmitization

In [4]:
wpt_obj=nltk.WordPunctTokenizer()

In [5]:
stop_words=nltk.corpus.stopwords.words('english')

#### Function to purify sentences

In [19]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def remove_unwanted(string):
    string=re.sub(r'[^a-zA-Z0-9\s]','',string,re.I)
    string.lower()
    string.strip()
    ls=wpt_obj.tokenize(string)
    imp_tokens=[lemmatizer.lemmatize(w) for w in ls if w not in stop_words]
    doc= " ".join(imp_tokens)
    return doc

In [8]:
vector_converter=np.vectorize(remove_unwanted)

In [9]:
filtered_sentences=vector_converter(corpus)
filtered_sentences

array(['The sky blue beautiful', 'Love blue beautiful sky',
       'The quick brown fox jump lazy dog',
       'The brown fox quick blue dog lazy',
       'The sky blue sky beautiful today', 'The dog lazy brown fox quick'],
      dtype='<U33')

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cv_obj=CountVectorizer()
count_matrix=cv_obj.fit_transform(filtered_sentences)
final_count_array=count_matrix.toarray()
print(final_count_array)

[[1 1 0 0 0 0 0 0 0 1 1 0]
 [1 1 0 0 0 0 0 1 0 1 0 0]
 [0 0 1 1 1 1 1 0 1 0 1 0]
 [0 1 1 1 1 0 1 0 1 0 1 0]
 [1 1 0 0 0 0 0 0 0 2 1 1]
 [0 0 1 1 1 0 1 0 1 0 1 0]]


In [11]:
cv_obj.get_feature_names()

['beautiful',
 'blue',
 'brown',
 'dog',
 'fox',
 'jump',
 'lazy',
 'love',
 'quick',
 'sky',
 'the',
 'today']

In [12]:
features=cv_obj.get_feature_names()
counts=final_count_array
final_data_extracted=pd.DataFrame(counts,columns=features)
final_data_extracted

Unnamed: 0,beautiful,blue,brown,dog,fox,jump,lazy,love,quick,sky,the,today
0,1,1,0,0,0,0,0,0,0,1,1,0
1,1,1,0,0,0,0,0,1,0,1,0,0
2,0,0,1,1,1,1,1,0,1,0,1,0
3,0,1,1,1,1,0,1,0,1,0,1,0
4,1,1,0,0,0,0,0,0,0,2,1,1
5,0,0,1,1,1,0,1,0,1,0,1,0


#### n-gram Bag of words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv_obj=CountVectorizer(ngram_range=(2,2))
count_matrix=cv_obj.fit_transform(filtered_sentences)
final_count_array=count_matrix.toarray()
print(final_count_array)

[[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
 [1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 1 0 1 0 1 0 0 1 0 0 0 0 1 0]
 [0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1]
 [0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0]]


In [14]:
features=cv_obj.get_feature_names()
counts=final_count_array
final_data_extracted=pd.DataFrame(counts,columns=features)
final_data_extracted

Unnamed: 0,beautiful sky,beautiful today,blue beautiful,blue dog,blue sky,brown fox,dog lazy,fox jump,fox quick,jump lazy,...,lazy dog,love blue,quick blue,quick brown,sky beautiful,sky blue,the brown,the dog,the quick,the sky
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,1,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,1,0,1,...,1,0,0,1,0,0,0,0,1,0
3,0,0,0,1,0,1,1,0,1,0,...,0,0,1,0,0,0,1,0,0,0
4,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
5,0,0,0,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0


### TF-IDF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tf_matrix = tf.fit_transform(corpus)
tf_matrix = tf_matrix.toarray()
pd.DataFrame(np.round(tv_matrix, 2), columns=tf.get_feature_names())

Unnamed: 0,and,beautiful,blue,brown,but,dog,fox,is,jumps,lazy,love,over,quick,sky,the,this,today,very
0,0.39,0.46,0.39,0.0,0.0,0.0,0.0,0.39,0.0,0.0,0.0,0.0,0.0,0.46,0.34,0.0,0.0,0.0
1,0.31,0.36,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.52,0.0,0.0,0.36,0.0,0.52,0.0,0.0
2,0.0,0.0,0.0,0.3,0.0,0.3,0.3,0.0,0.43,0.3,0.0,0.43,0.3,0.0,0.44,0.0,0.0,0.0
3,0.25,0.0,0.25,0.29,0.0,0.29,0.29,0.5,0.0,0.29,0.0,0.0,0.29,0.0,0.43,0.0,0.0,0.0
4,0.18,0.21,0.18,0.0,0.0,0.0,0.0,0.37,0.0,0.0,0.0,0.0,0.0,0.43,0.32,0.0,0.31,0.62
5,0.0,0.0,0.0,0.29,0.41,0.29,0.29,0.49,0.0,0.29,0.0,0.0,0.29,0.0,0.42,0.0,0.0,0.0
