In [None]:
import pandas as pd
import numpy as np
import nltk
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim.downloader as gensim_api
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Fetch the NLTK resources used below: the Punkt tokenizer models for
# word_tokenize, the stop-word lists, and WordNet for the lemmatizer.
# 'punkt_tab' is the resource name recent NLTK releases look up for Punkt;
# 'punkt' is kept so older releases continue to work.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

Initializing the DataFrame object


In [None]:
# Toy corpus used throughout the notebook; each sentence becomes one row.
sentences = [
    'Google is  huge',
    'Spider is a creepy creature',
    'Ajith is the best south indian movie star',
]
dtf = pd.DataFrame({'sentences': sentences})

Tokenize

In [None]:
# Split every sentence into word-level tokens with NLTK's word_tokenize.
word_tokenized = list(map(word_tokenize, sentences))
dtf['tokenized_sentence'] = word_tokenized
dtf.head()

Unnamed: 0,sentences,tokenized_sentence
0,Google is huge,"[Google, is, huge]"
1,Spider is a creepy creature,"[Spider, is, a, creepy, creature]"
2,Ajith is the best south indian movie star,"[Ajith, is, the, best, south, indian, movie, s..."


Stop words


In [None]:
# English stop-word list from NLTK. Its entries are lowercase, so tokens are
# lowercased for the comparison only; the kept tokens retain their casing.
stopwords = nltk.corpus.stopwords.words("english")
# Set copy for constant-time membership tests inside the nested comprehension.
stopword_set = set(stopwords)

processed_sentence = [
    [word for word in sentence if word.lower() not in stopword_set]
    for sentence in word_tokenized
]
dtf['removed_stop_words'] = processed_sentence

dtf

Unnamed: 0,sentences,tokenized_sentence,removed_stop_words
0,Google is huge,"[Google, is, huge]","[Google, huge]"
1,Spider is a creepy creature,"[Spider, is, a, creepy, creature]","[Spider, creepy, creature]"
2,Ajith is the best south indian movie star,"[Ajith, is, the, best, south, indian, movie, s...","[Ajith, best, south, indian, movie, star]"


Lemmatization


In [None]:
# WordNet lemmatizer; lemmatize() is called with its default arguments.
lem = nltk.stem.wordnet.WordNetLemmatizer()

# Lemmatize token by token, keeping the per-sentence list structure.
lem_sentences = []
for tokens in processed_sentence:
    lem_sentences.append(list(map(lem.lemmatize, tokens)))
dtf['lemmitized_sentence'] = lem_sentences
dtf

Unnamed: 0,sentences,tokenized_sentence,removed_stop_words,lemmitized_sentence
0,Google is huge,"[Google, is, huge]","[Google, huge]","[Google, huge]"
1,Spider is a creepy creature,"[Spider, is, a, creepy, creature]","[Spider, creepy, creature]","[Spider, creepy, creature]"
2,Ajith is the best south indian movie star,"[Ajith, is, the, best, south, indian, movie, s...","[Ajith, best, south, indian, movie, star]","[Ajith, best, south, indian, movie, star]"


Word count, character count, and average word length

In [None]:
# Per-sentence size statistics computed on the lemmatized tokens.
word_count = list(map(len, lem_sentences))
dtf['word_count'] = word_count

# Characters are counted over the tokens only (no separating spaces).
char_count = [len(''.join(tokens)) for tokens in lem_sentences]
dtf['char_count'] = char_count

# Mean token length = total characters / number of tokens.
dtf['avg_word_length'] = dtf['char_count'].div(dtf['word_count'])

dtf

Unnamed: 0,sentences,tokenized_sentence,removed_stop_words,lemmitized_sentence,word_count,char_count,avg_word_length
0,Google is huge,"[Google, is, huge]","[Google, huge]","[Google, huge]",2,10,5.0
1,Spider is a creepy creature,"[Spider, is, a, creepy, creature]","[Spider, creepy, creature]","[Spider, creepy, creature]",3,20,6.666667
2,Ajith is the best south indian movie star,"[Ajith, is, the, best, south, indian, movie, s...","[Ajith, best, south, indian, movie, star]","[Ajith, best, south, indian, movie, star]",6,29,4.833333


Named Entity Recognition


In [None]:
!python -m spacy download en_core_web_lg

In [None]:
# Re-assemble each lemmatized token list into one space-separated string.
sentences_preprocessed = list(map(' '.join, lem_sentences))
dtf['processed_sentence'] = sentences_preprocessed
print(sentences_preprocessed)

['Google huge', 'Spider creepy creature', 'Ajith best south indian movie star']


In [None]:
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the small English spaCy pipeline by its installed package name.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Run the spaCy pipeline on every cleaned sentence and keep only the label
# of each detected entity (one list of labels per sentence).
entity_list = []
for text in sentences_preprocessed:
    doc = nlp(text)
    entity_list.append([ent.label_ for ent in doc.ents])
dtf['enities_detected'] = entity_list
print(entity_list)

[['ORG'], [], ['NORP']]


In [None]:
# One {entity_label: occurrences} dict per sentence. Counter does the tallying
# in a single pass; dict() keeps the plain-dict output, with labels in
# first-seen order, that the DataFrame construction downstream consumes.
feature_list = [dict(Counter(labels)) for labels in entity_list]

print(feature_list)

[{'ORG': 1}, {}, {'NORP': 1}]


In [None]:
# One column per entity label; labels a sentence lacks come out as NaN and
# are filled with 0.
entity_counts = pd.DataFrame(feature_list).fillna(0)

# Drop any entity columns left over from a previous run of this cell so that
# re-executing it does not append duplicate columns to dtf.
dtf = pd.concat(
    [dtf.drop(columns=entity_counts.columns, errors='ignore'), entity_counts],
    axis=1,
)
dtf.head()

Unnamed: 0,sentences,tokenized_sentence,removed_stop_words,lemmitized_sentence,word_count,char_count,avg_word_length,processed_sentence,enities_detected,ORG,NORP
0,Google is huge,"[Google, is, huge]","[Google, huge]","[Google, huge]",2,10,5.0,Google huge,[ORG],1.0,0.0
1,Spider is a creepy creature,"[Spider, is, a, creepy, creature]","[Spider, creepy, creature]","[Spider, creepy, creature]",3,20,6.666667,Spider creepy creature,[],0.0,0.0
2,Ajith is the best south indian movie star,"[Ajith, is, the, best, south, indian, movie, s...","[Ajith, best, south, indian, movie, star]","[Ajith, best, south, indian, movie, star]",6,29,4.833333,Ajith best south indian movie star,[NORP],0.0,1.0


In [None]:
# Show only the numeric features: size statistics plus per-label entity counts.
feature_columns = ['word_count', 'char_count', 'avg_word_length', 'NORP', 'ORG']
dtf[feature_columns]

Unnamed: 0,word_count,char_count,avg_word_length,NORP,ORG
0,2,10,5.0,0.0,1.0
1,3,20,6.666667,0.0,0.0
2,6,29,4.833333,1.0,0.0


Word vector


In [None]:
# Pretrained word vectors, fetched by name through gensim's downloader API.
GLOVE_MODEL_NAME = "glove-wiki-gigaword-300"
nlp2 = gensim_api.load(GLOVE_MODEL_NAME)



In [None]:
# Look up the embedding of each word and lay the vectors end to end in a
# single 1-D array (first word's vector, then the second's).
lem_sent = ['play', 'ball']
vec = np.concatenate([nlp2[token] for token in lem_sent])
vec

array([-3.6011e-01,  6.1678e-01, -4.8175e-01, -6.6795e-02, -2.5200e-02,
        1.8144e-01, -2.7854e-01,  3.1219e-01,  6.5527e-02, -8.2089e-01,
        1.5012e-01,  9.1332e-02, -1.1295e-01, -1.9711e-01,  9.6786e-02,
       -2.5849e-01, -1.1721e-01, -1.6041e-01,  2.1004e-01,  1.6739e-01,
       -1.8715e-01,  2.4142e-01, -1.8942e-01, -9.8103e-02, -5.6365e-02,
        1.7850e-01,  4.7334e-02, -4.4688e-02, -4.6457e-01,  9.9004e-02,
        9.8566e-02,  1.2547e-01,  2.8366e-01,  2.5806e-01, -1.6402e+00,
        9.7301e-02,  5.3586e-01,  6.2877e-01, -1.3648e-01, -1.2532e-01,
        4.2191e-01, -3.6628e-01, -2.1626e-01, -5.8103e-01,  8.5401e-02,
        1.5636e-01,  6.4898e-01,  1.9133e-01, -3.5306e-01, -8.5243e-03,
        2.0416e-01,  1.9537e-01, -2.1984e-01, -3.9992e-01, -3.2676e-01,
       -6.8891e-02, -6.9033e-02,  4.7597e-01,  4.4824e-01, -6.8132e-02,
        3.1752e-02,  2.4863e-01, -5.7836e-01, -3.1401e-02, -2.7701e-01,
       -4.7210e-01,  4.4395e-01, -3.9729e-02,  1.0607e-01, -2.74

In [None]:
 # Feature vector for the first sentence: its numeric features followed by
 # the word-vector values in `vec`.
 # The columns are named 'word_count' / 'char_count' where they are created;
 # 'word_length' / 'char_length' do not exist and would raise a KeyError.
 numeric_columns = ['word_count', 'char_count', 'avg_word_length', 'NORP', 'ORG']
 # Select the numeric columns before taking the row and cast to float so the
 # result is a numeric array rather than an object-dtype one.
 numeric_features = dtf[numeric_columns].iloc[0].to_numpy(dtype=float)
 feature_vec = np.concatenate((numeric_features, vec), axis=0)
 feature_vec

array([2, 10, 0.2, 0.0, 1.0, -0.36011001467704773, 0.6167799830436707,
       -0.4817500114440918, -0.0667949989438057, -0.025200000032782555,
       0.18143999576568604, -0.2785399854183197, 0.3121899962425232,
       0.06552699953317642, -0.8208900094032288, 0.15012000501155853,
       0.09133200347423553, -0.11294999718666077, -0.19710999727249146,
       0.0967859998345375, -0.2584899961948395, -0.11721000075340271,
       -0.16041000187397003, 0.21004000306129456, 0.16739000380039215,
       -0.1871500015258789, 0.24142000079154968, -0.18941999971866608,
       -0.09810300171375275, -0.0563649982213974, 0.1784999966621399,
       0.04733400046825409, -0.04468800127506256, -0.46456998586654663,
       0.09900400042533875, 0.09856600314378738, 0.12546999752521515,
       0.28365999460220337, 0.2580600082874298, -1.6402000188827515,
       0.09730099886655807, 0.535860002040863, 0.6287699937820435,
       -0.13648000359535217, -0.12532000243663788, 0.42190998792648315,
       -0.3662