In [2]:
import pandas as pd
import string

df = pd.read_csv('Twitter Sentiments.csv')
## drop the columns
df = df.drop(columns = ['id','label'],axis= 1)

df.head()

Unnamed: 0,tweet
0,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty
3,#model i love u take with u all the time in ...
4,factsguide: society now #motivation


## Convert to Lowercase


In [3]:
df['clean_text'] =  df['tweet'].str.lower()
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,factsguide: society now #motivation,factsguide: society now #motivation



## Removal of Punctuations

In [4]:
string.punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
def remove_punctuations(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('', '', punctuations))



In [6]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctuations(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so...
1,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i cant use ca...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,model i love u take with u all the time in u...
4,factsguide: society now #motivation,factsguide society now motivation


# Removal of Stopwords

In [7]:
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords

", ".join(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [8]:

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [9]:

df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids d...
1,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit cant use cause do...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model love u take u time urð± ðððð...
4,factsguide: society now #motivation,factsguide society motivation



## Removal of Frequent Words

In [10]:

from collections import Counter
word_count = Counter()
for text in df['clean_text']:
    for word in text.split():
        word_count[word] += 1

word_count.most_common(10)

[('user', 17473),
 ('love', 2647),
 ('day', 2198),
 ('happy', 1663),
 ('amp', 1582),
 ('im', 1139),
 ('u', 1136),
 ('time', 1110),
 ('life', 1086),
 ('like', 1042)]

In [11]:
FREQUENT_WORDS = set(word for (word, wc) in word_count.most_common(3))
def remove_freq_words(text):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

In [12]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_freq_words(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time urð± ðððð ð...
4,factsguide: society now #motivation,factsguide society motivation



## Removal of Rare Words

In [13]:

RARE_WORDS = set(word for (word, wc) in word_count.most_common()[:-10:-1])
RARE_WORDS

{'airwaves',
 'carnt',
 'chisolm',
 'ibizabringitonmallorcaholidayssummer',
 'isz',
 'mantle',
 'shirley',
 'youuuð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dâ\x9d¤ï¸\x8f',
 'ð\x9f\x99\x8fð\x9f\x8f¼ð\x9f\x8d¹ð\x9f\x98\x8eð\x9f\x8eµ'}

In [14]:

def remove_rare_words(text):
    return " ".join([word for word in text.split() if word not in RARE_WORDS])

In [15]:

df['clean_text'] = df['clean_text'].apply(lambda x: remove_rare_words(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time urð± ðððð ð...
4,factsguide: society now #motivation,factsguide society motivation


## Removal of Special characters

In [16]:

import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

In [17]:

df['clean_text'] = df['clean_text'].apply(lambda x: remove_spl_chars(x))
df.head()


Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation


## Stemming

In [18]:

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [19]:
df['stemmed_text'] = df['clean_text'].apply(lambda x: stem_words(x))
df.head()

Unnamed: 0,tweet,clean_text,stemmed_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...,father dysfunct selfish drag kid dysfunct run
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...,thank lyft credit cant use caus dont offer whe...
2,bihday your majesty,bihday majesty,bihday majesti
3,#model i love u take with u all the time in ...,model u take u time ur,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation,factsguid societi motiv


## Lemmatization & POS Tagging

In [20]:
import nltk
nltk.download('wordnet')

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text):
    # find pos tags
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [21]:
wordnet.NOUN


'n'

In [22]:
import nltk

# Download the required NLTK data package
nltk.download('averaged_perceptron_tagger_eng')

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text):
    # find pos tags
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])
df.head()


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Unnamed: 0,tweet,clean_text,stemmed_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...,father dysfunct selfish drag kid dysfunct run
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...,thank lyft credit cant use caus dont offer whe...
2,bihday your majesty,bihday majesty,bihday majesti
3,#model i love u take with u all the time in ...,model u take u time ur,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation,factsguid societi motiv


In [23]:
df.sample(frac=1).head(10)


Unnamed: 0,tweet,clean_text,stemmed_text
6215,"mean people suck, so i wrote the book on nice....",mean people suck wrote book nice nice guys fin...,mean peopl suck wrote book nice nice guy finis...
24071,11th year anniversary of michael jackson's v...,11th year anniversary michael jacksons vindica...,11th year anniversari michael jackson vindic
31134,looks like i will be seeing @user tomorrow aft...,looks like seeing tomorrow,look like see tomorrow
10366,we bought a boat! ð#bayliner #boat,bought boat bayliner boat,bought boat baylin boat
20646,@user just watched #agirllikeher n don't knw ...,watched agirllikeher n dont knw 2 say wasnt da...,watch agirllikeh n dont knw 2 say wasnt dat ba...
3869,a sad couple days in #orlando . gun violence i...,sad couple days orlando gun violence country c...,sad coupl day orlando gun violenc countri canc...
23012,#selfie aggressive bitch -,selfie aggressive bitch,selfi aggress bitch
216,@user will be here for a screening. i will mis...,screening miss sing songs potp shock treatment,screen miss sing song potp shock treatment
28993,herbal sleep remedies that work!! #altwaystoh...,herbal sleep remedies work altwaystoheal healt...,herbal sleep remedi work altwaystoh healthi heal
5093,great day learning about researching urban his...,great learning researching urban history thank...,great learn research urban histori thank micha...


## Removal of URLs


In [24]:
text = "https://www.hackersrealm.net is the URL of the channel Hackers Realm"


In [25]:

def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [26]:
remove_url(text)


' is the URL of the channel Hackers Realm'

## Removal of HTML Tags

In [27]:
text = "<html><body> <h1>Hackers Realm</h1> <p>This is NLP text preprocessing tutorial</p> </body></html>"


In [28]:
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

In [29]:
remove_html_tags(text)


' Hackers Realm This is NLP text preprocessing tutorial '

## Spelling Correction

In [30]:
!pip install pyspellchecker


Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m5.7/6.8 MB[0m [31m170.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m98.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [31]:
text = 'natur is a beuty'


In [32]:
from spellchecker import SpellChecker
spell = SpellChecker()

def correct_spellings(text):
    corrected_text = []
    misspelled_text = spell.unknown(text.split())
    # print(misspelled_text)
    for word in text.split():
        if word in misspelled_text:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)

    return " ".join(corrected_text)

In [33]:
correct_spellings(text)


'nature is a beauty'

## Feature Extraction from Text Data
##### Bag of Words
A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things: A vocabulary of known words. A measure of the presence of known words.

In [34]:
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']


In [35]:

from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(stop_words='english')


In [36]:

# fit the data
bow.fit(text_data)

In [37]:

# get the vocabulary list
bow.get_feature_names_out()

array(['extraction', 'feature', 'good', 'important', 'interested', 'nlp',
       'topic', 'tutorial'], dtype=object)

In [38]:

bow_features = bow.transform(text_data)
bow_features

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [40]:
bow_feature_array = bow_features.toarray()
bow_feature_array

array([[0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 2, 0, 0, 0, 1, 1],
       [1, 1, 0, 1, 0, 0, 1, 0]])

In [41]:
print(bow.get_feature_names_out())
for sentence, feature in zip(text_data, bow_feature_array):
    print(sentence)
    print(feature)


['extraction' 'feature' 'good' 'important' 'interested' 'nlp' 'topic'
 'tutorial']
I am interested in NLP
[0 0 0 0 1 1 0 0]
This is a good tutorial with good topic
[0 0 2 0 0 0 1 1]
Feature extraction is very important topic
[1 1 0 1 0 0 1 0]


## TF-IDF

TF-IDF stands for term frequency-inverse document frequency and it is a measure, used in the fields of information retrieval (IR) and machine learning, that can quantify the importance or relevance of string representations (words, phrases, lemmas, etc) in a document amongst a collection of documents

In [42]:
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']


In [43]:

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

In [44]:
# fit the data
tfidf.fit(text_data)

In [45]:

# get the vocabulary list
tfidf.vocabulary_

{'interested': 4,
 'nlp': 5,
 'good': 2,
 'tutorial': 7,
 'topic': 6,
 'feature': 1,
 'extraction': 0,
 'important': 3}

In [46]:
tfidf_features = tfidf.transform(text_data)
tfidf_features


<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [47]:
tfidf_feature_array = tfidf_features.toarray()
tfidf_feature_array


array([[0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.70710678, 0.        , 0.        ],
       [0.        , 0.        , 0.84678897, 0.        , 0.        ,
        0.        , 0.32200242, 0.42339448],
       [0.52863461, 0.52863461, 0.        , 0.52863461, 0.        ,
        0.        , 0.40204024, 0.        ]])

In [48]:

for sentence, feature in zip(text_data, tfidf_features):
    print(sentence)
    print(feature)

I am interested in NLP
  (0, 4)	0.7071067811865476
  (0, 5)	0.7071067811865476
This is a good tutorial with good topic
  (0, 2)	0.8467889668239188
  (0, 6)	0.3220024178194947
  (0, 7)	0.4233944834119594
Feature extraction is very important topic
  (0, 0)	0.5286346066596935
  (0, 1)	0.5286346066596935
  (0, 3)	0.5286346066596935
  (0, 6)	0.4020402441612698


## Word2vec
The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text. Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence.

In [49]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [50]:
# text data
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [51]:

# initialize and fit the data
model = Word2Vec(common_texts, vector_size=100, min_count=1)


In [52]:
model.wv['graph']


array([-8.6196875e-03,  3.6657380e-03,  5.1898835e-03,  5.7419385e-03,
        7.4669183e-03, -6.1676754e-03,  1.1056137e-03,  6.0472824e-03,
       -2.8400505e-03, -6.1735227e-03, -4.1022300e-04, -8.3689485e-03,
       -5.6000124e-03,  7.1045388e-03,  3.3525396e-03,  7.2256695e-03,
        6.8002474e-03,  7.5307419e-03, -3.7891543e-03, -5.6180597e-04,
        2.3483764e-03, -4.5190323e-03,  8.3887316e-03, -9.8581640e-03,
        6.7646410e-03,  2.9144168e-03, -4.9328315e-03,  4.3981876e-03,
       -1.7395747e-03,  6.7113843e-03,  9.9648498e-03, -4.3624435e-03,
       -5.9933780e-04, -5.6956373e-03,  3.8508223e-03,  2.7866268e-03,
        6.8910765e-03,  6.1010956e-03,  9.5384968e-03,  9.2734173e-03,
        7.8980681e-03, -6.9895042e-03, -9.1558648e-03, -3.5575271e-04,
       -3.0998408e-03,  7.8943167e-03,  5.9385742e-03, -1.5456629e-03,
        1.5109634e-03,  1.7900408e-03,  7.8175711e-03, -9.5101865e-03,
       -2.0553112e-04,  3.4691966e-03, -9.3897223e-04,  8.3817719e-03,
      

In [53]:
model.wv.most_similar('graph')


[('trees', 0.06797593832015991),
 ('survey', 0.03364057466387749),
 ('minors', 0.009391162544488907),
 ('human', 0.008315935730934143),
 ('eps', 0.0045030261389911175),
 ('system', -0.010839177295565605),
 ('user', -0.023712964728474617),
 ('computer', -0.09575343132019043),
 ('response', -0.11412722617387772),
 ('time', -0.11555545777082443)]

## Word Embedding using Glove
GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space

In [54]:
import pandas as pd
import string
from nltk.corpus import stopwords
df = pd.read_csv('Twitter Sentiments.csv')
# drop the columns
df = df.drop(columns=['id', 'label'], axis=1)

df['clean_text'] = df['tweet'].str.lower()

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))

import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text
df['clean_text'] = df['clean_text'].apply(lambda x: remove_spl_chars(x))

df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids ...
1,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit can t use cause ...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model love u take u time ur
4,factsguide: society now #motivation,factsguide society motivation


In [55]:
!pip install tensorflow



In [56]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


In [57]:

# tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_text'])

word_index = tokenizer.word_index
vocab_size = len(word_index)
vocab_size


39085

In [58]:
max(len(data) for data in df['clean_text'])


131

In [59]:
# padding text data
sequences = tokenizer.texts_to_sequences(df['clean_text'])
padded_seq = pad_sequences(sequences, maxlen=131, padding='post', truncating='post')

In [60]:
padded_seq[0]


array([    1,    28, 15330,  2630,  6365,   184,  7786,   385,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [61]:

# create embedding index
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [62]:
embedding_index['good']


array([-0.030769 ,  0.11993  ,  0.53909  , -0.43696  , -0.73937  ,
       -0.15345  ,  0.081126 , -0.38559  , -0.68797  , -0.41632  ,
       -0.13183  , -0.24922  ,  0.441    ,  0.085919 ,  0.20871  ,
       -0.063582 ,  0.062228 , -0.051234 , -0.13398  ,  1.1418   ,
        0.036526 ,  0.49029  , -0.24567  , -0.412    ,  0.12349  ,
        0.41336  , -0.48397  , -0.54243  , -0.27787  , -0.26015  ,
       -0.38485  ,  0.78656  ,  0.1023   , -0.20712  ,  0.40751  ,
        0.32026  , -0.51052  ,  0.48362  , -0.0099498, -0.38685  ,
        0.034975 , -0.167    ,  0.4237   , -0.54164  , -0.30323  ,
       -0.36983  ,  0.082836 , -0.52538  , -0.064531 , -1.398    ,
       -0.14873  , -0.35327  , -0.1118   ,  1.0912   ,  0.095864 ,
       -2.8129   ,  0.45238  ,  0.46213  ,  1.6012   , -0.20837  ,
       -0.27377  ,  0.71197  , -1.0754   , -0.046974 ,  0.67479  ,
       -0.065839 ,  0.75824  ,  0.39405  ,  0.15507  , -0.64719  ,
        0.32796  , -0.031748 ,  0.52899  , -0.43886  ,  0.6740

In [65]:
# create embedding matrix
embedding_matrix = np.zeros((vocab_size+1, 100))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None and embedding_vector.shape == (100,): # Check for shape compatibility
        embedding_matrix[i] = embedding_vector
    # else:  # Optional: Handle words not found in GloVe embeddings
    #    print(f"Word '{word}' not found in GloVe embeddings or has incorrect shape")

In [66]:
embedding_matrix.shape


(39086, 100)

## Named Entity Recognition

In [None]:
 !pip install -U pip setuptools wheel
 !pip install -U spacy
 !python -m spacy download en_core_web_sm

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Collecting setuptools
  Downloading setuptools-75.6.0-py3-none-any.whl.metadata (6.7 kB)
Collecting wheel
  Downloading wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading setuptools-75.6.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading wheel-0.45.1-py3-none-any.whl (72 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.5/72.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wheel, setuptools, pip
  Attempting uninstall: wheel
    Found existing installation: wheel 0.45.0
    Uninstalling wheel-0.45.0:
      Successfully uninstalled wheel-0.45.0
  Attempting uninstall: set

Collecting spacy
  Downloading spacy-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting blis<1.1.0,>=1.0.0 (from thinc<8.4.0,>=8.3.0->spacy)
  Downloading blis-1.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting numpy>=1.19.0 (from spacy)
  Downloading numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Downloading spacy-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.1/29.1 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading thinc-8.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m74.6 MB/s[0m eta [36m0:00:00[0m
[

In [67]:
import spacy
from spacy import displacy

In [68]:
NER = spacy.load('en_core_web_sm')


In [69]:
text = 'Mark Zuckerberg is one of the founders of Facebook, a company from the United States'


In [70]:
ner_text = NER(text)


In [71]:

for word in ner_text.ents:
    print(word.text, word.label_)

Mark Zuckerberg PERSON
one CARDINAL
Facebook ORG
the United States GPE


In [72]:
spacy.explain('GPE')


'Countries, cities, states'

In [73]:
spacy.explain('CARDINAL')


'Numerals that do not fall under another type'

In [74]:
displacy.render(ner_text, style='ent', jupyter=True)


## Data Augmentation for Text

In [76]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/410.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [77]:
import nlpaug.augmenter.word as naw


In [78]:
text = 'The quick brown fox jumps over a lazy dog'


## Synonym Replacement

In [79]:

syn_aug = naw.synonym.SynonymAug(aug_src='wordnet')
synonym_text = syn_aug.augment(text)
print('Synonym Text:', synonym_text)

Synonym Text: ['The quick brown university fox jumps terminated a faineant dog']


## Random Substitution


In [80]:
sub_aug = naw.random.RandomWordAug(action='substitute')
substituted_text = sub_aug.augment(text)
print('Substituted Text:', substituted_text)

Substituted Text: ['The _ brown fox jumps _ a lazy _']


## Random Deletion


In [81]:
del_aug = naw.random.RandomWordAug(action='delete')
deletion_text = del_aug.augment(text)
print('Deletion Text:', deletion_text)

Deletion Text: ['The quick brown jumps a lazy']


## Random Swap


In [82]:
swap_aug = naw.random.RandomWordAug(action='swap')
swap_text = swap_aug.augment(text)
print('Swap Text:', swap_text)

Swap Text: ['The quick brown fox over jumps a lazy dog']


## Back Translation


In [88]:
import nlpaug.augmenter.word as naw

# translate original text to other language (german) and convert back to english language
back_trans_aug = naw.back_translation.BackTranslationAug()
back_trans_text = back_trans_aug.augment(text)
print('Back Translated Text:', back_trans_text)

Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-en-de and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

vocab-src.json:   0%|          | 0.00/849k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/315k [00:00<?, ?B/s]

Back Translated Text: ['The speedy brown fox jumps over a lazy dog']


In [87]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1
