In [1]:
import pandas as pd
import string 

In [2]:
df = pd.read_csv('Data/Twitter Sentiments.csv')

In [3]:
df.head(3)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty


In [4]:
df = df.drop(columns=['id', 'label'], axis=1)
df.head(3)

Unnamed: 0,tweet
0,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty


## Convert to lowercase

In [5]:
df['clean_text'] = df['tweet'].str.lower()
df.head(3)

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty,bihday your majesty


## Removal of Punctuations

In [6]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
def remove_punctuations(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('','',punctuations))

In [8]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctuations(x))
df.head(3)

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so...
1,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i cant use ca...
2,bihday your majesty,bihday your majesty


## Removal of Stopwords

In [9]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [10]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [11]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head(3)

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids d...
1,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit cant use cause do...
2,bihday your majesty,bihday majesty


## Removal of Frequent Words

In [12]:
from collections import Counter
word_count = Counter()
for text in df['clean_text']:
    for word in text.split():
        word_count[word] += 1

word_count.most_common(10)

[('user', 17473),
 ('love', 2647),
 ('day', 2198),
 ('happy', 1663),
 ('amp', 1582),
 ('im', 1139),
 ('u', 1136),
 ('time', 1110),
 ('life', 1086),
 ('like', 1042)]

In [13]:
FREQUENT_WORDS = set(word for (word, wc) in word_count.most_common(3))
def remove_freq_words(text):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])


In [14]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_freq_words(x))
df.head(3)

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty


## Removal of Rare Words

In [15]:
RARE_WORDS = set(word for (word, wc) in word_count.most_common()[:-10:-1])
RARE_WORDS

{'airwaves',
 'carnt',
 'chisolm',
 'ibizabringitonmallorcaholidayssummer',
 'isz',
 'mantle',
 'shirley',
 'youuuð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dâ\x9d¤ï¸\x8f',
 'ð\x9f\x99\x8fð\x9f\x8f¼ð\x9f\x8d¹ð\x9f\x98\x8eð\x9f\x8eµ'}

In [16]:
def remove_rare_words(text):
    return " ".join([word for word in text.split() if word not in RARE_WORDS])


In [17]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_rare_words(x))
df.head(3)

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty


## Removal of Special characters

In [18]:
import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]','',text)
    text = re.sub('\s+', '', text)
    return text


In [19]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_spl_chars(x))
df.head(3)

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,fatherdysfunctionalselfishdragskidsdysfunctionrun
1,@user @user thanks for #lyft credit i can't us...,thankslyftcreditcantusecausedontofferwheelchai...
2,bihday your majesty,bihdaymajesty


## Stemming

In [20]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])


In [21]:
df['stemmed_text'] = df['clean_text'].apply(lambda x: stem_words(x))
df.head(3)

Unnamed: 0,tweet,clean_text,stemmed_text
0,@user when a father is dysfunctional and is s...,fatherdysfunctionalselfishdragskidsdysfunctionrun,fatherdysfunctionalselfishdragskidsdysfunctionrun
1,@user @user thanks for #lyft credit i can't us...,thankslyftcreditcantusecausedontofferwheelchai...,thankslyftcreditcantusecausedontofferwheelchai...
2,bihday your majesty,bihdaymajesty,bihdaymajesti


## Lemmatization & POS Tagging

In [22]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(text):
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])


In [23]:
wordnet.NOUN

'n'

In [24]:
df['lemmatized_text'] = df['clean_text'].apply(lambda x: lemmatize_words(x))
df.head(3)

Unnamed: 0,tweet,clean_text,stemmed_text,lemmatized_text
0,@user when a father is dysfunctional and is s...,fatherdysfunctionalselfishdragskidsdysfunctionrun,fatherdysfunctionalselfishdragskidsdysfunctionrun,fatherdysfunctionalselfishdragskidsdysfunctionrun
1,@user @user thanks for #lyft credit i can't us...,thankslyftcreditcantusecausedontofferwheelchai...,thankslyftcreditcantusecausedontofferwheelchai...,thankslyftcreditcantusecausedontofferwheelchai...
2,bihday your majesty,bihdaymajesty,bihdaymajesti,bihdaymajesty


In [25]:
df.sample(frac=1).head(3)

Unnamed: 0,tweet,clean_text,stemmed_text,lemmatized_text
12498,yet another #lonely #depressed #sleepless ni...,yetanotherlonelydepressedsleeplessnight,yetanotherlonelydepressedsleeplessnight,yetanotherlonelydepressedsleeplessnight
28868,#likescam bull hill climb: you have to reach...,likescambullhillclimbreachtargetcompletetasksu...,likescambullhillclimbreachtargetcompletetasksu...,likescambullhillclimbreachtargetcompletetasksu...
14855,"#iala #superduperexcited @ lanham, maryland",ialasuperduperexcitedlanhammaryland,ialasuperduperexcitedlanhammaryland,ialasuperduperexcitedlanhammaryland


## Removal of URLs

In [26]:
text = "https://www.hackersrealm.net is the URL of the channel Hackers Realm"

In [27]:
def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [28]:
remove_url(text)

' is the URL of the channel Hackers Realm'

## Remove of HTML Tags

In [29]:
text = "<html><body> <h1>Hackers Realm</h1> <p>This is NLP text preprocessing tutorial</p> </body></html>"

In [30]:
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

In [31]:
remove_html_tags(text)

' Hackers Realm This is NLP text preprocessing tutorial '

## Spelling Correction

In [32]:
pip install pyspellchecker

Note: you may need to restart the kernel to use updated packages.


In [33]:
text = 'natur is a beuty'

In [34]:
from spellchecker import SpellChecker
spell = SpellChecker()

def correct_spellings(text):
    corrected_text = []
    misspelled_text = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_text:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [35]:
correct_spellings(text)

'nature is a beauty'

## Feature Extraction from Text Data
### Bag of Words
A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things: A vocabulary of known words. A measure of the presence of known words.

In [36]:
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']


In [37]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(stop_words='english')

In [38]:
bow.fit(text_data)

In [39]:
bow.get_feature_names_out()

array(['extraction', 'feature', 'good', 'important', 'interested', 'nlp',
       'topic', 'tutorial'], dtype=object)

In [40]:
bow_features = bow.transform(text_data)
bow_features

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [41]:
bow_feature_array = bow_features.toarray()
bow_feature_array

array([[0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 2, 0, 0, 0, 1, 1],
       [1, 1, 0, 1, 0, 0, 1, 0]], dtype=int64)

In [42]:
print(bow.get_feature_names_out())
for sentence, feature in zip(text_data, bow_feature_array):
    print(sentence)
    print(feature)
    

['extraction' 'feature' 'good' 'important' 'interested' 'nlp' 'topic'
 'tutorial']
I am interested in NLP
[0 0 0 0 1 1 0 0]
This is a good tutorial with good topic
[0 0 2 0 0 0 1 1]
Feature extraction is very important topic
[1 1 0 1 0 0 1 0]


### TF-IDF (Term Frequency/Inverse Document Frequency)
TF-IDF stands for term frequency-inverse document frequency and it is a measure, used in the fields of information retrieval (IR) and machine learning, that can quantify the importance or relevance of string representations (words, phrases, lemmas, etc) in a document amongst a collection of documents.

In [43]:
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

In [45]:
tfidf.fit(text_data)

In [46]:
tfidf.vocabulary_

{'interested': 4,
 'nlp': 5,
 'good': 2,
 'tutorial': 7,
 'topic': 6,
 'feature': 1,
 'extraction': 0,
 'important': 3}

In [47]:
tfidf_features = tfidf.transform(text_data)
tfidf_features

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [48]:
tfidf_feature_array = tfidf_features.toarray()
tfidf_feature_array

array([[0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.70710678, 0.        , 0.        ],
       [0.        , 0.        , 0.84678897, 0.        , 0.        ,
        0.        , 0.32200242, 0.42339448],
       [0.52863461, 0.52863461, 0.        , 0.52863461, 0.        ,
        0.        , 0.40204024, 0.        ]])

In [49]:
for sentence, feature in zip(text_data, tfidf_features):
    print(sentence)
    print(feature)

I am interested in NLP
  (0, 4)	0.7071067811865476
  (0, 5)	0.7071067811865476
This is a good tutorial with good topic
  (0, 2)	0.8467889668239188
  (0, 6)	0.3220024178194947
  (0, 7)	0.4233944834119594
Feature extraction is very important topic
  (0, 0)	0.5286346066596935
  (0, 1)	0.5286346066596935
  (0, 3)	0.5286346066596935
  (0, 6)	0.4020402441612698


### Word2vec
The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text. Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence.

In [50]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [51]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [52]:
#initialize and fit the data
model = Word2Vec(common_texts, vector_size=100, min_count=1)

In [53]:
model.wv['graph']

array([-8.6196875e-03,  3.6657380e-03,  5.1898835e-03,  5.7419385e-03,
        7.4669183e-03, -6.1676754e-03,  1.1056137e-03,  6.0472824e-03,
       -2.8400505e-03, -6.1735227e-03, -4.1022300e-04, -8.3689485e-03,
       -5.6000124e-03,  7.1045388e-03,  3.3525396e-03,  7.2256695e-03,
        6.8002474e-03,  7.5307419e-03, -3.7891543e-03, -5.6180597e-04,
        2.3483764e-03, -4.5190323e-03,  8.3887316e-03, -9.8581640e-03,
        6.7646410e-03,  2.9144168e-03, -4.9328315e-03,  4.3981876e-03,
       -1.7395747e-03,  6.7113843e-03,  9.9648498e-03, -4.3624435e-03,
       -5.9933780e-04, -5.6956373e-03,  3.8508223e-03,  2.7866268e-03,
        6.8910765e-03,  6.1010956e-03,  9.5384968e-03,  9.2734173e-03,
        7.8980681e-03, -6.9895042e-03, -9.1558648e-03, -3.5575271e-04,
       -3.0998408e-03,  7.8943167e-03,  5.9385742e-03, -1.5456629e-03,
        1.5109634e-03,  1.7900408e-03,  7.8175711e-03, -9.5101865e-03,
       -2.0553112e-04,  3.4691966e-03, -9.3897223e-04,  8.3817719e-03,
      

In [54]:
model.wv.most_similar('graph')

[('user', 0.06793875247240067),
 ('survey', 0.03364057466387749),
 ('eps', 0.009391162544488907),
 ('human', 0.008315935730934143),
 ('minors', 0.0045030261389911175),
 ('system', -0.010839177295565605),
 ('trees', -0.023671656847000122),
 ('computer', -0.09575343877077103),
 ('time', -0.11410722136497498),
 ('response', -0.11557211726903915)]

### Word Embedding using Glove
GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated alobal word-word co-occurance statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. 

Download Link: [Stanford's GloVe 100d word embeddings](https://www.kaggle.com/datasets/danielwillgeorge/glove6b100dtxt)

In [55]:
import pandas as pd
import string 
from nltk.corpus import stopwords

df = pd.read_csv('Data/Twitter Sentiments.csv')

In [56]:
df = df.drop(columns=['id', 'label'], axis=1)

In [57]:
df['clean_text'] = df['tweet'].str.lower()

In [58]:
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))


In [59]:
import re 
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ',text)
    return text
df['clean_text'] = df['clean_text'].apply(lambda x: remove_spl_chars(x))
df.head(3)

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids ...
1,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit can t use cause ...
2,bihday your majesty,bihday majesty


In [60]:
#pip install keras
#pip install tensorflow

In [61]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [62]:
import numpy as np
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_text'])

word_index = tokenizer.word_index
vocab_size = len(word_index)
vocab_size

39085

In [63]:
#word index
max(len(data) for data in df['clean_text'])

131

In [64]:
#padding text data
sequences = tokenizer.texts_to_sequences(df['clean_text'])
padded_sequences = pad_sequences(sequences, maxlen=131, padding='post', truncating='post')

In [65]:
padded_sequences[0]

array([    1,    28, 15330,  2630,  6365,   184,  7786,   385,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [66]:
#create embedding index
embedding_index = {}
with open('Data/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs 

In [67]:
embedding_index['good']

array([-0.030769 ,  0.11993  ,  0.53909  , -0.43696  , -0.73937  ,
       -0.15345  ,  0.081126 , -0.38559  , -0.68797  , -0.41632  ,
       -0.13183  , -0.24922  ,  0.441    ,  0.085919 ,  0.20871  ,
       -0.063582 ,  0.062228 , -0.051234 , -0.13398  ,  1.1418   ,
        0.036526 ,  0.49029  , -0.24567  , -0.412    ,  0.12349  ,
        0.41336  , -0.48397  , -0.54243  , -0.27787  , -0.26015  ,
       -0.38485  ,  0.78656  ,  0.1023   , -0.20712  ,  0.40751  ,
        0.32026  , -0.51052  ,  0.48362  , -0.0099498, -0.38685  ,
        0.034975 , -0.167    ,  0.4237   , -0.54164  , -0.30323  ,
       -0.36983  ,  0.082836 , -0.52538  , -0.064531 , -1.398    ,
       -0.14873  , -0.35327  , -0.1118   ,  1.0912   ,  0.095864 ,
       -2.8129   ,  0.45238  ,  0.46213  ,  1.6012   , -0.20837  ,
       -0.27377  ,  0.71197  , -1.0754   , -0.046974 ,  0.67479  ,
       -0.065839 ,  0.75824  ,  0.39405  ,  0.15507  , -0.64719  ,
        0.32796  , -0.031748 ,  0.52899  , -0.43886  ,  0.6740

In [68]:
#create embedding matrix
embedding_matrix = np.zeros((vocab_size+1, 100))
for word,i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        

In [69]:
embedding_matrix.shape

(39086, 100)

### Named Entity Recognition

In [70]:
# %pip install -U pip setuptools wheel
# %pip install -U spacy 


In [71]:
import spacy 
from spacy import displacy 

In [72]:
# !python -m spacy download en_core_web_sm


In [73]:
NER = spacy.load('en_core_web_sm')

In [74]:
text = "Mark Zuckerberg is one of the founders of Facebook, a company from the United States"

ner_text = NER(text)

In [75]:
for word in ner_text.ents:
    print(word.text, word.label_)

Mark Zuckerberg PERSON
one CARDINAL
the United States GPE


In [76]:
spacy.explain('GPE')

'Countries, cities, states'

In [77]:
spacy.explain('CARDINAL')

'Numerals that do not fall under another type'

In [78]:
displacy.render(ner_text, style='ent', jupyter=True)

### Data Augmentation for Text

Uses:
1. Increase the dataset size by creating more samples
2. Reduce overfitting 
3. Improve model generalization 
4. Handling imbalance dataset

In [79]:
# %pip install nlpaug
# %pip install sacremoses

In [80]:
import nlpaug.augmenter.word as naw

  from .autonotebook import tqdm as notebook_tqdm


In [81]:
text = 'The quick brown fox jumps over a lazy dog'

#### Synonym Replacement

In [82]:
syn_aug = naw.synonym.SynonymAug(aug_src='wordnet')
synonym_text = syn_aug.augment(text)
print('Synonym Text:', synonym_text)

Synonym Text: ['The warm brownness fox leap over a lazy dog']


#### Random Substitution

In [83]:
sub_aug = naw.random.RandomWordAug(action='substitute')
substituted_text = sub_aug.augment(text)
print('Substituted Text:', substituted_text)

Substituted Text: ['The quick brown _ jumps _ a _ dog']


#### Random Deletion

In [84]:
del_aug = naw.random.RandomWordAug(action='delete')
deletion_text = del_aug.augment(text)
print('Deletion Text:', deletion_text)

Deletion Text: ['The quick brown jumps over dog']


#### Random Swap

In [85]:
swap_aug = naw.random.RandomWordAug(action='swap')
swap_text = swap_aug.augment(text)
print('Swap Text:', swap_text)

Swap Text: ['The quick fox brown jumps over a lazy dog']


#### Back Translation

translate original text to other language (german) and convert back to english language

In [86]:
# pip install torch
# %pip install transformers

In [87]:
back_trans_aug = naw.back_translation.BackTranslationAug()
back_trans_text = back_trans_aug.augment(text)
print('Back Translated Text:', back_trans_text)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-en-de and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: 

Back Translated Text: ['The speedy brown fox jumps over a lazy dog']


In [88]:
print('Back Translated Text:', back_trans_text)

Back Translated Text: ['The speedy brown fox jumps over a lazy dog']
