# Building a Spell Corrector/Text Suggestor using fastText

### Importing the libraries

In [1]:
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import FastText
import io
import collections

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amankedia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amankedia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Reading the data

In [2]:
words = []
data = []
with io.open('comments.txt', 'r') as file:
    for entry in file:
        entry = entry.strip()
        data.append(entry)
        words.extend(entry.split())

### Checking for common terms in the data

In [3]:
unique_words = []
unique_words = collections.Counter(words)
unique_words.most_common(10)

[('the', 445892),
 ('to', 288753),
 ('of', 219279),
 ('and', 207335),
 ('a', 201765),
 ('I', 182618),
 ('is', 164602),
 ('you', 157025),
 ('that', 140495),
 ('in', 130244)]

In [4]:
data[:3]

['"Explanation',
 'Why the edits made under my username Hardcore Metallica Fan were reverted? They weren\'t vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don\'t remove the template from the talk page since I\'m retired now.89.205.38.27"',
 "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)"]

### Data Preprocessing

In [5]:
def text_clean(corpus):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = []
    for row in corpus:
        qs = []
        for word in row.split():
            p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
            p1 = p1.lower()
            qs.append(p1)
        cleaned_corpus.append(' '.join(qs))
    return cleaned_corpus

In [6]:
def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    if cleaning == True:
        corpus = text_clean(corpus)
    
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    
    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else :
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    
    corpus = [' '.join(x) for x in corpus]
        

    return corpus

In [7]:
data = preprocess(data)

### Data conversion into formation expected by fastText

In [8]:
preprocessed_data = []
for line in data:
    if line != "":
        preprocessed_data.append(line.split())

### Building the fastText model

In [9]:
model = FastText(size=300, window=3, min_count=1, min_n=1, max_n=5)

In [10]:
model.build_vocab(sentences=preprocessed_data)

In [11]:
len(model.wv.vocab)

182228

In [12]:
model.train(sentences=preprocessed_data, total_examples=len(preprocessed_data), epochs=10)

### Checking for top 5 similar terms returned by the model for specific words (Can be spell corrections and suggestions)

In [13]:
model.wv.most_similar('eplain', topn=5)

[('xplain', 0.8716608881950378),
 ('eexplain', 0.8298567533493042),
 ('explain', 0.823542058467865),
 ('plain', 0.8120548725128174),
 ('reexplain', 0.8073801398277283)]

In [14]:
model.wv.most_similar('reminder', topn=5)

[('remainder', 0.918642520904541),
 ('rejoinder', 0.9136584401130676),
 ('minderbinder', 0.9089889526367188),
 ('reminde', 0.9064415693283081),
 ('reindeer', 0.9025716781616211)]

In [15]:
model.wv.most_similar('relevnt', topn=5)

[('relevant', 0.8245096206665039),
 ('relev', 0.8121916055679321),
 ('releveant', 0.8010507822036743),
 ('releant', 0.7998696565628052),
 ('relevanmt', 0.7993144989013672)]

In [16]:
model.wv.most_similar('purse', topn=5)

[('purpse', 0.926501989364624),
 ('cpurse', 0.9056394696235657),
 ('pure', 0.8901852369308472),
 ('pursue', 0.8851768970489502),
 ('pursuit', 0.8713114261627197)]

## fastText and Word Mover's Distance

In [17]:
sentence_1 = "Obama speaks to the media in Illinois"
sentence_2 = "President greets the press in Chicago"
sentence_3 = "Apple is my favorite company"

In [18]:
word_mover_distance = model.wmdistance(sentence_1, sentence_2)
word_mover_distance

  """Entry point for launching an IPython kernel.


16.234933690155827

In [19]:
word_mover_distance = model.wmdistance(sentence_2, sentence_3)
word_mover_distance

  """Entry point for launching an IPython kernel.


21.107024504493854