## Stopword operations on email dataset using Spacy, Gensim and NLTK
1. Display the existing stop words in the default list
2. Remove stop words from the default list
3. Add stop words to the default list
4. Apply stop word elimination to the email dataset

In [1]:
import pandas as pd

# Read only the 'text' and 'spam' columns of the CSV, then preview the
# first five rows.
data = pd.read_csv(
    'Resources/emails.csv',
    usecols=['text', 'spam'],
)
print(data.head(n=5))

                                                text spam
0  Subject: naturally it's your irresistible your...    1
1  Subject: the stock trading gunslinger  fanny i...    1
2  Subject: unbelievable new homes made easy  im ...    1
3  Subject: 4 color printing special  request add...    1
4  Subject: do not have money , get software cds ...    1


## Using Spacy

In [2]:
import spacy

In [None]:
# Display Existing Stopwords
# Load the 'en_core_web_sm' pipeline and take the stop word collection from
# its language defaults. The name is bound to that attribute directly (no
# copy is made), so the in-place remove/add calls in the following cells
# modify the same object.
nlp = spacy.load('en_core_web_sm')
stop_words = nlp.Defaults.stop_words
print(stop_words)

In [7]:
# Remove a stopword
# Use discard() rather than remove(): the set is edited in place, so
# re-running this cell after 'will' is already gone would raise KeyError
# with remove(). discard() is a no-op when the word is absent.
stop_words.discard('will')

In [8]:
# Add a stop word (in place).
# The elimination step compares exact token text, so the capitalised form
# 'Subject' is what has to be present in the set.
stop_words.update({'Subject'})

In [9]:
# Perform Stopword Elimination
# Run the email at index 0 through the pipeline, keep every token whose
# exact text is not in the stop word set, and print the kept tokens joined
# by single spaces.
doc = nlp(data['text'][0])
text = [token.text for token in doc if token.text not in stop_words]
print(' '.join(text))

: naturally irresistible corporate identity   lt hard recollect company :   market suqgestions information isoverwhelminq ; good   catchy logo , stylish statlonery outstanding website   will task easier .   promise havinq ordered iogo   company will automaticaily world ieader : isguite ciear   good products , effective business organization practicable aim   will hotat nowadays market ; promise marketing efforts   will effective . list clear   benefits : creativeness : hand - , original logos , specially   reflect distinctive company image . convenience : logo stationery   provided formats ; easy - - use content management system letsyou   change website content structure . promptness : logo drafts business days . affordability :   marketing break - gaps budget . 100 % satisfaction   guaranteed : provide unlimited changes extra fees   surethat will love result collaboration . look   portfolio _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

## Using Gensim

In [10]:
import gensim

In [11]:
# Bind gensim's built-in stop word collection to the working name.
from gensim.parsing.preprocessing import STOPWORDS as stop_words

In [None]:
# Display the stop word collection currently bound to `stop_words`
print(stop_words)

In [18]:
# Remove a stop word: the set difference builds a new collection without
# 'yourselves', and the name is rebound to it.
stop_words = stop_words - {'yourselves'}

In [19]:
# Add a stop word: the set union builds a new collection that also contains
# 'Subject', and the name is rebound to it.
stop_words = stop_words | {'Subject'}

In [20]:
# Perform Stopword Elimination
from gensim.utils import tokenize

# Tokenize the email at index 0, then keep only the tokens that are not in
# the stop word collection.
tokens = list(tokenize(data['text'][0]))
tokens_wo_sw = [token for token in tokens if token not in stop_words]
print(tokens_wo_sw)

['naturally', 's', 'irresistible', 'corporate', 'identity', 'lt', 'hard', 'recollect', 'company', 'market', 'suqgestions', 'information', 'isoverwhelminq', 'good', 'catchy', 'logo', 'stylish', 'statlonery', 'outstanding', 'website', 'task', 'easier', 't', 'promise', 'havinq', 'ordered', 'iogo', 'company', 'automaticaily', 'world', 'ieader', 'isguite', 'ciear', 'good', 'products', 'effective', 'business', 'organization', 'practicable', 'aim', 'hotat', 'nowadays', 'market', 'promise', 'marketing', 'efforts', 'effective', 'list', 'clear', 'benefits', 'creativeness', 'hand', 'original', 'logos', 'specially', 'reflect', 'distinctive', 'company', 'image', 'convenience', 'logo', 'stationery', 'provided', 'formats', 'easy', 'use', 'content', 'management', 'letsyou', 'change', 'website', 'content', 'structure', 'promptness', 'll', 'logo', 'drafts', 'business', 'days', 'affordability', 'marketing', 'break', 'shouldn', 't', 'gaps', 'budget', 'satisfaction', 'guaranteed', 'provide', 'unlimited', '

## Using NLTK


In [21]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [22]:
# Display All Stopwords
# Collect NLTK's English stop words into a set so the following cells can
# use set difference/union and membership tests on it.
stop_words = {*stopwords.words('english')}
print(stop_words)

{'during', 'between', "won't", 'over', 'same', 'ours', 'should', 'me', 'we', 'wouldn', 'and', 'to', 'not', 'myself', 'which', 'itself', 'above', 'for', 'out', 'of', 'just', "isn't", 'yours', 'some', "you're", 'do', "you'd", 'into', 'most', 'will', 'other', 'on', 'your', 'they', 'he', 'didn', 'through', 'as', 'very', 'won', 'this', 'wasn', 'm', 'but', 'few', 'did', 'were', 'doesn', 'mustn', 'theirs', 'before', "doesn't", "mustn't", 'him', 'herself', 'them', 'there', "hadn't", 'how', 'under', 'by', 'that', 'has', 'being', 'their', 've', 'is', 'ma', 'than', 'o', 'while', 'hers', 'any', 'mightn', 'it', 'yourself', 'or', 'the', 'all', 'more', "needn't", 'who', 'needn', "weren't", 'down', "that'll", 'only', 's', 'because', 'when', 'in', "shan't", 'at', 'its', 'further', 'was', 'had', 'shouldn', 'below', 'been', 'am', "couldn't", 'about', 'such', 'where', 'i', 'll', 'yourselves', 'own', 'why', 'against', 'up', 'now', 't', "she's", 'both', 'her', "haven't", 'isn', 'his', 'don', 'if', 'no', "di

In [23]:
# Remove a stop word: rebind the name to a new set that no longer
# contains 'during'.
stop_words = stop_words - {'during'}

In [25]:
# Add a stop word: rebind the name to a new set that also contains
# 'Subject'.
stop_words = stop_words | {'Subject'}

In [26]:
# Eliminating Stopwords
# Tokenize the email at index 0 with NLTK and drop every token found in the
# stop word set.
tokenz = word_tokenize(data['text'][0])
tokenz_wo_sw = [token for token in tokenz if token not in stop_words]
print(tokenz_wo_sw)

[':', 'naturally', "'s", 'irresistible', 'corporate', 'identity', 'lt', 'really', 'hard', 'recollect', 'company', ':', 'market', 'full', 'suqgestions', 'information', 'isoverwhelminq', ';', 'good', 'catchy', 'logo', ',', 'stylish', 'statlonery', 'outstanding', 'website', 'make', 'task', 'much', 'easier', '.', "n't", 'promise', 'havinq', 'ordered', 'iogo', 'company', 'automaticaily', 'become', 'world', 'ieader', ':', 'isguite', 'ciear', 'without', 'good', 'products', ',', 'effective', 'business', 'organization', 'practicable', 'aim', 'hotat', 'nowadays', 'market', ';', 'promise', 'marketing', 'efforts', 'become', 'much', 'effective', '.', 'list', 'clear', 'benefits', ':', 'creativeness', ':', 'hand', '-', 'made', ',', 'original', 'logos', ',', 'specially', 'done', 'reflect', 'distinctive', 'company', 'image', '.', 'convenience', ':', 'logo', 'stationery', 'provided', 'formats', ';', 'easy', '-', '-', 'use', 'content', 'management', 'system', 'letsyou', 'change', 'website', 'content', 'e