# PREPROCESSING IN TEXT MINING

### IMPORT MODULES

In [4]:
!pip install spacy



In [30]:
import pandas as pd
import re
import nltk
import spacy
import string
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [31]:
# Location of the bug-report CSV; adjust this path for your own machine.
DATA_PATH = "C:/Users/Win 10/Downloads/bugtest.csv"

# Only the first 30 rows are read.
full_df = pd.read_csv(DATA_PATH, nrows=30)

# Work on an independent copy of the text column so that the columns added in
# later cells are written to `df` itself rather than to a slice of `full_df`.
df = full_df[["text"]].copy()
df["text"] = df["text"].astype(str)
df.head(10)


Unnamed: 0,text
0,user-ag mozilla/4 compat msie window nt avant ...
1,beta recreat shipped-local file browser scratc...
2,moment nscssframeconstructor attributechang ca...
3,user-ag mozilla/5 window window nt en-u rv gec...
4,upload extens wrong min/maxvers lengthi error ...
5,user-ag mozilla/4 compat msie window nt sv1 ne...
6,user-ag mozilla/5 x11 linux i686 en-gb rv geck...
7,user-ag mozilla/5 window window nt en-u rv b2 ...
8,user-ag mozilla/5 window window nt en-u rv a1 ...
9,testcas bug cairo build border border-radiu do...


### PROBLEM 1: LOWERCASING

* Converting the text data into lowercase format.

In [32]:
# Lowercase every document with the pandas string accessor and store it in a new column.
df["lowertext"] = df["text"].str.lower()
df["lowertext"].head(10)

0    user-ag mozilla/4 compat msie window nt avant ...
1    beta recreat shipped-local file browser scratc...
2    moment nscssframeconstructor attributechang ca...
3    user-ag mozilla/5 window window nt en-u rv gec...
4    upload extens wrong min/maxvers lengthi error ...
5    user-ag mozilla/4 compat msie window nt sv1 ne...
6    user-ag mozilla/5 x11 linux i686 en-gb rv geck...
7    user-ag mozilla/5 window window nt en-u rv b2 ...
8    user-ag mozilla/5 window window nt en-u rv a1 ...
9    testcas bug cairo build border border-radiu do...
Name: lowertext, dtype: object

### PROBLEM 2: REMOVAL OF PUNCTUATION

* Removing punctuation characters from the text data.

In [33]:
# Every ASCII punctuation character is removed.
PUNCT = string.punctuation


def remove_punctuation(text):
    """Return `text` with every character listed in PUNCT deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(PUNCT))
    return text.translate(deletion_table)

In [34]:
# Start from the lowercased column so that the lowercasing step actually
# carries through to every later stage of the pipeline.
df["nopuntext"] = df["lowertext"].apply(remove_punctuation)
df["nopuntext"].head(10)

0    userag mozilla4 compat msie window nt avant br...
1    beta recreat shippedlocal file browser scratch...
2    moment nscssframeconstructor attributechang ca...
3    userag mozilla5 window window nt enu rv gecko2...
4    upload extens wrong minmaxvers lengthi error m...
5    userag mozilla4 compat msie window nt sv1 net ...
6    userag mozilla5 x11 linux i686 engb rv gecko20...
7    userag mozilla5 window window nt enu rv b2 gec...
8    userag mozilla5 window window nt enu rv a1 gec...
9    testcas bug cairo build border borderradiu don...
Name: nopuntext, dtype: object

### PROBLEM 3: REMOVAL OF STOPWORDS

* Removing stopwords (commonly occurring words) from the text data.

In [38]:
# NLTK provides the English stopword list; the corpus is fetched with nltk.download.
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
# Stored as a set so the membership test in remove_stopwords is a hash lookup.
STOPW = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in STOPW dropped."""
    kept_tokens = filter(lambda token: token not in STOPW, str(text).split())
    return " ".join(kept_tokens)

# Filter stopwords out of the punctuation-free text.
df["stopwtext"] = df["nopuntext"].apply(remove_stopwords)
df["stopwtext"].head(10)

[nltk_data] Downloading package stopwords to C:\Users\Win
[nltk_data]     10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    userag mozilla4 compat msie window nt avant br...
1    beta recreat shippedlocal file browser scratch...
2    moment nscssframeconstructor attributechang ca...
3    userag mozilla5 window window nt enu rv gecko2...
4    upload extens wrong minmaxvers lengthi error m...
5    userag mozilla4 compat msie window nt sv1 net ...
6    userag mozilla5 x11 linux i686 engb rv gecko20...
7    userag mozilla5 window window nt enu rv b2 gec...
8    userag mozilla5 window window nt enu rv a1 gec...
9    testcas bug cairo build border borderradiu ren...
Name: stopwtext, dtype: object

### PROBLEM 4: REMOVAL OF FREQUENT WORDS

* Removing a list of frequent words in the given corpus from the text data.

In [39]:
from collections import Counter
# Tally how often each token occurs across the whole stopword-free corpus.
cnt = Counter()
for text in df["stopwtext"].values:
    cnt.update(text.split())
    

In [40]:
# The ten most frequent tokens in the corpus are the ones to remove.
FREQWS = {word for word, _count in cnt.most_common(10)}
def remove_freqwords(text):
    """Return `text` with every whitespace-separated token found in FREQWS dropped."""
    kept_tokens = filter(lambda token: token not in FREQWS, str(text).split())
    return " ".join(kept_tokens)


In [43]:
# Drop the corpus-wide frequent words from the stopword-free text.
df["stopftext"] = df["stopwtext"].apply(remove_freqwords)
df["stopftext"].head(10)

0    userag mozilla4 compat msie nt avant browser a...
1    beta recreat shippedlocal file browser scratch...
2    moment nscssframeconstructor attributechang ca...
3    userag mozilla5 nt enu rv gecko20060728 firefo...
4    upload extens wrong minmaxvers lengthi error m...
5    userag mozilla4 compat msie nt sv1 net clr net...
6    userag mozilla5 x11 linux i686 engb rv gecko20...
7    userag mozilla5 nt enu rv b2 gecko20060821 fir...
8    userag mozilla5 nt enu rv a1 gecko20060808 sea...
9    testcas cairo build border borderradiu render ...
Name: stopftext, dtype: object

### PROBLEM 5: STEMMING

* Reducing inflected (or sometimes derived) words in the text data to their word stem. For example,
the words "does" and "doing" share the stem "do".


In [45]:
from nltk.stem.porter import PorterStemmer
# A single stemmer instance is shared by every call to stem_words.
stemmer = PorterStemmer()


def stem_words(text):
    """Return `text` with each whitespace-separated token replaced by its Porter stem."""
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)
# Stem the text that has already had stopwords and frequent words removed.
df["stemtext"] = df["stopftext"].apply(stem_words)
df["stemtext"].head(10)

0    userag mozilla4 compat msie nt avant browser a...
1    beta recreat shippedloc file browser scratch b...
2    moment nscssframeconstructor attributechang ca...
3    userag mozilla5 nt enu rv gecko20060728 firefo...
4    upload exten wrong minmaxv lengthi error messa...
5    userag mozilla4 compat msie nt sv1 net clr net...
6    userag mozilla5 x11 linux i686 engb rv gecko20...
7    userag mozilla5 nt enu rv b2 gecko20060821 fir...
8    userag mozilla5 nt enu rv a1 gecko20060808 sea...
9    testca cairo build border borderradiu render b...
Name: stemtext, dtype: object

### PROBLEM 6: LEMMATIZATION

* Reducing inflected words in the text data to their root form (the lemma), which, unlike a stem, is still a valid word of the language.

In [50]:
# Fetch the WordNet corpus used by the lemmatizer.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# A single lemmatizer instance is shared by every call to lemmatize_words.
lemmatizer = WordNetLemmatizer()


def lemmatize_words(text):
    """Return `text` with each whitespace-separated token passed through the WordNet lemmatizer."""
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

[nltk_data] Downloading package wordnet to C:\Users\Win
[nltk_data]     10\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [51]:
# Lemmatize the stemmed text.
df["lemtext"] = df["stemtext"].apply(lemmatize_words)
df["lemtext"].head(10)

0    userag mozilla4 compat msie nt avant browser a...
1    beta recreat shippedloc file browser scratch b...
2    moment nscssframeconstructor attributechang ca...
3    userag mozilla5 nt enu rv gecko20060728 firefo...
4    upload exten wrong minmaxv lengthi error messa...
5    userag mozilla4 compat msie nt sv1 net clr net...
6    userag mozilla5 x11 linux i686 engb rv gecko20...
7    userag mozilla5 nt enu rv b2 gecko20060821 fir...
8    userag mozilla5 nt enu rv a1 gecko20060808 sea...
9    testca cairo build border borderradiu render b...
Name: lemtext, dtype: object

### PROBLEM 7: REMOVAL OF URLs

* Removing any URLs present in the text data.

In [53]:
import re
def remove_urls(text):
    """Delete http(s):// links and www.-prefixed addresses from `text`.

    A match runs up to the next whitespace character; the whitespace around
    the removed URL is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [55]:
# Strip URLs from the lemmatized text.
df["urltext"] = df["lemtext"].apply(remove_urls)
df["urltext"].head(10)

0    userag mozilla4 compat msie nt avant browser a...
1    beta recreat shippedloc file browser scratch b...
2    moment nscssframeconstructor attributechang ca...
3    userag mozilla5 nt enu rv gecko20060728 firefo...
4    upload exten wrong minmaxv lengthi error messa...
5    userag mozilla4 compat msie nt sv1 net clr net...
6    userag mozilla5 x11 linux i686 engb rv gecko20...
7    userag mozilla5 nt enu rv b2 gecko20060821 fir...
8    userag mozilla5 nt enu rv a1 gecko20060808 sea...
9    testca cairo build border borderradiu render b...
Name: urltext, dtype: object

### PROBLEM 8: REMOVAL OF HTML TAGS

* Removing any HTML tags present in the text data.

In [56]:
from bs4 import BeautifulSoup
def remove_html(text):
    """Parse `text` with BeautifulSoup's lxml parser and return only its text content."""
    soup = BeautifulSoup(text, "lxml")
    return soup.text

In [57]:
# Strip HTML markup from the URL-free text.
df["tagtext"] = df["urltext"].apply(remove_html)
df["tagtext"].head(10)

0    userag mozilla4 compat msie nt avant browser a...
1    beta recreat shippedloc file browser scratch b...
2    moment nscssframeconstructor attributechang ca...
3    userag mozilla5 nt enu rv gecko20060728 firefo...
4    upload exten wrong minmaxv lengthi error messa...
5    userag mozilla4 compat msie nt sv1 net clr net...
6    userag mozilla5 x11 linux i686 engb rv gecko20...
7    userag mozilla5 nt enu rv b2 gecko20060821 fir...
8    userag mozilla5 nt enu rv a1 gecko20060808 sea...
9    testca cairo build border borderradiu render b...
Name: tagtext, dtype: object

### PROBLEM 9: SPELLING CORRECTION

* Correcting spelling mistakes in the text data.

In [60]:
! pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.6.2-py3-none-any.whl (2.7 MB)
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.6.2


In [61]:
from spellchecker import SpellChecker
# A single spell checker instance is shared by every call to correct_spellings.
spell = SpellChecker()


def correct_spellings(text):
    """Replace words the spell checker does not know with its suggested correction.

    Args:
        text: A string; it is split on whitespace into words.

    Returns:
        The words re-joined with single spaces. Words the checker knows, and
        unknown words for which it offers no correction, are kept unchanged.
    """
    # Split once and reuse the tokens for both the lookup and the rebuild.
    tokens = text.split()
    misspelled_words = spell.unknown(tokens)
    corrected_text = []
    for word in tokens:
        if word in misspelled_words:
            # correction() can return None when it finds no candidate (newer
            # pyspellchecker releases); fall back to the original word so the
            # join below never receives a non-string.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [62]:
# Run spelling correction over the tag-free text.
df["spelltext"] = df["tagtext"].apply(correct_spellings)
df["spelltext"].head(10)

0    user momzilla combat sie it avant browser avan...
1    beta retreat shippedloc file browser scratch b...
2    moment nscssframeconstructor attributechang ca...
3    user momzilla it end re gecko20060728 firebox ...
4    upload eaten wrong minimax length error messag...
5    user momzilla combat sie it svu net car net ca...
6    user momzilla xu linux i686 eng re gecko200607...
7    user momzilla it end re be gecko20060821 fireb...
8    user momzilla it end re a gecko20060808 seamon...
9    testa cairo build border borderradiu render bo...
Name: spelltext, dtype: object