# IRS Practical 2
> 19BCE245 - Aayush Shah
- Text preprocessing using NLTK, followed by visualization:
  - Word Cloud
  - Histogram of top N frequent terms

In [None]:
#import required stuff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

import re #for working with regular expression
import nltk #for natural language processing (nlp)
import spacy #also for nlp
import string #This is a module, Python also has built-in class str, these are different

In [None]:
# Read the train and test CSVs from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

In [None]:
# Quick look at both frames: first rows, then the column/dtype summary.
# DataFrame.info() prints its own report and returns None, so it is called
# bare -- wrapping it in print() would append a stray "None" line.
print('train dataframe : \n', train_df.head(5))
train_df.info()
print('test dataframe : \n', test_df.head(5))
test_df.info()

train dataframe : 
    id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None
test dataframe :

In [None]:
# Number of rows in the train frame, then in the test frame.
print(train_df.shape[0])
print(test_df.shape[0])

7613
3263


In [None]:
# Stack train (without its 'target' column) on top of test so that both go
# through the same text-preprocessing steps.
train_df_copy = train_df  # still refers to the labelled frame
frames = [train_df.drop(columns='target'), test_df]
train_df = pd.concat(frames)

In [None]:
# Lower-case the raw text into a new 'lowered_text' column and preview it.
train_df['lowered_text'] = train_df.text.str.lower()
print(train_df.lowered_text.head(3))

0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
Name: lowered_text, dtype: object


In [None]:
# Removing punctuation
punctuation = string.punctuation
# Translation table that maps every punctuation code point to None (= delete).
mapping = dict.fromkeys(map(ord, punctuation))

def remove_punctuation(in_str):
    """Return ``in_str`` with every character of ``string.punctuation`` deleted."""
    cleaned = in_str.translate(mapping)
    return cleaned

print(train_df['lowered_text'].head(10))  # before
train_df['lowered_text'] = train_df['lowered_text'].apply(remove_punctuation)
print(train_df['lowered_text'].head(10))  # after

0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    just got sent this photo from ruby #alaska as ...
5    #rockyfire update => california hwy. 20 closed...
6    #flood #disaster heavy rain causes flash flood...
7    i'm on top of the hill and i can see a fire in...
8    there's an emergency evacuation happening now ...
9    i'm afraid that the tornado is coming to our a...
Name: lowered_text, dtype: object
0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3    13000 people receive wildfires evacuation orde...
4    just got sent this photo from ruby alaska as s...
5    rockyfire update  california hwy 20 closed in ...
6    flood disaster heavy rain causes flash floodin...
7    im on top of the hill and 

In [None]:
# Removing Stop words
from nltk.corpus import stopwords
stopwords_eng = stopwords.words('english')

print(train_df["lowered_text"].head(10))  # before

def remove_stopwords(in_str):
    """Return ``in_str`` without the tokens listed in ``stopwords_eng``.

    The input is split on whitespace; each kept token is followed by one
    space, so a non-empty result ends with a trailing space.
    """
    kept = (tx for tx in in_str.split() if tx not in stopwords_eng)
    return "".join(tx + " " for tx in kept)

train_df['lowered_text_stop_removed'] = train_df["lowered_text"].apply(remove_stopwords)
print(train_df["lowered_text_stop_removed"].head(10))  # after

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3    13000 people receive wildfires evacuation orde...
4    just got sent this photo from ruby alaska as s...
5    rockyfire update  california hwy 20 closed in ...
6    flood disaster heavy rain causes flash floodin...
7    im on top of the hill and i can see a fire in ...
8    theres an emergency evacuation happening now i...
9     im afraid that the tornado is coming to our area
Name: lowered_text, dtype: object
0        deeds reason earthquake may allah forgive us 
1               forest fire near la ronge sask canada 
2    residents asked shelter place notified officer...
3    13000 people receive wildfires evacuation orde...
4    got sent photo ruby alaska smoke wildfires pou...
5    rockyfire update california hwy 20 closed dire...
6    flood disaster heavy rain causes flash floodin...
7                          im t

In [None]:
# Removing most frequent 10 words
from collections import Counter

# Token frequencies over the whole stop-word-free column.
counter = Counter(
    token
    for doc in train_df["lowered_text_stop_removed"]
    for token in doc.split()
)
most_cmn_list = counter.most_common(10)
print(type(most_cmn_list), most_cmn_list)
most_cmn_words_list = [token for token, _ in most_cmn_list]
print('Most common words : ', most_cmn_words_list)

def remove_frequent(in_str):
    """Return ``in_str`` without tokens that appear in ``most_cmn_words_list``.

    Each kept token is followed by one space (trailing space included).
    """
    return "".join(
        token + " "
        for token in in_str.split()
        if token not in most_cmn_words_list
    )

train_df["lowered_text_stop_removed_freq_removed"] = train_df['lowered_text_stop_removed'].apply(remove_frequent)

<class 'list'> [('like', 490), ('amp', 434), ('im', 419), ('fire', 357), ('get', 335), ('new', 326), ('via', 324), ('news', 282), ('people', 278), ('one', 277)]
Most common words :  ['like', 'amp', 'im', 'fire', 'get', 'new', 'via', 'news', 'people', 'one']


In [None]:
# Removing 10 most rare words
# most_common() is ordered by descending count, so the last ten entries are
# the least frequent tokens.
most_rare_list = counter.most_common()[-10:]
most_rare_words = [token for token, _ in most_rare_list]
print('Most rare words : ', most_rare_words)

def remove_rare(in_text):
    """Return ``in_text`` without tokens that appear in ``most_rare_words``.

    Each kept token is followed by one space (trailing space included).
    """
    return "".join(
        token + " "
        for token in in_text.split()
        if token not in most_rare_words
    )

train_df["lowered_stop_freq_rare_removed"] = train_df["lowered_text_stop_removed_freq_removed"].apply(remove_rare)

Most rare words :  ['httptcotjpylu9fox', 'httptcopfavw5qyqe', 'httptcohkut5msdtp', 'issuicide', 'rajman', 'hasaka', 'risen', 'fasteners', 'xrwn', 'httptcoutbxlcbiuy']


In [None]:
# Stemming using PorterStemmer [IGNORE... SKIP TO SnowballStemmer.]

# from nltk.stem.porter import PorterStemmer
# stemmer=PorterStemmer()

# def do_stemming(in_str):
#     new_str=""
#     for word in in_str.split():
#         new_str=new_str + stemmer.stem(word) + " "
#     return new_str

# train_df["Stemmed"]=train_df["lowered_stop_freq_rare_removed"].apply(lambda x: do_stemming(x))

In [None]:
# Stemming using SnowballStemmer [IGNORE... SKIP TO Lemmatization.]

# from nltk.stem.snowball import SnowballStemmer
# stemmer_sb=SnowballStemmer(language='english')

# def do_stemming_sb(in_str):
#     new_str=""
#     for word in in_str.split():
#         new_str=new_str + stemmer_sb.stem(word) + " "
#     return new_str

# train_df["Stemmed_sb"]=train_df["lowered_stop_freq_rare_removed"].apply(lambda x: do_stemming_sb(x))

In [None]:
# Lemmatization (plain WordNet lemmatizer, no POS tag passed) -- fills "Lemmatized"

from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()

def do_lemmatizing(in_str):
    """Lemmatize every whitespace-separated token of ``in_str`` with ``lem``.

    Each lemma is followed by one space (trailing space included).
    """
    return "".join(lem.lemmatize(token) + " " for token in in_str.split())

train_df["Lemmatized"] = train_df["lowered_stop_freq_rare_removed"].apply(do_lemmatizing)

In [None]:
# from nltk.corpus import wordnet

# wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
# pos_tagged_text = nltk.pos_tag(train_df.loc[0,"lowered_stop_freq_rare_removed"].split())
# print(pos_tagged_text)

# print(train_df["lowered_stop_freq_rare_removed"].head(5)) #before

# def do_lemmatizing_with_POS(in_str):
#     new_str=''
#     for word in in_str.split():
#         tag=nltk.pos_tag(word)[0][1][0]
#         new_str=new_str + lem.lemmatize(word, wordnet_map.get(tag,wordnet.NOUN)) + " "
#     return new_str

# train_df["Lemmatized"]=train_df["lowered_stop_freq_rare_removed"].apply(lambda x: do_lemmatizing_with_POS(x))
# print(train_df["Lemmatized"].head(5)) #after
# train_df["Lemmatized"].isnull().sum()

> **Ignoring emojis...**

In [None]:
# Removing URLs
def remove_urls(text):
    """Return ``text`` with URLs removed.

    Matches three forms:
      * ``http://...`` / ``https://...`` up to the next whitespace,
      * ``www.`` followed by non-whitespace,
      * a token that starts with ``http``/``https`` and continues with word
        characters only. This step runs on text whose punctuation has
        already been stripped, so a link such as ``http://t.co/abc`` arrives
        here as ``httptcoabc`` and the first two forms can never match it.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+|\bhttps?\w+')
    return url_pattern.sub(r'', text)
def remove_html(in_str):
    """Return ``in_str`` with every ``<...>`` span (shortest match) removed."""
    return re.sub('<.*?>', r'', in_str)

# Chain the two cleaners: lemmatized text -> URLs removed -> HTML tags removed.
train_df["urls_removed"] = train_df["Lemmatized"].apply(remove_urls)
train_df["html_removed"] = train_df["urls_removed"].apply(remove_html)

In [None]:
# Converting chat words to actual text
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

# Parse the "ABBR=Expansion" table: one entry per non-empty line, split on "=".
chat_words_expanded_dict = {}
chat_words_list = []
for entry in filter(None, chat_words_str.split("\n")):
    fields = entry.split("=")
    abbreviation, expansion = fields[0], fields[1]
    chat_words_list.append(abbreviation)
    chat_words_expanded_dict[abbreviation] = expansion
# De-duplicate the abbreviations for membership tests.
chat_words_list = set(chat_words_list)

def convert_chat_words(in_str):
    """Replace chat abbreviations in ``in_str`` with their expansions.

    Tokens are compared upper-cased against ``chat_words_list``; a match is
    swapped for its entry in ``chat_words_expanded_dict``, anything else is
    kept as is. Each output token is followed by one space.
    """
    pieces = []
    for w in in_str.split():
        key = w.upper()
        pieces.append(chat_words_expanded_dict[key] if key in chat_words_list else w)
    return "".join(piece + " " for piece in pieces)

train_df["chat_words_coverted"] = train_df["html_removed"].apply(convert_chat_words)

In [None]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.6.3-py3-none-any.whl (2.7 MB)
     |████████████████████████████████| 2.7 MB 625 kB/s            
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.6.3


In [None]:
# Spelling Correction

from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spellings(in_str):
    """Return ``in_str`` with words unknown to ``spell`` replaced by its suggestion.

    The text is split on whitespace. Words reported by ``spell.unknown`` are
    passed to ``spell.correction``; all other words are kept unchanged. Each
    output word is followed by one space (trailing space included).
    """
    words = in_str.split()
    misspelled_words = spell.unknown(words)
    corrected = []
    for word in words:
        if word in misspelled_words:
            # correction() can return None when it has no candidate (newer
            # pyspellchecker releases); keep the original word in that case
            # instead of failing on ``str + None``.
            corrected.append(spell.correction(word) or word)
        else:
            corrected.append(word)
    return "".join(word + " " for word in corrected)

train_df["spellings_corrected"]=train_df["chat_words_coverted"].apply(lambda x: correct_spellings(x))

In [None]:
# train_df["spellings_corrected"]=train_df["chat_words_coverted"]

---

In [None]:
# Shapes of: test frame, merged/preprocessed frame, original labelled train frame.
for frame in (test_df, train_df, train_df_copy):
    print(frame.shape)

(3263, 4)
(10876, 13)
(7613, 5)


In [None]:
# splitting dataframe into train_df and test_df
# The merged frame holds the training rows first, then the test rows; the
# boundary is the row count of the original labelled frame.
n_train = len(train_df_copy)
train_df_copy2 = train_df

# .copy() gives each half its own data, so the column assignments below
# modify a real frame and do not raise SettingWithCopyWarning.
test_df = train_df_copy2.iloc[n_train:, :].copy()
train_df = train_df_copy2.iloc[:n_train, :].copy()

# The fully preprocessed text replaces the raw 'text' column.
test_df['text'] = test_df['spellings_corrected']
train_df['text'] = train_df['spellings_corrected']

print(test_df.shape)
print(train_df.shape)

# Re-attach the labels by position (.values bypasses index alignment).
train_df['target'] = train_df_copy['target'].values
train_df.head(5)

(3263, 13)
(7613, 13)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Unnamed: 0,id,keyword,location,text,lowered_text,lowered_text_stop_removed,lowered_text_stop_removed_freq_removed,lowered_stop_freq_rare_removed,Lemmatized,urls_removed,html_removed,chat_words_coverted,spellings_corrected,target
0,1,,,deed reason earthquake may allah forgive You,our deeds are the reason of this earthquake ma...,deeds reason earthquake may allah forgive us,deeds reason earthquake may allah forgive us,deeds reason earthquake may allah forgive us,deed reason earthquake may allah forgive u,deed reason earthquake may allah forgive u,deed reason earthquake may allah forgive u,deed reason earthquake may allah forgive You,deed reason earthquake may allah forgive You,1
1,4,,,forest near la range ask canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada,forest near la ronge sask canada,forest near la ronge sask canada,forest near la ronge sask canada,forest near la ronge sask canada,forest near la ronge sask canada,forest near la ronge sask canada,forest near la range ask canada,1
2,5,,,resident asked shelter place notified officer ...,all residents asked to shelter in place are be...,residents asked shelter place notified officer...,residents asked shelter place notified officer...,residents asked shelter place notified officer...,resident asked shelter place notified officer ...,resident asked shelter place notified officer ...,resident asked shelter place notified officer ...,resident asked shelter place notified officer ...,resident asked shelter place notified officer ...,1
3,6,,,13000 receive wildfire evacuation order califo...,13000 people receive wildfires evacuation orde...,13000 people receive wildfires evacuation orde...,13000 receive wildfires evacuation orders cali...,13000 receive wildfires evacuation orders cali...,13000 receive wildfire evacuation order califo...,13000 receive wildfire evacuation order califo...,13000 receive wildfire evacuation order califo...,13000 receive wildfire evacuation order califo...,13000 receive wildfire evacuation order califo...,1
4,7,,,got sent photo ruby alaska smoke wildfire pour...,just got sent this photo from ruby alaska as s...,got sent photo ruby alaska smoke wildfires pou...,got sent photo ruby alaska smoke wildfires pou...,got sent photo ruby alaska smoke wildfires pou...,got sent photo ruby alaska smoke wildfire pour...,got sent photo ruby alaska smoke wildfire pour...,got sent photo ruby alaska smoke wildfire pour...,got sent photo ruby alaska smoke wildfire pour...,got sent photo ruby alaska smoke wildfire pour...,1


In [None]:
# Shapes after the split: test frame first, then train frame.
for frame in (test_df, train_df):
    print(frame.shape)

(3263, 13)
(7613, 14)


In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# v = TfidfVectorizer(decode_error='replace', encoding='utf-8')
# train_df['text'] = v.fit_transform(train_df['text'].values.astype('U'))
# test_df['text'] = v.fit_transform(test_df['text'].values.astype('U'))

In [None]:
# Shape check: test frame first, then train frame.
for frame in (test_df, train_df):
    print(frame.shape)

(3263, 13)
(7613, 14)


In [None]:
# First five rows of the test frame, then of the train frame.
for frame in (test_df, train_df):
    print(frame.head(5))

   id keyword location                                               text  \
0   0     NaN      NaN                       happened terrible car crash    
1   2     NaN      NaN  heard earthquake different city stay safe ever...   
2   3     NaN      NaN  forest spot pond goose fleeing across street c...   
3   9     NaN      NaN              apocalypse lighting spokane wildfire    
4  11     NaN      NaN             typhoon soudelor kill 28 china taiwan    

                                        lowered_text  \
0                 just happened a terrible car crash   
1  heard about earthquake is different cities sta...   
2  there is a forest fire at spot pond geese are ...   
3              apocalypse lighting spokane wildfires   
4      typhoon soudelor kills 28 in china and taiwan   

                           lowered_text_stop_removed  \
0                       happened terrible car crash    
1  heard earthquake different cities stay safe ev...   
2  forest fire spot pond geese f

---

In [None]:
# Token-count vectorizer, constructed with no arguments (library defaults).
count_vectorizer = feature_extraction.text.CountVectorizer()

In [None]:
# Fit the vectorizer on the training text only, then reuse that fitted
# vectorizer to transform the test text (no re-fitting on test).
train_vectors = count_vectorizer.fit_transform(train_df["text"])
test_vectors = count_vectorizer.transform(test_df["text"])

In [None]:
# Print the train matrix, then the test matrix.
for matrix in (train_vectors, test_vectors):
    print(matrix)

  (0, 3778)	1
  (0, 15438)	1
  (0, 4447)	1
  (0, 13065)	1
  (0, 904)	1
  (0, 5398)	1
  (0, 19386)	1
  (1, 5389)	1
  (1, 13738)	1
  (1, 12252)	1
  (1, 15352)	1
  (1, 1289)	1
  (1, 2468)	1
  (2, 15682)	1
  (2, 1292)	1
  (2, 16510)	2
  (2, 14755)	2
  (2, 13970)	1
  (2, 14089)	1
  (2, 4821)	1
  (2, 14251)	1
  (2, 4907)	1
  (3, 4821)	1
  (3, 14251)	1
  (3, 118)	1
  :	:
  (7610, 6205)	1
  (7610, 6)	1
  (7610, 18620)	1
  (7610, 11067)	1
  (7611, 2526)	1
  (7611, 14840)	1
  (7611, 15797)	1
  (7611, 11483)	1
  (7611, 12582)	1
  (7611, 16405)	1
  (7611, 14174)	1
  (7611, 11600)	1
  (7611, 17930)	1
  (7611, 1845)	2
  (7611, 3045)	1
  (7611, 14889)	1
  (7611, 17391)	1
  (7612, 19103)	1
  (7612, 2424)	1
  (7612, 6455)	1
  (7612, 12329)	1
  (7612, 13943)	1
  (7612, 587)	1
  (7612, 15394)	1
  (7612, 10981)	1
  (0, 2526)	1
  (0, 3402)	1
  (0, 6133)	1
  (0, 17757)	1
  (1, 2883)	1
  (1, 4030)	1
  (1, 4447)	1
  (1, 4842)	1
  (1, 6244)	1
  (1, 16068)	1
  (1, 17175)	1
  (2, 671)	1
  (2, 2487)	1
  (2, 5282)

In [None]:
# Load the sample submission CSV from the Kaggle input directory.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [None]:
# Logistic regression on the count vectors; random_state fixed at 0.
from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression(random_state=0)
clf_lr.fit(X=train_vectors, y=train_df["target"])

LogisticRegression(random_state=0)

In [None]:
# Predict on the test vectors, store the result in "target", write the CSV.
predicted_labels = clf_lr.predict(test_vectors)
sample_submission["target"] = predicted_labels
sample_submission.to_csv("submission.csv", index=False)