In [67]:
import pandas as pd
import numpy as np
import re
import nltk
import contractions

# Fetch the NLTK data packages this notebook relies on. When a package is
# already present, the call just reports that it is up to date.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

import spacy
nlp = spacy.load("en_core_web_sm")

from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords,wordnet
from nltk.tokenize.treebank import TreebankWordDetokenizer,TreebankWordTokenizer
from nltk.tokenize import wordpunct_tokenize,TweetTokenizer



# Shared NLP helper objects, created once and reused by later cells.
stemmer = PorterStemmer()  # only referenced from a commented-out line in processing()
lemmatizer = WordNetLemmatizer()  # used by processing() for POS-aware lemmatization
# NOTE(review): stop_words is not referenced by any visible cell; stop-word
# filtering goes through the custom stop_list / stopword_finder instead.
stop_words = set(stopwords.words('english'))



from autocorrect import Speller
spell = Speller(lang='en')
def autospell(text):
    """Spell-correct `text` one token at a time.

    The text is split with ``nltk.word_tokenize``, every token is passed
    through the module-level ``spell`` corrector, and the corrected tokens
    are joined back together with single spaces.
    """
    corrected_tokens = map(spell, nltk.word_tokenize(text))
    return " ".join(corrected_tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aalokemozumdar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aalokemozumdar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aalokemozumdar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/aalokemozumdar/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/aalokemozumdar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [68]:
# Load the dataset and coerce the TEXT column to str so that every row can be
# passed to the string-based cleaning function.
df = pd.read_csv('A1_dataset.csv')
df['TEXT'] = df['TEXT'].astype(str)

In [69]:
def pos_tagger(tag):
    """Map one ``(word, tag)`` pair, as yielded by ``nltk.pos_tag``, to a WordNet POS.

    Only the second element of `tag` is inspected. Tags beginning with
    J, V, N or R map to ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN``
    and ``wordnet.ADV`` respectively; any other tag falls back to ``'n'``.
    """
    tag_label = tag[1]
    # (tag prefix, attribute name on ``wordnet``), checked in this order.
    # The attribute is only looked up once a prefix actually matches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag_label.startswith(prefix):
            return getattr(wordnet, attr_name)
    return 'n'

In [89]:
# Custom stop words: common English function words plus corpus-specific noise
# tokens (e.g. 'twitter', 'tweet', 'nan', 'u', 'im'). Every entry appears
# exactly once; order is preserved because it is also the alternation order
# of the regex built below.
stop_list = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
    'yourselves',
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',
    'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these',
    'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have',
    'has', 'had',
    'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if',
    'or', 'because', 'as', 'until', 'of', 'at', 'by', 'for', 'with', 'about',
    'between',
    'into', 'through', 'during', 'to', 'from', 'in', 'out', 'on', 'again',
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
    'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
    't', 'can', 'will',
    'just', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'u', 'im',
    'go', 'come', 'whats', 'twitter', 'tweet',
    'know', 'x', 'yeah', 'year', 'yet', 'youre', 'would', 'nan', 'see',
    'look', 'one', 'could',
]

# Whole-word alternation over the stop words. The \b anchors keep a stop word
# such as 'the' from matching inside a longer word such as 'theory', and
# re.escape guarantees each entry is matched literally even if an entry with
# regex metacharacters is added later.
stop_reg = r'\b(?:' + '|'.join(re.escape(word) for word in stop_list) + r')\b'
stopword_finder = re.compile(stop_reg)

In [92]:
# Regular expressions used by processing(). They are compiled once here instead
# of on every call, and written as raw strings so that sequences such as \w,
# \s and \. reach the regex engine without invalid-escape warnings.
_HTML_RE = re.compile(r'<.*?>|&\w+;')  # markup tags and entities such as &quot;
_URL_PREFIX = r'(?:https?://)|(?:www\.)'
_URL_CHARS = r'[\w+=?/._\-:;&*^$#@~`!|]'
# Either a prefixed URL (http://, https://, www.) or any run of URL characters
# that contains ".com".
_URL_RE = re.compile(
    rf'(?:{_URL_PREFIX}){_URL_CHARS}+|{_URL_CHARS}+\.com{_URL_CHARS}*'
)
_USERNAME_RE = re.compile(r'@\w+')
_MULTISPACE_RE = re.compile(r' +')
_PUNCT_RE = re.compile(r'[^\w\s]')


def processing(sentence, verbose=False):
    """Clean one piece of raw text and return the normalised string.

    Steps, in order: lowercase; strip HTML tags/entities; strip URLs; strip
    @usernames; collapse runs of spaces; expand contractions; drop
    punctuation; tokenize; lemmatize using POS tags; drop stop words;
    detokenize; spell-correct.

    Parameters
    ----------
    sentence : str
        Raw input text.
    verbose : bool, default False
        When True, print the intermediate result of each stage.

    Returns
    -------
    str
        The cleaned, spell-corrected text.

    Notes
    -----
    NOTE(review): spelling is corrected after stop-word removal, so a
    misspelled stop word (e.g. 'twiiter' -> 'twitter') can survive in the
    result. The original order is kept here so outputs do not change.
    """
    og_sentence = sentence
    sentence = sentence.lower()  # case folding
    rmhtml_Text = _HTML_RE.sub('', sentence)  # remove HTML tags and entities
    rmurl_Text = _URL_RE.sub('', rmhtml_Text)  # remove URLs
    rmurl_Text = _USERNAME_RE.sub('', rmurl_Text)  # remove @usernames
    rmwhite_Text = _MULTISPACE_RE.sub(' ', rmurl_Text)  # collapse repeated spaces
    rmwhite_Text = contractions.fix(rmwhite_Text)  # expand contractions
    rmpunt_text = _PUNCT_RE.sub('', rmwhite_Text)  # remove punctuation

    token_text = list(TweetTokenizer().tokenize(rmpunt_text))  # tokenize text

    # Lemmatize each token with the WordNet POS derived from its NLTK tag.
    pos_tags = [pos_tagger(tagged) for tagged in nltk.pos_tag(token_text)]
    lem_text = [
        lemmatizer.lemmatize(word, pos=pos)
        for word, pos in zip(token_text, pos_tags)
    ]

    # Stop words are filtered after lemmatization, i.e. on the lemmas.
    rmstop_text = [tok for tok in lem_text if not stopword_finder.search(tok)]

    sentence = TreebankWordDetokenizer().detokenize(rmstop_text)  # back to one string
    spell_sentence = autospell(sentence)

    if verbose:
        print(f'Original sentence :\n{og_sentence}\n')
        print(f'Removing HTML and (Lowercasing):\n{rmhtml_Text}\n')
        print(f'Removing URL tags (and usernames):\n{rmurl_Text}\n')
        print(f'Removing whitespace(and expand contractions):\n{rmwhite_Text}\n')
        print(f'Remove punctuations:\n{rmpunt_text}\n')
        print(f'Tokenize Text:\n{token_text}\n')
        print(f'Lemmatize Text:\n{lem_text}\n')
        print(f'Removing stop words Text:\n{rmstop_text}\n')
        print(f'Spelling Correction:\n{spell_sentence}\n')

    return spell_sentence

### Text Processing on Positive Sentence

In [79]:
# Sample tweet with a @mention, HTML entities and a misspelling ("twiiter");
# verbose=True prints the text after each cleaning stage.
text = "@nakulshenoy Lol, that and &quot;twiiter killed the blogger&quot; are far apart.\
     Btw, what is &quot;blogging in the traditional sense&quot; may i know? "

processing(text,verbose=True)

Original sentence :
@nakulshenoy Lol, that and &quot;twiiter killed the blogger&quot; are far apart.     Btw, what is &quot;blogging in the traditional sense&quot; may i know? 

Removing HTML and (Lowercasing):
@nakulshenoy lol, that and twiiter killed the blogger are far apart.     btw, what is blogging in the traditional sense may i know? 

Removing URL tags:
 lol, that and twiiter killed the blogger are far apart.     btw, what is blogging in the traditional sense may i know? 

Removing whitespace:
 lol, that and twiiter killed the blogger are far apart. by the way, what is blogging in the traditional sense may i know? 

Remove punctuations and (Expand Contractions):
 lol that and twiiter killed the blogger are far apart by the way what is blogging in the traditional sense may i know 

Tokenize Text:
['lol', 'that', 'and', 'twiiter', 'killed', 'the', 'blogger', 'are', 'far', 'apart', 'by', 'the', 'way', 'what', 'is', 'blogging', 'in', 'the', 'traditional', 'sense', 'may', 'i', 'know

'lol twitter kill blogger far apart way blogging traditional sense may'

### Text Processing on Negative Sentence

In [80]:
# Sample tweet with HTML entities and a shortened URL;
# verbose=True prints the text after each cleaning stage.
text = "Stephanie Pratt Tells MTV News &quot;The Hills' Did Not Make Me Bulimic&quot; \
    http://bit.ly/n4wL4 BETTER but still &gt; half the story is missing" 


processing(text,verbose=True)


Original sentence :
Stephanie Pratt Tells MTV News &quot;The Hills' Did Not Make Me Bulimic&quot;     http://bit.ly/n4wL4 BETTER but still &gt; half the story is missing

Removing HTML and (Lowercasing):
stephanie pratt tells mtv news the hills' did not make me bulimic     http://bit.ly/n4wl4 better but still  half the story is missing

Removing URL tags:
stephanie pratt tells mtv news the hills' did not make me bulimic      better but still  half the story is missing

Removing whitespace:
stephanie pratt tells mtv news the hills' did not make me bulimic better but still half the story is missing

Remove punctuations and (Expand Contractions):
stephanie pratt tells mtv news the hills did not make me bulimic better but still half the story is missing

Tokenize Text:
['stephanie', 'pratt', 'tells', 'mtv', 'news', 'the', 'hills', 'did', 'not', 'make', 'me', 'bulimic', 'better', 'but', 'still', 'half', 'the', 'story', 'is', 'missing']

Lemmatize Text:
['stephanie', 'pratt', 'tell', 'mtv', 

'stephanie pratt tell mtv news hill not make bulimic well but still half story miss'

In [81]:
# Run the cleaning function on each row's TEXT value, row by row.
data = df.apply(
    lambda record: processing(record['TEXT'], verbose=False),
    axis=1,
)

In [82]:
# Preview the cleaned text; pandas abbreviates the display of a long Series.
data

0                                       get threaded scar
1                              like sedan mango yesterday
2       car after shower bed sooooooooooo tired sparro...
3       actually start afternoon try something wed slo...
4            www gid not worry vote nonstop col love much
                              ...                        
4282                          performance clip test shock
4283             gh no rcn true blood episode demand 1013
4284              return forest sarah mercy lose key wood
4285                         proud dad piece keep up papa
4286    wake up eat pizza breakfast also dentist appoi...
Length: 4287, dtype: object

In [83]:
# Store the cleaned text next to the raw TEXT column.
df['NEW_TEXT']=data

In [84]:
# Write the frame, including NEW_TEXT, to disk.
# NOTE(review): no index= argument is passed, so the row index is written as
# an extra leading column; confirm downstream readers expect that.
df.to_csv('A1_dataset_processed.csv')