In [1]:
import numpy as np
import pandas as pd

In [2]:
# Data
# NOTE(review): absolute, machine-specific Windows path — the notebook will not
# run on another machine until this points at a local copy of "IMDB Dataset.csv".
df = pd.read_csv("C:\\Users\\siddh\\OneDrive\\Desktop\\Data science data\\IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Lowercasing

In [3]:
df['review'] = df['review'].str.lower()

In [4]:
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

## Removing HTML Tags

In [5]:
import re

def remove_html_tags(data):
    """Return ``data`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept. ``.`` does not match a newline here, so a tag that
    is split across lines is left untouched.
    """
    return re.sub('<.*?>', r'', data)

In [6]:
df['review'] = df['review'].apply(remove_html_tags)

In [7]:
df['review'][1]

'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

## Remove URLs

In [8]:
text = 'google search here www.google.com'

In [9]:
def remove_url(text):
    """Strip URLs from ``text``.

    Anything starting with ``http://``, ``https://`` or ``www.`` is removed up
    to the next whitespace character; the surrounding whitespace stays.
    """
    url_regex = r'https?://\S+|www\.\S+'
    return re.sub(url_regex, r'', text)

In [10]:
remove_url(text)

'google search here '

## Remove punctuation

In [11]:
import string
punc = string.punctuation
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
def remove_punc(text):
    """Return ``text`` without any character listed in ``punc``.

    Plain character-by-character version; ``remove_punc_fast`` does the same
    job with a translation table.
    """
    kept = [ch for ch in text if ch not in punc]
    return ''.join(kept)

In [13]:
# Faster technique
def remove_punc_fast(text):
    """Delete every character in ``punc`` from ``text`` in a single pass.

    Builds a translation table that maps each punctuation character to
    ``None`` and lets ``str.translate`` perform the removal.
    """
    drop_table = str.maketrans('', '', punc)
    return text.translate(drop_table)

In [14]:
# NOTE(review): absolute, machine-specific Windows path — repoint to a local
# copy of "test_tweets_anuFYb8.csv" before running on another machine.
tweet = pd.read_csv("C:\\Users\\siddh\\OneDrive\\Desktop\\Data science data\\test_tweets_anuFYb8.csv")
tweet.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [15]:
tweet['tweet'] = tweet['tweet'].apply(remove_punc_fast)

In [16]:
tweet['tweet'][1]

' user white supremacists want everyone to see the new â\x80\x98  birdsâ\x80\x99 movie â\x80\x94 and hereâ\x80\x99s why  '

## Chatword Treatment

In [17]:
# Lookup table: upper-case chat abbreviation -> expanded phrase.
d = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
}


def chat_conversion(text):
    """Expand chat abbreviations found in ``text``.

    The text is split on whitespace; each token is upper-cased and looked up
    in ``d``. Tokens with an entry are replaced by the expansion, all others
    are kept unchanged. The tokens are re-joined with single spaces.
    """
    return " ".join(d.get(token.upper(), token) for token in text.split())


In [18]:
chat_conversion("We have to reach there ASAP")

'We have to reach there As Soon As Possible'

## Spelling correction

In [19]:
from textblob import TextBlob

# Sample sentence containing misspellings ("notbook", "amd", "ntebook").
incorrect_text = "Please read the notebook and write the notbook amd return the ntebook"
textblb = TextBlob(incorrect_text)
# correct() is applied to the whole blob; .string yields the text displayed below.
textblb.correct().string


'Please read the notebook and write the notebook and return the notebook'

## Removing Stopwords

In [20]:
from nltk.corpus import stopwords

def remove_sw(text):
    """Blank out English stopwords in ``text``.

    ``text`` is split on whitespace and every token that appears in NLTK's
    English stopword list is replaced by an empty string; the tokens are then
    re-joined with single spaces. Because a removed word leaves an empty slot,
    the result can contain consecutive spaces. Matching is exact and
    case-sensitive, so a token with attached punctuation is kept as-is.

    Args:
        text: The string to filter.

    Returns:
        The filtered string.
    """
    # Fetch the list once per call and use a set for membership tests; the
    # previous version evaluated stopwords.words('english') for every token.
    stop_set = set(stopwords.words('english'))
    return " ".join('' if word in stop_set else word for word in text.split())

In [21]:
remove_sw("probability my all-time favourite movie , a story of selflessness,sacrifice and dedication")

'probability  all-time favourite movie ,  story  selflessness,sacrifice  dedication'

## Tokenization

#### Split function

In [22]:
# word tokenization
sent1 = "I am going to delhi"
sent1.split()

['I', 'am', 'going', 'to', 'delhi']

In [23]:
# sentence tokenization
# Splitting on '.' drops the periods and keeps the space that followed each
# one, so the later pieces start with a leading blank (visible in the output).
sent2 = "I am going to delhi. I will stay there for 3 days. Let's hope the trip to be great"
sent2.split('.')

['I am going to delhi',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

#### Regex

In [24]:
import re
sent3 = 'I am going to delhi!'
# Raw string so that \w reaches the regex engine untouched; in a normal string
# literal "\w" is an invalid escape sequence and triggers a warning.
# The pattern collects runs of word characters and apostrophes, so trailing
# punctuation such as "!" is not part of any token.
tokens = re.findall(r"[\w']+", sent3)
tokens


['I', 'am', 'going', 'to', 'delhi']

#### nltk

In [25]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [26]:
sent1 = 'I am going to visit delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'visit', 'delhi', '!']

In [27]:
sent2 = 'A 5km ride cost $10.50'
word_tokenize(sent2)

['A', '5km', 'ride', 'cost', '$', '10.50']

#### spacy

In [28]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [29]:
doc1 = nlp(sent2)

In [30]:
for token in doc1:
    print(token)

A
5
km
ride
cost
$
10.50


## Stemming

In [31]:
from nltk.stem.porter import PorterStemmer

In [32]:
ps = PorterStemmer()

def stem_words(text):
    """Stem each whitespace-separated word of ``text`` with ``ps``.

    Returns the stems joined by single spaces.
    """
    stems = map(ps.stem, text.split())
    return " ".join(stems)

In [33]:
text = 'walk walks walking walked'
stem_words(text)

'walk walk walk walk'

## Lemmatization

In [42]:
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

sent = 'He was running and eating at the same time. He has a bad habit of swimming after playing long hours in sun'
pun = '?:!.,;'

# Use word_tokenize, which is imported from nltk.tokenize earlier in the
# notebook; the bare `nltk` module is never imported, so `nltk.word_tokenize`
# raises NameError on a fresh kernel.
# Build a filtered list instead of calling .remove() while iterating (that
# skips the element following each removal), and test against the characters
# of `pun` as a set so the check is exact-token membership, not substring search.
pun_tokens = set(pun)
sent_words = [word for word in word_tokenize(sent) if word not in pun_tokens]

print("{0:20}{1:20}".format("Word","Lemma"))

# Lemmatize every remaining token as a verb (pos='v') and print it beside the original.
for word in sent_words:
    print("{0:20}{1:20}".format(word,wl.lemmatize(word,pos='v')))


Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
the                 the                 
same                same                
time                time                
He                  He                  
has                 have                
a                   a                   
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
sun                 sun                 
