In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
train = pd.read_csv("Replaced.csv", encoding='ISO-8859-1')

#### Number of Words


In [3]:
train['word_count'] = train['text'].apply(lambda x: len(str(x).split(" ")))
train[['text','word_count']].head()

Unnamed: 0,text,word_count
0,i love this album. it's very good. more to the...,41
1,Good flavor. This review was collected as part...,11
2,Good flavor.,2
3,I read through the reviews on here before look...,124
4,My husband bought this gel for us. The gel cau...,25


#### Number of characters


In [4]:
train['char_count'] = train['text'].str.len() 
train[['text','char_count']].head()

Unnamed: 0,text,char_count
0,i love this album. it's very good. more to the...,201.0
1,Good flavor. This review was collected as part...,62.0
2,Good flavor.,12.0
3,I read through the reviews on here before look...,696.0
4,My husband bought this gel for us. The gel cau...,132.0


#### Eliminating empty cells or cells containing NaN values


In [5]:
train = train[train['text'].notnull()]

#### Average Word Length



In [6]:
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['text'].apply(lambda x: avg_word(x))
train[['text','avg_word']].head()

Unnamed: 0,text,avg_word
0,i love this album. it's very good. more to the...,3.926829
1,Good flavor. This review was collected as part...,4.727273
2,Good flavor.,5.5
3,I read through the reviews on here before look...,4.620968
4,My husband bought this gel for us. The gel cau...,4.32


#### Number of stopwords



In [7]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

train['stopwords'] = train['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['text','stopwords']].head()

Unnamed: 0,text,stopwords
0,i love this album. it's very good. more to the...,21
1,Good flavor. This review was collected as part...,4
2,Good flavor.,0
3,I read through the reviews on here before look...,57
4,My husband bought this gel for us. The gel cau...,9


#### Number of special characters


In [8]:
train['hastags'] = train['text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['text','hastags']].head()

Unnamed: 0,text,hastags
0,i love this album. it's very good. more to the...,0
1,Good flavor. This review was collected as part...,0
2,Good flavor.,0
3,I read through the reviews on here before look...,0
4,My husband bought this gel for us. The gel cau...,0


In [9]:
train['@ character'] = train['text'].apply(lambda x: len([x for x in x.split() if x.startswith('@')]))
train[['text','@ character']].head()

Unnamed: 0,text,@ character
0,i love this album. it's very good. more to the...,0
1,Good flavor. This review was collected as part...,0
2,Good flavor.,0
3,I read through the reviews on here before look...,0
4,My husband bought this gel for us. The gel cau...,0


#### Number of numerics



In [10]:
train['numerics'] = train['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['text','numerics']].head()

Unnamed: 0,text,numerics
0,i love this album. it's very good. more to the...,0
1,Good flavor. This review was collected as part...,0
2,Good flavor.,0
3,I read through the reviews on here before look...,0
4,My husband bought this gel for us. The gel cau...,0


#### Number of Uppercase words



In [11]:
train['upper'] = train['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['text','upper']].head()

Unnamed: 0,text,upper
0,i love this album. it's very good. more to the...,2
1,Good flavor. This review was collected as part...,0
2,Good flavor.,0
3,I read through the reviews on here before look...,7
4,My husband bought this gel for us. The gel cau...,1


### Basic Pre-processing

#### Lower case


In [12]:
train['text'] = train['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['text'].head()

0    i love this album. it's very good. more to the...
1    good flavor. this review was collected as part...
2                                         good flavor.
3    i read through the reviews on here before look...
4    my husband bought this gel for us. the gel cau...
Name: text, dtype: object

#### Removing Punctuation



In [13]:
train['text'] = train['text'].str.replace('[^\w\s]','')
train['text'].head()

0    i love this album its very good more to the hi...
1    good flavor this review was collected as part ...
2                                          good flavor
3    i read through the reviews on here before look...
4    my husband bought this gel for us the gel caus...
Name: text, dtype: object

#### Removal of Stop Words



In [14]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['text'].head()

0    love album good hip hop side current pop sound...
1          good flavor review collected part promotion
2                                          good flavor
3    read reviews looking buying one couples lubric...
4    husband bought gel us gel caused irritation fe...
Name: text, dtype: object

#### Common word removal



In [15]:
freq = pd.Series(' '.join(train['text']).split()).value_counts()[:10]
freq

great        20936
product      20247
movie        19729
review       18906
part         18665
promotion    17733
collected    17725
love         16885
use          15946
good         12197
dtype: int64

In [16]:
freq = list(freq.index)
train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['text'].head()

0    album hip hop side current pop sound hype list...
1                                               flavor
2                                               flavor
3    read reviews looking buying one couples lubric...
4    husband bought gel us gel caused irritation fe...
Name: text, dtype: object

#### Rare words removal



In [17]:
freq = pd.Series(' '.join(train['text']).split()).value_counts()[-10:]
freq

mealsnack        1
1yearold         1
loadwhich        1
wellclean        1
wordlol          1
wipejust         1
soff             1
wallisoverall    1
unwatchably      1
eyesthey         1
dtype: int64

In [18]:
freq = list(freq.index)
train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['text'].head()

0    album hip hop side current pop sound hype list...
1                                               flavor
2                                               flavor
3    read reviews looking buying one couples lubric...
4    husband bought gel us gel caused irritation fe...
Name: text, dtype: object

#### Spelling correction



In [19]:
from textblob import TextBlob
train['text'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    album hip hop side current pop sound hope list...
1                                               flavor
2                                               flavor
3    read reviews looking buying one couples lubric...
4    husband bought get us get caused irritation fe...
Name: text, dtype: object

#### Tokenization


In [20]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dev\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
TextBlob(train['text'][0]).words

WordList(['album', 'hip', 'hop', 'side', 'current', 'pop', 'sound', 'hype', 'listen', 'everyday', 'gym', 'give', '5star', 'rating', 'way', 'metaphors', 'crazy'])

#### Stemming



In [22]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['text'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0    album hip hop side current pop sound hype list...
1                                               flavor
2                                               flavor
3    read review look buy one coupl lubric ultim di...
4    husband bought gel us gel caus irrit felt like...
Name: text, dtype: object

#### Lemmatization



In [23]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dev\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
from textblob import Word
train['text'] = train['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['text'].head()

0    album hip hop side current pop sound hype list...
1                                               flavor
2                                               flavor
3    read review looking buying one couple lubrican...
4    husband bought gel u gel caused irritation fel...
Name: text, dtype: object

### Advance Text Processing

#### N-grams



In [25]:
TextBlob(train['text'][0]).ngrams(2)

[WordList(['album', 'hip']),
 WordList(['hip', 'hop']),
 WordList(['hop', 'side']),
 WordList(['side', 'current']),
 WordList(['current', 'pop']),
 WordList(['pop', 'sound']),
 WordList(['sound', 'hype']),
 WordList(['hype', 'listen']),
 WordList(['listen', 'everyday']),
 WordList(['everyday', 'gym']),
 WordList(['gym', 'give']),
 WordList(['give', '5star']),
 WordList(['5star', 'rating']),
 WordList(['rating', 'way']),
 WordList(['way', 'metaphor']),
 WordList(['metaphor', 'crazy'])]

#### Term frequency


In [26]:
tf1 = (train['text'][1:3]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,flavor,2


#### Inverse Document Frequency



In [27]:
for i,word in enumerate(tf1['words']):
  tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['text'].str.contains(word)])))

tf1

Unnamed: 0,words,tf,idf
0,flavor,2,5.141692


#### Term Frequency – Inverse Document Frequency (TF-IDF)



In [28]:
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,flavor,2,5.141692,10.283383


***TfidfVectorizer***



In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['text'])

train_vect

<70967x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 721460 stored elements in Compressed Sparse Row format>

#### Bag of Words



In [30]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['text'])

train_bow

<70967x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 852741 stored elements in Compressed Sparse Row format>

#### Sentiment Analysis



In [31]:
train['text'][:5].apply(lambda x: TextBlob(x).sentiment)

0                 (-0.09999999999999999, 0.575)
1                                    (0.0, 0.0)
2                                    (0.0, 0.0)
3    (0.014090909090909083, 0.6594444444444445)
4                                    (0.0, 0.0)
Name: text, dtype: object

In [32]:
train['sentiment'] = train['text'].apply(lambda x: TextBlob(x).sentiment[0] )
train[['text','sentiment']].head()

Unnamed: 0,text,sentiment
0,album hip hop side current pop sound hype list...,-0.1
1,flavor,0.0
2,flavor,0.0
3,read review looking buying one couple lubrican...,0.014091
4,husband bought gel u gel caused irritation fel...,0.0


#### Hashing with HashingVectorizer

In [33]:
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(n_features=20)
vector = vectorizer.transform(train['text'])

vector

<70967x20 sparse matrix of type '<class 'numpy.float64'>'
	with 648047 stored elements in Compressed Sparse Row format>