## 1. Duplicate Review Removal

In [5]:
import pandas as pd
import warnings as wn
wn.filterwarnings("ignore")

In [6]:
# Load the reviews into a single 'Text' column. header=None means the first
# line of the file is read as data, not as column names.
# NOTE(review): absolute, machine-specific path -- consider a path relative to
# a configurable data directory so the notebook runs elsewhere.
csv_path = "C:\\Users\\shefs\\Documents\\Downloads\\review_text.csv"
review_text = pd.read_csv(csv_path, header=None, names=['Text'])
review_text

Unnamed: 0,Text
0,This is the first pair of shoes I got online
1,First few days was a tad tight but now that th...
2,The soles are not hard but not too flexible ei...
3,Usually wear a 9.5 but after trying them on at...
4,I had the Kinvara 4 running shoes
...,...
95,These are very nice shoes .
96,Quality is great
97,I like the shoe a lot .
98,I bought the yellow colored from Amazon ( as m...


In [7]:
# Number of words per review.
# str.split() with no argument splits on any run of whitespace, so repeated,
# leading or trailing spaces are not counted as empty "words" (split(" ")
# would count them). This also matches how avg_word tokenises.
# str(x) keeps non-string cells from raising.

review_text['word_count'] = review_text['Text'].apply(lambda text: len(str(text).split()))
review_text[['Text','word_count']].head()

Unnamed: 0,Text,word_count
0,This is the first pair of shoes I got online,10
1,First few days was a tad tight but now that th...,20
2,The soles are not hard but not too flexible ei...,11
3,Usually wear a 9.5 but after trying them on at...,19
4,I had the Kinvara 4 running shoes,7


In [8]:
# Character count per review (whitespace characters are included in the length).

text_lengths = review_text['Text'].str.len()
review_text['char_count'] = text_lengths
review_text[['Text', 'char_count']].head()

Unnamed: 0,Text,char_count
0,This is the first pair of shoes I got online,44
1,First few days was a tad tight but now that th...,86
2,The soles are not hard but not too flexible ei...,52
3,Usually wear a 9.5 but after trying them on at...,81
4,I had the Kinvara 4 running shoes,33


In [9]:
# Average Word Length

def avg_word(sentence):
    """Return the mean length of the whitespace-separated words in ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to measure; it is tokenised with ``str.split()``.

    Returns
    -------
    float
        Total number of characters across all words divided by the number of
        words, or ``0.0`` when the sentence contains no words (empty or
        whitespace-only text).
    """
    words = sentence.split()
    if not words:
        # Avoid ZeroDivisionError on empty / whitespace-only text.
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Apply avg_word to every review and preview the resulting column.
mean_word_lengths = review_text['Text'].apply(avg_word)
review_text['avg_word'] = mean_word_lengths
review_text[['Text', 'avg_word']].head()

Unnamed: 0,Text,avg_word
0,This is the first pair of shoes I got online,3.5
1,First few days was a tad tight but now that th...,3.35
2,The soles are not hard but not too flexible ei...,3.818182
3,Usually wear a 9.5 but after trying them on at...,3.315789
4,I had the Kinvara 4 running shoes,3.857143


## Basic Pre-processing

In [10]:
# Lower-case every token. Splitting and re-joining also collapses runs of
# whitespace into single spaces.

review_text['Text'] = review_text['Text'].apply(
    lambda text: " ".join(token.lower() for token in text.split())
)
review_text['Text'].head()

0         this is the first pair of shoes i got online
1    first few days was a tad tight but now that th...
2    the soles are not hard but not too flexible ei...
3    usually wear a 9.5 but after trying them on at...
4                    i had the kinvara 4 running shoes
Name: Text, dtype: object

In [11]:
# Removing punctuation: delete every character that is neither a word
# character nor whitespace.
# The pattern is a raw string (a plain '\w' is an invalid escape sequence) and
# regex=True is passed explicitly, because pandas >= 2.0 treats the pattern as
# a literal string by default and would otherwise remove nothing.
# NOTE(review): characters are deleted rather than replaced by a space, so
# "9.5" becomes "95" -- confirm that is acceptable for this analysis.

review_text['Text'] = review_text['Text'].str.replace(r'[^\w\s]', '', regex=True)
review_text['Text'].head()

0         this is the first pair of shoes i got online
1    first few days was a tad tight but now that th...
2    the soles are not hard but not too flexible ei...
3    usually wear a 95 but after trying them on at ...
4                    i had the kinvara 4 running shoes
Name: Text, dtype: object

In [12]:
# Removal of stop words: keep only tokens that are not in NLTK's English
# stop-word list.

from nltk.corpus import stopwords
stop = stopwords.words('english')
# Membership is tested once per token, so look tokens up in a set instead of
# scanning the whole stop-word collection each time; `stop` itself is left unchanged.
stop_lookup = set(stop)
review_text['Text'] = review_text['Text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_lookup)
)
review_text['Text'].head()

0                   first pair shoes got online
1    first days tad tight broken fit like glove
2                    soles hard flexible either
3    usually wear 95 trying store first went 10
4                       kinvara 4 running shoes
Name: Text, dtype: object

In [13]:
# Common word removal, step 1: count every token across all reviews and keep
# the ten most frequent ones.

all_tokens = ' '.join(review_text['Text']).split()
freq = pd.Series(all_tokens).value_counts().iloc[:10]
freq

shoes          25
shoe           17
great          16
good           13
running        11
comfortable    11
nice            8
fit             8
pair            7
color           6
dtype: int64

In [14]:
# Common word removal, step 2: drop the most frequent tokens from every review.
# `freq` arrives as the value_counts Series and is turned into a plain list of
# words. The isinstance guard keeps the cell re-runnable: on a second run
# `freq` is already a list, and list(freq.index) would raise TypeError.
if isinstance(freq, pd.Series):
    freq = list(freq.index)
common_words = set(freq)  # set lookup for the per-token membership test
review_text['Text'] = review_text['Text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in common_words)
)
review_text['Text'].head()

0                              first got online
1        first days tad tight broken like glove
2                    soles hard flexible either
3    usually wear 95 trying store first went 10
4                                     kinvara 4
Name: Text, dtype: object

In [15]:
# Rare words removal, step 1: count every token across all reviews and keep
# the last ten entries of the frequency table (the least frequent end).
# NOTE(review): the displayed counts are all 1; if more than ten tokens share
# the lowest count, which ten end up here depends on tie ordering -- confirm
# this selection is what is intended.

remaining_tokens = ' '.join(review_text['Text']).split()
freq1 = pd.Series(remaining_tokens).value_counts().iloc[-10:]
freq1



wise       1
asics      1
times      1
width      1
loving     1
service    1
fell       1
better     1
far34      1
weight     1
dtype: int64

In [16]:
# Rare words removal, step 2: drop the selected rare tokens from every review.
# `freq1` arrives as the value_counts Series and is turned into a plain list of
# words. The isinstance guard keeps the cell re-runnable: on a second run
# `freq1` is already a list, and list(freq1.index) would raise TypeError.
if isinstance(freq1, pd.Series):
    freq1 = list(freq1.index)
rare_words = set(freq1)  # set lookup for the per-token membership test
review_text['Text'] = review_text['Text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in rare_words)
)
review_text['Text'].head()

0                              first got online
1        first days tad tight broken like glove
2                    soles hard flexible either
3    usually wear 95 trying store first went 10
4                                     kinvara 4
Name: Text, dtype: object

In [17]:
# Spelling correction
!pip install textblob
from textblob import TextBlob
review_text['Text'][:5].apply(lambda x: str(TextBlob(x).correct()))



0                              first got online
1        first days had tight broken like glove
2                    soles hard flexible either
3    usually wear 95 trying store first went 10
4                                     kinvara 4
Name: Text, dtype: object

In [18]:
# Tokenization: split the review stored under index label 1 into words
# using TextBlob.

sample_review = review_text.loc[1, 'Text']
TextBlob(sample_review).words

WordList(['first', 'days', 'tad', 'tight', 'broken', 'like', 'glove'])

In [19]:
# Stemming with NLTK's Porter stemmer, previewed on the first five reviews;
# the 'Text' column is not modified by this cell.

from nltk.stem import PorterStemmer
st = PorterStemmer()
review_text['Text'].iloc[:5].apply(
    lambda text: " ".join(st.stem(token) for token in text.split())
)

0                          first got onlin
1    first day tad tight broken like glove
2                 sole hard flexibl either
3    usual wear 95 tri store first went 10
4                                kinvara 4
Name: Text, dtype: object

In [20]:
# Lemmatization: replace each token of every review with the lemma returned
# by textblob's Word.lemmatize(), which needs the WordNet corpus.
import nltk
nltk.download('wordnet')

from textblob import Word


def _lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join with spaces."""
    return " ".join(Word(token).lemmatize() for token in text.split())


review_text['Text'] = review_text['Text'].apply(_lemmatize_text)
review_text['Text'].head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shefs\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0                              first got online
1         first day tad tight broken like glove
2                      sol hard flexible either
3    usually wear 95 trying store first went 10
4                                     kinvara 4
Name: Text, dtype: object

## Term Frequency – Inverse Document Frequency (TF-IDF)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level, unigram TF-IDF over the cleaned reviews, keeping at most 1000
# terms and applying scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    lowercase=True,
    stop_words='english',
    max_features=1000,
)
text_vect = tfidf.fit_transform(review_text['Text'])

text_vect

<100x195 sparse matrix of type '<class 'numpy.float64'>'
	with 334 stored elements in Compressed Sparse Row format>

In [22]:
# Pairwise review similarity: matrix product of the TF-IDF matrix with its
# transpose, giving one row and one column per review.
# `@` always means matrix multiplication, whereas `*` is a matrix product only
# for SciPy's legacy sparse *matrix* classes (it is element-wise for sparse
# arrays), so `@` keeps this correct whichever sparse type is returned.
# NOTE(review): entries are cosine similarities only if the TF-IDF rows are
# L2-normalised (presumably the vectorizer default here -- confirm).
pairwise_similarity = text_vect @ text_vect.T
pairwise_similarity

<100x100 sparse matrix of type '<class 'numpy.float64'>'
	with 446 stored elements in Compressed Sparse Row format>

In [23]:
# Convert the sparse similarity matrix to a dense array so it can be inspected.
pairwise_similarity.toarray() 

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.23851807, 0.        ,
        0.08180104],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.23851807, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.08180104, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [24]:
# Print the sparse matrix; the output lists each stored entry as "(row, col) value".
print(pairwise_similarity)

  (0, 67)	0.28666040789831354
  (0, 0)	0.9999999999999998
  (1, 99)	0.08180103637801456
  (1, 83)	0.14619492353001123
  (1, 77)	0.09027169276480414
  (1, 73)	0.08180103637801456
  (1, 65)	0.12268908325363599
  (1, 19)	0.14619492353001123
  (1, 14)	0.12587621246660446
  (1, 13)	0.08840837106681712
  (1, 97)	0.23851807325664745
  (1, 59)	0.34545177626908213
  (1, 48)	0.16631148676471602
  (1, 33)	0.23851807325664745
  (1, 1)	0.9999999999999999
  (2, 2)	1.0
  (3, 79)	0.14711017963822168
  (3, 65)	0.16457566550482225
  (3, 14)	0.1688508943790147
  (3, 67)	0.15871757709482678
  (3, 99)	0.10972826305226265
  (3, 73)	0.10972826305226265
  (3, 8)	0.12992945562709254
  (3, 71)	0.11727009543134674
  (3, 3)	1.0
  :	:
  (96, 32)	1.0
  (97, 59)	0.6904525888755569
  (97, 48)	0.3324058652603269
  (97, 1)	0.23851807325664745
  (97, 97)	1.0000000000000002
  (97, 91)	0.22673578812665154
  (97, 33)	1.0000000000000002
  (97, 27)	0.22673578812665154
  (98, 68)	0.1553974887329577
  (98, 46)	0.14617006055774

In [25]:
# Sentiment analysis: preview TextBlob's sentiment result for the first five
# reviews (nothing is stored by this cell).

review_text['Text'].head(5).apply(lambda text: TextBlob(text).sentiment)

0                     (0.25, 0.3333333333333333)
1    (-0.10952380952380954, 0.33968253968253964)
2      (-0.2916666666666667, 0.5416666666666666)
3                     (0.0, 0.29166666666666663)
4                                     (0.0, 0.0)
Name: Text, dtype: object

In [26]:
# Store the first component of TextBlob's sentiment result (its polarity)
# for every review in a new 'sentiment' column.
polarity_scores = review_text['Text'].apply(lambda text: TextBlob(text).sentiment.polarity)
review_text['sentiment'] = polarity_scores
review_text[['Text', 'sentiment']].head()

Unnamed: 0,Text,sentiment
0,first got online,0.25
1,first day tad tight broken like glove,-0.109524
2,sol hard flexible either,-0.291667
3,usually wear 95 trying store first went 10,0.0
4,kinvara 4,0.0


In [36]:
# Display the processed 'Text' column before de-duplication.
review_text['Text']

0                                    first got online
1               first day tad tight broken like glove
2                            sol hard flexible either
3          usually wear 95 trying store first went 10
4                                           kinvara 4
                           ...                       
95                                                   
96                                            quality
97                                           like lot
98                 bought yellow colored amazon third
99    wore day shipping squished ness went away comfy
Name: Text, Length: 100, dtype: object

In [43]:
# Duplicate removal: keep the first row for each distinct processed 'Text'
# value and drop later repeats. The original index labels are retained.
# NOTE(review): comparison is on the cleaned text, so reviews that reduce to
# the same tokens (including an empty string) count as duplicates.
is_repeat = review_text.duplicated(subset=['Text'], keep='first')
final_text = review_text.loc[~is_repeat]
final_text

Unnamed: 0,Text,word_count,char_count,avg_word,sentiment
0,first got online,10,44,3.500000,0.250000
1,first day tad tight broken like glove,20,86,3.350000,-0.109524
2,sol hard flexible either,11,52,3.818182,-0.291667
3,usually wear 95 trying store first went 10,19,81,3.315789,0.000000
4,kinvara 4,7,33,3.857143,0.000000
...,...,...,...,...,...
75,everyday,4,32,7.250000,-0.200000
77,perfect next 5 k 12 day love heel city,23,99,3.347826,0.500000
78,walking runner,12,51,3.333333,0.000000
79,new balance support well light wear,12,64,4.416667,0.268182
