# Text Mining

In [4]:
import pandas as pd

# Reading given txt file and creating dataframe
data_frame = pd.read_csv('C:\\Users\\The Dark Knight\\Desktop\\Applications\\Code\\Text_Mining\\User_Reviews_Data\\'\
                          'User_Restaurant_Reviews.txt', sep='\t+', header = None, engine = "python")

# Storing the dataframe in a csv file
data_frame.to_csv('C:\\Users\\The Dark Knight\\Desktop\\Applications\\Code\\Text_Mining\\User_Reviews_Data\\'\
                  'User_Restaurant_Reviews.csv', index = None, header = ['Review','Sentiment'])

# Reading the newly created csv file
user_restaurant_reviews = pd.read_csv('C:\\Users\\The Dark Knight\\Desktop\\Applications\\Code\\Text_Mining\\'\
                  'User_Reviews_Data\\User_Restaurant_Reviews.csv')

In [5]:
# Viewing the data
user_restaurant_reviews.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1.0
1,I learned that if an electric slicer is used t...,
2,But they don't clean the chiles?,
3,Crust is not good.,0.0
4,Not tasty and the texture was just nasty.,0.0


In [7]:
user_restaurant_reviews.shape

(3729, 2)

In [8]:
# Using only top 5 records for sample analysis
limited_user_restaurant_reviews = user_restaurant_reviews[0:5]
limited_user_restaurant_reviews


Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1.0
1,I learned that if an electric slicer is used t...,
2,But they don't clean the chiles?,
3,Crust is not good.,0.0
4,Not tasty and the texture was just nasty.,0.0


In [9]:
limited_user_restaurant_reviews.columns.values

array(['Review', 'Sentiment'], dtype=object)

In [10]:
# Tokenizing
from nltk.tokenize import sent_tokenize, word_tokenize   #loading sentence and word tokenizing function

sample_text = limited_user_restaurant_reviews["Review"][0]
print(sample_text)

Wow... Loved this place.


In [12]:
# Tokenizing the text into sentences using sent_tokenize
sentence_tokens = sent_tokenize(sample_text)
print(sentence_tokens)

['Wow...', 'Loved this place.']


In [13]:
# Tokenizing the text into words using word_tokenize
word_tokens = word_tokenize(sample_text)
print(word_tokens)

['Wow', '...', 'Loved', 'this', 'place', '.']


In [14]:
# Steps to remove stopwords (unimportant words) like a, an ,the, this, that, etc.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print(len(stop_words))

179


In [15]:
print(stop_words)

{'doesn', "should've", 'when', 'under', 'have', 'themselves', 'weren', 'where', 'herself', 'did', 'there', 'does', "don't", 'll', 'why', 'nor', "she's", 'doing', "needn't", 'yours', 'again', 'too', 'didn', 'your', 'were', 'a', 'because', 'him', 'then', 'from', "mightn't", 'of', "aren't", 'haven', 'you', 'any', 'very', 'm', 'above', 'but', 'during', 'an', 'most', 'the', 'than', 'below', 'yourself', 'just', 'wouldn', "that'll", 'needn', 't', 'himself', 'some', 'theirs', 'has', 'while', 'mustn', "mustn't", 'what', 'as', 'through', 'who', 'such', "couldn't", 'wasn', 'they', "you'll", 'on', 'at', 'don', 'no', 'couldn', 'ours', 'once', "wouldn't", 'aren', 'both', 'will', 'further', 'my', "you'd", 'off', 'against', 've', 'hers', 'shan', 'if', 'other', 'won', 'he', "didn't", "hasn't", 'whom', "haven't", 's', 'its', "isn't", 'itself', 'not', 'his', 'do', 'for', 'few', 'in', 'all', 'being', 'same', 'our', 'yourselves', 'down', 'after', 'to', 'over', "won't", 'her', 'more', 'myself', 'how', "it's

In [18]:
period_stop_words = {'.','..','...','....','.....'}
stop_words = stop_words.union(period_stop_words)
print(stop_words)

{'doesn', "should've", 'when', 'under', 'have', 'themselves', 'weren', 'where', 'herself', 'did', 'there', 'does', "don't", 'll', '...', 'why', 'nor', "she's", 'doing', "needn't", 'yours', 'again', 'too', 'didn', 'your', 'were', 'a', 'because', 'him', 'then', 'from', "mightn't", 'of', "aren't", '....', 'haven', 'you', 'any', 'very', 'm', 'above', 'but', 'during', 'an', 'most', 'the', 'than', 'below', 'yourself', 'just', 'wouldn', "that'll", 'needn', 't', 'himself', 'some', 'theirs', 'has', 'while', 'mustn', "mustn't", 'what', 'as', 'through', 'who', 'such', "couldn't", 'wasn', 'they', "you'll", 'on', 'at', 'don', 'no', 'couldn', 'ours', 'once', "wouldn't", 'aren', 'both', 'will', 'further', 'my', "you'd", 'off', 'against', 've', 'hers', 'shan', 'if', 'other', 'won', 'he', "didn't", "hasn't", 'whom', "haven't", 's', 'its', "isn't", 'itself', 'not', 'his', 'do', 'for', 'few', 'in', 'all', 'being', 'same', 'our', 'yourselves', 'down', 'after', 'to', 'over', "won't", 'her', 'more', 'myself

In [19]:
filtered_sample_text = [word for word in word_tokens if word not in stop_words]
print(filtered_sample_text)

['Wow', 'Loved', 'place']


In [25]:
# Using PorterStemmer to stem the words back to root word
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

sample_text_2 = limited_user_restaurant_reviews["Review"][1]
print(sample_text_2)

I learned that if an electric slicer is used the blade becomes hot enough to start to cook the prosciutto.


In [26]:
word_tokens_2 = word_tokenize(sample_text_2)
print(word_tokens_2)

['I', 'learned', 'that', 'if', 'an', 'electric', 'slicer', 'is', 'used', 'the', 'blade', 'becomes', 'hot', 'enough', 'to', 'start', 'to', 'cook', 'the', 'prosciutto', '.']


In [27]:
stem_tokens = [stemmer.stem(word) for word in word_tokens_2]
print(stem_tokens)

['i', 'learn', 'that', 'if', 'an', 'electr', 'slicer', 'is', 'use', 'the', 'blade', 'becom', 'hot', 'enough', 'to', 'start', 'to', 'cook', 'the', 'prosciutto', '.']


In [28]:
# Using lemmatizer to keep meaningful words
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in word_tokens_2]
print(lemmatized_tokens)

['I', 'learned', 'that', 'if', 'an', 'electric', 'slicer', 'is', 'used', 'the', 'blade', 'becomes', 'hot', 'enough', 'to', 'start', 'to', 'cook', 'the', 'prosciutto', '.']


In [29]:
review_22_text = user_restaurant_reviews["Review"][22]

In [30]:
# Using regex expressions to remove punctuations, symbols, currency etc.
import re
review_22_text_cleaned = re.sub(r'\W+|\d+|_',' ', review_22_text)
print("Text after removing currency - \n" + review_22_text_cleaned)

Text after removing currency - 
We ordered     Margaritas they couldnt get the machine to work because it was frozen so refunded our money 


In [31]:
print("Actual text - \n" + review_22_text)

Actual text - 
We ordered 2 $.99 Margaritas - they couldnt get the machine to work because it was frozen so refunded our money.
