## Installations

In [1]:
from google.colab import drive
drive.mount('/content/drive')
# !cd drive
# !cd MyDrive/IP_S7/Video

Mounted at /content/drive


In [2]:
cd drive

/content/drive


In [3]:
cd "MyDrive/Data Science Project/Data"

/content/drive/MyDrive/Data Science Project/Data


## Imports

In [1]:
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\praty\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocessing Function

In [4]:
df = pd.read_csv('Data/Cities_Labelled/MasterData.csv')

In [5]:
import re
text= ' '.join(t for t in df['Review'])
text = re.sub('\n', '', text)
text = re.sub('\?', '', text)
text = re.sub('"', '', text)
text = re.sub("'", '', text)
words_list= text.split()
word_freq= dict()
for word in set(words_list):
  word_freq[word] = 0
for word in words_list:
  word_freq[word] += 1   
df_word= pd.DataFrame(word_freq.items(),columns=['word','count'])
# df_word['word_len']= df_word['word'].map(lambda x: len(x))
df_word=df_word.sort_values('count',ascending=False).reset_index(drop=True)
df_word.head()

Unnamed: 0,word,count
0,I,793082
1,hotel,621446
2,room,583053
3,not,512572
4,stay,341127


In [6]:
words = df_word.values.tolist()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

df_top= df_word.head(100)
f = plt.figure()
f.set_figwidth(10)
f.set_figheight(100)
sns.barplot(x=df_top['count'], y=df_top['word'])
plt.title('Word Count Distribution')
plt.show()

In [15]:
nlp = spacy.load('en_core_web_md')
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [18]:
stopword_list.extend(["I", "street", "london", "book", "restaurant", "bathroom", "night", "location", "hotel", "room"])

In [20]:
stopword_list.append("nan")

In [21]:
len(stopword_list)

188

In [58]:
# # Cleaning Text - strip HTML
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text


# # Removing accented characters
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


# # Expanding Contractions
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):

    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) \
                                   if contraction_mapping.get(match) \
                                    else contraction_mapping.get(match.lower())
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text


# # Removing Special Characters
def remove_special_characters(text):
    text = re.sub('[^a-zA-Z0-9\s]', '', text)
    return text


# # Lemmatizing text
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text


# # Removing Stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text


# # Normalize text corpus - tying it all together
def normalize_corpus(corpus, html_stripping=False, contraction_expansion=False,
                     accented_char_removal=False, text_lower_case=False,
                     text_lemmatization=False, special_char_removal=False,
                     stopword_removal=True):

    normalized_corpus = []

    for i in tqdm(range(len(corpus))):
        
        doc = corpus[i][0]
        if html_stripping:
            doc = strip_html_tags(doc)

        if accented_char_removal:
            doc = remove_accented_chars(doc)

        if contraction_expansion:
            doc = expand_contractions(doc)

        if text_lower_case:
            doc = doc.lower()

        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # insert spaces between special characters to isolate them
        special_char_pattern = re.compile(r'([{.(-)!}])')
        doc = special_char_pattern.sub(" \\1 ", doc)

        if text_lemmatization:
            doc = lemmatize_text(doc)

        if special_char_removal:
            doc = remove_special_characters(doc)

        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        doc = re.sub('\d+', '', doc)
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        if doc != '':
            arb = [doc, corpus[i][1]]
            normalized_corpus.append(arb)

    return normalized_corpus


## Preprocessing for Model Training

In [30]:
df = pd.read_csv('Data/Cities_Labelled/MasterData.csv')
df.head()

Unnamed: 0,Review,Sentiment,Score
0,nice trendy hotel location not bad I stay hote...,POSITIVE,0.998912
1,great budget hotel stay two night aloft recent...,POSITIVE,0.998913
2,excellent value location not big problem stay ...,POSITIVE,0.998927
3,stylish clean reasonable value poor location I...,NEGATIVE,0.00069
4,remote excellent value money stay one night ho...,POSITIVE,0.998909


In [32]:
df = df.drop(['Score'], axis=1)

In [33]:
df.head()

Unnamed: 0,Review,Sentiment
0,nice trendy hotel location not bad I stay hote...,POSITIVE
1,great budget hotel stay two night aloft recent...,POSITIVE
2,excellent value location not big problem stay ...,POSITIVE
3,stylish clean reasonable value poor location I...,NEGATIVE
4,remote excellent value money stay one night ho...,POSITIVE


In [34]:
df_pos = df[df['Sentiment'] == 'POSITIVE']
df_neg = df[df['Sentiment'] == 'NEGATIVE']

In [35]:
df_pos.shape

(192315, 2)

In [36]:
df_neg.shape

(62279, 2)

In [37]:
data = df_neg.values.tolist()

In [40]:
ind = np.random.choice(np.arange(df_pos.shape[0]), df_neg.shape[0], replace=False)

In [41]:
pos = df_pos.to_numpy()[ind].tolist()

In [45]:
data.extend(pos)

In [59]:
data = normalize_corpus(data)

100%|██████████| 123398/123398 [00:32<00:00, 3810.28it/s]


In [60]:
len(data)

123372

In [63]:
df = pd.DataFrame(data, columns=("Review","Sentiment"))


In [65]:
df[df['Sentiment'] == "POSITIVE"].shape

(62269, 2)

In [67]:
df[df['Sentiment'] == "NEGATIVE"].shape

(61103, 2)

In [68]:
df.to_csv(f"Data/Cities_Labelled/ModelTrain.csv", index=False)

## Preprocessing Whole Data

In [10]:
dir = "Data/Cities/"
num = 0
ct = ["london.csv"]
for city in ct:
  # text = ""
  print(f"City: {city}")
  fname = os.path.join(dir, city)
  if os.path.isfile(fname):
    df = pd.read_csv(fname)
    df = df.drop(['Date'], axis=1)
    data = df.values.tolist()
    for i in range(len(data)):
      data[i][0] = str(data[i][0]) + " " + str(data[i][1])
      data[i] = data[i][0]
    data = normalize_corpus(data)
    df = pd.DataFrame(data, columns=("Review",))
    print(f"Shape: {df.shape}")
    # Removing Null Reviews
    df = df[df['Review'] != 'nan']
    print(f"New Shape: {df.shape}")
    df.to_csv(f"Data/Cities_Processed/{city}", index=False)


City: london.csv


  soup = BeautifulSoup(text, "html.parser")
100%|██████████| 79350/79350 [3:15:39<00:00,  6.76it/s]      


Shape: (79350, 1)
New Shape: (78946, 1)
