In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV read and written by this
# notebook lives under that mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import spacy
from nltk.corpus import sentiwordnet as swn
from IPython.display import clear_output
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly
# Enable plotly's notebook rendering in "connected" mode.
plotly.offline.init_notebook_mode(connected=True)

In [3]:
# Load the raw review texts from the mounted Drive folder.
data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/ML_project_datasheets/review_text_data.csv'
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,reviewText
0,really happy they got evangelised .. spoiler a...
1,Having lived in West New Guinea (Papua) during...
2,Excellent look into contextualizing the Gospel...
3,"More than anything, I've been challenged to fi..."
4,This is a great movie for a missionary going i...


# Preprocess Data

In [5]:
# Take an independent (deep) copy of the 'reviewText' column so the raw
# text in `data` is not touched by later edits to Edited_Review.
Edited_Review = data.loc[:, 'reviewText'].copy(deep=True)

In [6]:
# Preview the first five copied reviews.
Edited_Review.head(n=5)

0    really happy they got evangelised .. spoiler a...
1    Having lived in West New Guinea (Papua) during...
2    Excellent look into contextualizing the Gospel...
3    More than anything, I've been challenged to fi...
4    This is a great movie for a missionary going i...
Name: reviewText, dtype: object

In [7]:
# Attach the copied review text to `data` as the working column
# 'Review_without_stopwords'. The cleaning cells below rewrite this column,
# while 'reviewText' keeps the original text for comparison.
# NOTE(review): no stop-word removal is visible in this part of the notebook;
# presumably it happens in a later step - confirm the column name is accurate.
data['Review_without_stopwords'] = Edited_Review

In [8]:
# Preview: the working column should mirror 'reviewText' at this point.
data.head(n=5)

Unnamed: 0,reviewText,Review_without_stopwords
0,really happy they got evangelised .. spoiler a...,really happy they got evangelised .. spoiler a...
1,Having lived in West New Guinea (Papua) during...,Having lived in West New Guinea (Papua) during...
2,Excellent look into contextualizing the Gospel...,Excellent look into contextualizing the Gospel...
3,"More than anything, I've been challenged to fi...","More than anything, I've been challenged to fi..."
4,This is a great movie for a missionary going i...,This is a great movie for a missionary going i...


In [9]:
# Lowercase the working column so that later pattern matching and token
# comparisons are case-consistent.
lowercased_reviews = data['Review_without_stopwords'].str.lower()
data['Review_without_stopwords'] = lowercased_reviews

In [10]:
# Preview the lowercased working column next to the original text.
data.head(n=5)

Unnamed: 0,reviewText,Review_without_stopwords
0,really happy they got evangelised .. spoiler a...,really happy they got evangelised .. spoiler a...
1,Having lived in West New Guinea (Papua) during...,having lived in west new guinea (papua) during...
2,Excellent look into contextualizing the Gospel...,excellent look into contextualizing the gospel...
3,"More than anything, I've been challenged to fi...","more than anything, i've been challenged to fi..."
4,This is a great movie for a missionary going i...,this is a great movie for a missionary going i...


In [11]:
# Flag whether any review in the working column contains a hashtag
# ('#' immediately followed by at least one non-space character).
hashtags_exist = data['Review_without_stopwords'].str.contains(r'#\S+', case=False).any()
print(
    "Hashtags found in the 'Review_without_stopwords' column."
    if hashtags_exist
    else "No hashtags found in the 'Review_without_stopwords' column."
)

Hashtags found in the 'Review_without_stopwords' column.


In [15]:
# Strip hashtags ('#' followed by non-space characters) from the working column.
# Missing reviews are replaced by an empty string first: calling str() on a
# missing value would inject the literal word "nan" into the text, and every
# later cleaning step would treat it as a real token.
data['Review_without_stopwords'] = (
    data['Review_without_stopwords']
    .fillna('')
    .str.replace(r'#\S+\s*|\s+#\S+', '', regex=True)
)

In [17]:
# Re-evaluate the hashtag check on the cleaned column before printing.
# Printing the flag computed before the removal step would only repeat the
# stale pre-cleaning result on a fresh top-to-bottom run.
hashtags_exist = data['Review_without_stopwords'].str.contains(r'#\S+', case=False).any()
print(hashtags_exist)

False


In [18]:
# Flag whether any review in the working column contains an http(s) URL.
links_exist = (
    data['Review_without_stopwords']
    .str.contains(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        case=False,
    )
    .any()
)
print(
    "links found in the 'Review_without_stopwords' column."
    if links_exist
    else "No links found in the 'Review_without_stopwords' column."
)

links found in the 'Review_without_stopwords' column.


In [21]:
# Delete every run of non-space characters that begins with "http"
# (matched case-insensitively), which removes http/https links from the text.
data['Review_without_stopwords'] = data['Review_without_stopwords'].apply(
    lambda review: re.compile(r"http\S+", re.I).sub("", review)
)

In [23]:
from urllib.parse import urlsplit

def remove_links(text):
    """Return ``text`` with web links removed.

    A link is any run of non-space characters that starts with ``http://``,
    ``https://`` or a standalone ``www.`` (matched case-insensitively).
    Everything else, including ``?``, ``#`` and ``:`` in ordinary prose, is
    left exactly as it was.

    Parsing the whole review with ``urlsplit`` does not work for this purpose:
    it removes nothing, it rewrites the first ``?``/``#`` (and a leading
    ``word:``) as if they were URL delimiters, and it can raise ``ValueError``
    on free text that happens to look like a malformed network location.

    Parameters
    ----------
    text : str
        Review text. Non-string values (e.g. missing values) are returned
        unchanged.

    Returns
    -------
    str
        The text without links. Whitespace around a removed link is kept.
    """
    if not isinstance(text, str):
        return text
    # \b keeps words that merely contain "www." (e.g. "awww.") intact.
    return re.sub(r'(?:https?://|\bwww\.)\S+', '', text, flags=re.IGNORECASE)

# Run remove_links over every review in the working column.
data['Review_without_stopwords'] = data['Review_without_stopwords'].map(remove_links)


In [25]:
# Re-run the URL check on the cleaned column and print the resulting flag.
links_exist = (
    data['Review_without_stopwords']
    .str.contains(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        case=False,
    )
    .any()
)
print(links_exist)

False


In [26]:
# Keep only runs of word characters (letters, digits, underscore) and join
# them with single spaces; punctuation and other symbols are dropped.
# Note that apostrophes split contractions ("didn't" -> "didn t").
data['Review_without_stopwords'] = data['Review_without_stopwords'].apply(
    lambda review: ' '.join(token.group() for token in re.finditer(r'\w+', str(review)))
)

In [27]:
# Preview the first ten rows after punctuation removal.
data.head(n=10)

Unnamed: 0,reviewText,Review_without_stopwords
0,really happy they got evangelised .. spoiler a...,really happy they got evangelised spoiler aler...
1,Having lived in West New Guinea (Papua) during...,having lived in west new guinea papua during t...
2,Excellent look into contextualizing the Gospel...,excellent look into contextualizing the gospel...
3,"More than anything, I've been challenged to fi...",more than anything i ve been challenged to fin...
4,This is a great movie for a missionary going i...,this is a great movie for a missionary going i...
5,This movie was in ENGLISH....it was a great su...,this movie was in english it was a great summa...
6,"This is a fascinating true story, well acted b...",this is a fascinating true story well acted by...
7,This DVD appears to be in German. It is not in...,this dvd appears to be in german it is not in ...
8,This movie is not in English although the titl...,this movie is not in english although the titl...
9,So sorry I didn't purchase this years ago when...,so sorry i didn t purchase this years ago when...


In [28]:
# Replace each run of whitespace with a single space to standardise spacing.
data['Review_without_stopwords'] = data['Review_without_stopwords'].apply(
    lambda review: re.compile(r'\s+', re.I).sub(' ', review)
)

In [29]:
# Preview the first ten rows after whitespace normalisation.
data.head(n=10)

Unnamed: 0,reviewText,Review_without_stopwords
0,really happy they got evangelised .. spoiler a...,really happy they got evangelised spoiler aler...
1,Having lived in West New Guinea (Papua) during...,having lived in west new guinea papua during t...
2,Excellent look into contextualizing the Gospel...,excellent look into contextualizing the gospel...
3,"More than anything, I've been challenged to fi...",more than anything i ve been challenged to fin...
4,This is a great movie for a missionary going i...,this is a great movie for a missionary going i...
5,This movie was in ENGLISH....it was a great su...,this movie was in english it was a great summa...
6,"This is a fascinating true story, well acted b...",this is a fascinating true story well acted by...
7,This DVD appears to be in German. It is not in...,this dvd appears to be in german it is not in ...
8,This movie is not in English although the titl...,this movie is not in english although the titl...
9,So sorry I didn't purchase this years ago when...,so sorry i didn t purchase this years ago when...


In [30]:
# Normalise the whitespace around isolated single letters to exactly one
# space on each side. The letter itself is kept by this step.
data['Review_without_stopwords'] = data['Review_without_stopwords'].apply(
    lambda review: re.compile(r'\s+([a-zA-Z])\s+').sub(r' \1 ', review)
)

In [38]:
# Drop stand-alone single letters (e.g. the "t" left over from "didn t").
# Deleting a letter leaves the spaces on both sides of it behind, so the
# whitespace is collapsed and trimmed again afterwards; otherwise the saved
# text would contain double, leading and trailing spaces.
data['Review_without_stopwords'] = (
    data['Review_without_stopwords']
    .str.replace(r'\b[a-zA-Z]\b', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [39]:
# Preview the fully cleaned working column before saving.
data.head(n=5)

Unnamed: 0,reviewText,Review_without_stopwords
0,really happy they got evangelised .. spoiler a...,really happy they got evangelised spoiler aler...
1,Having lived in West New Guinea (Papua) during...,having lived in west new guinea papua during t...
2,Excellent look into contextualizing the Gospel...,excellent look into contextualizing the gospel...
3,"More than anything, I've been challenged to fi...",more than anything ve been challenged to find...
4,This is a great movie for a missionary going i...,this is great movie for missionary going int...


In [40]:
# Persist the preprocessed frame to Drive without writing the row index.
data.to_csv(
    path_or_buf="/content/drive/MyDrive/ML_project_datasheets/Final_code/preprocessed_data.csv",
    index=False,
)