In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch to the project folder on Drive; the relative paths used by later cells resolve against it
%cd "/content/drive/My Drive/sma_colab/final_assessment"

/content/drive/My Drive/sma_colab/final_assessment


# **Import Libraries**

In [None]:
import requests
from scrapy.selector import Selector
import json
from html import unescape
import csv
import os
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# **Function to create a folder if it does not exist**

In [None]:
def create_folder(folder_path):
  """Create ``folder_path`` (and any missing parent folders) if needed.

  Calling this for a directory that already exists is a no-op.

  Args:
    folder_path: Path of the directory to create.

  Raises:
    FileExistsError: If ``folder_path`` exists but is not a directory.
  """
  # exist_ok=True replaces the separate os.path.exists() check, which could
  # race with another process creating the folder between check and creation.
  os.makedirs(folder_path, exist_ok=True)

In [None]:
# One top-level folder per pizza chain
create_folder(folder_path="dominos")
create_folder(folder_path="papa_johns")

In [None]:
# Inside each chain folder: one sub-folder per review site

# Dominos
create_folder(folder_path="dominos/trust_pilot")
create_folder(folder_path="dominos/consumer_affairs")

# Papa Johns
create_folder(folder_path="papa_johns/trust_pilot")
create_folder(folder_path="papa_johns/consumer_affairs")

# **Function to download the HTML pages from ConsumerAffairs and Trustpilot**

In [None]:
def download_pages(url_link, folder_name, end_loop = 101, url_end = None):
  """Download numbered listing pages and save each one as a local HTML file.

  For every page number in ``range(1, end_loop)`` the URL
  ``url_link + str(page_no) + url_end`` is requested and the response body is
  written to ``folder_name + "page<page_no>.html"``.

  Args:
    url_link: URL prefix that the page number is appended to.
    folder_name: Prefix of the output path. It is concatenated as-is, so pass
      a trailing separator to write inside a directory.
    end_loop: Exclusive upper bound of the page numbers (default 101, i.e.
      page numbers 1 to 100).
    url_end: Optional suffix appended after the page number.

  Responses with a non-success HTTP status are reported and not saved.
  """
  suffix = url_end if url_end is not None else ""
  for page_no in range(1, end_loop):
    # Build each URL from the untouched prefix. Re-assigning url_link here
    # would make the page numbers pile up ("page=1", "page=12", "page=123").
    page_url = url_link + str(page_no) + suffix
    # A timeout keeps one stalled request from hanging the whole download.
    response = requests.get(page_url, timeout=30)
    if not response.ok:
      # An error page holds no reviews, so it is not stored.
      print("Skipping " + page_url + " (HTTP " + str(response.status_code) + ")")
      continue
    file_name = folder_name + "page" + str(page_no) + ".html"
    with open(file_name, 'w', encoding='utf-8') as file:
      file.write(response.text)

In [None]:
# Dominos on Trustpilot; end_loop keeps its default, so page numbers 1-100 are used
download_pages(
  url_link="https://uk.trustpilot.com/review/www.dominos.co.uk?page=",
  folder_name="dominos/trust_pilot/",
)

In [None]:
# Papa Johns on Trustpilot; end_loop keeps its default, so page numbers 1-100 are used
download_pages(
  url_link="https://uk.trustpilot.com/review/www.papajohns.co.uk?page=",
  folder_name="papa_johns/trust_pilot/",
)

In [None]:
# Dominos on ConsumerAffairs; end_loop is exclusive, so page numbers 1-5 are used
download_pages(
  url_link="https://www.consumeraffairs.com/food/dominos.html?page=",
  folder_name="dominos/consumer_affairs/",
  end_loop=6,
  url_end="#scroll_to_reviews=true",
)

In [None]:
# Papa Johns on ConsumerAffairs; end_loop is exclusive, so page numbers 1-5 are used
download_pages(
  url_link="https://www.consumeraffairs.com/food/papa_johns.html?page=",
  folder_name="papa_johns/consumer_affairs/",
  end_loop=6,
  url_end="#scroll_to_reviews=true",
)

# **Extracting Trust Pilot Reviews**

In [None]:
def extract_trust_pilot(company_name):
  """Collect the reviews stored in the saved Trustpilot pages of one company.

  Every file in ``<company_name>/trust_pilot`` is read, the first
  ``application/ld+json`` script of the page is parsed, and the ``@graph``
  entries whose ``@type`` is ``Review`` are kept.

  Args:
    company_name: Folder prefix of the company; also stored in every row.

  Returns:
    A list of ``[published_date, review, rating, company_name, "trust_pilot"]``
    rows. Review text is stripped and non-ASCII characters are dropped.
    Pages without usable JSON-LD contribute no rows instead of raising.
  """
  folder_path = company_name + '/trust_pilot'
  extracted_data = []
  # os.listdir() returns entries in arbitrary order; sorting keeps the row
  # order reproducible between runs.
  for file_name in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file_name)
    if not os.path.isfile(file_path):
      # Sub-directories cannot be opened as pages.
      continue
    with open(file_path, 'r', encoding='utf-8') as file:
      file_content = file.read()
    ld_json_blocks = Selector(text=file_content).css('script[type="application/ld+json"]::text').extract()
    if not ld_json_blocks:
      # The saved page has no JSON-LD script, so there is nothing to index.
      continue
    try:
      page_data = json.loads(ld_json_blocks[0])
    except json.JSONDecodeError:
      # Truncated or malformed JSON-LD: skip this page only.
      continue
    if not isinstance(page_data, dict):
      continue
    for json_data in page_data.get('@graph') or []:
      if not isinstance(json_data, dict) or json_data.get('@type') != 'Review':
        continue
      published_date = json_data.get('datePublished')
      review = json_data.get('reviewBody') or ""
      rating = (json_data.get('reviewRating') or {}).get('ratingValue')
      extracted_data.append([published_date, review.strip().encode('ascii', 'ignore').decode('ascii'), rating, company_name, "trust_pilot"])
  return extracted_data

In [None]:
# Trustpilot rows parsed from the saved pages under dominos/trust_pilot
dominos_reviews_tp = extract_trust_pilot(company_name="dominos")

In [None]:
# Trustpilot rows parsed from the saved pages under papa_johns/trust_pilot
papa_johns_reviews_tp = extract_trust_pilot(company_name="papa_johns")

# **Extracting Consumer Affairs Reviews**

In [None]:
def extract_consumer_affair(company_name):
  """Collect the reviews stored in the saved ConsumerAffairs pages of one company.

  Every file in ``<company_name>/consumer_affairs`` is read and the review
  dates and paragraphs found under ``#reviews-container .rvw__dtls`` are
  turned into rows.

  Dates and texts are paired inside each ``.rvw__dtls`` element rather than
  across the whole page, so a review made of several paragraphs no longer
  shifts the date/review pairing of the reviews that follow it.
  NOTE(review): this assumes each ``.rvw__dtls`` element wraps one review;
  confirm against the saved HTML. An element holding more than one date falls
  back to pairing dates and paragraphs by position.

  Args:
    company_name: Folder prefix of the company; also stored in every row.

  Returns:
    A list of ``[date, review, None, company_name, "consumer_affairs"]`` rows
    (the rating slot is always ``None``). The word "Reviewed" is removed from
    the date, review text is stripped and non-ASCII characters are dropped.
    Entries with no review text are omitted.
  """
  folder_path = company_name + '/consumer_affairs'
  extracted_data = []
  # os.listdir() returns entries in arbitrary order; sorting keeps the row
  # order reproducible between runs.
  for file_name in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file_name)
    if not os.path.isfile(file_path):
      # Sub-directories cannot be opened as pages.
      continue
    with open(file_path, 'r', encoding='utf-8') as file:
      file_content = file.read()
    selector = Selector(text=file_content)
    for container in selector.css("#reviews-container .rvw__dtls"):
      dates = container.css(".rvw__rvd-dt::text").extract()
      paragraphs = container.css(".rvw__bd p::text").extract()
      if len(dates) == 1:
        # Single date: treat every paragraph in this element as one review.
        fragments = [part.strip() for part in paragraphs]
        pairs = [(dates[0], " ".join(part for part in fragments if part))]
      else:
        # Several (or no) dates: keep the positional date/paragraph pairing.
        pairs = zip(dates, paragraphs)
      for date, review in pairs:
        review_text = review.strip().encode('ascii', 'ignore').decode('ascii')
        if not review_text:
          # A row without text carries nothing for the sentiment step.
          continue
        extracted_data.append([date.replace("Reviewed", "").strip(), review_text, None, company_name, "consumer_affairs"])
  return extracted_data

In [None]:
# ConsumerAffairs rows parsed from the saved pages under dominos/consumer_affairs
dominos_reviews_ca = extract_consumer_affair(company_name="dominos")

In [None]:
# ConsumerAffairs rows parsed from the saved pages under papa_johns/consumer_affairs
papa_johns_reviews_ca = extract_consumer_affair(company_name="papa_johns")

# **Save CSV Data**

In [None]:
def save_csv(extracted_data, company_name):
  """Write review rows to ``<company_name>_reviews.csv`` in the working directory.

  Args:
    extracted_data: Iterable of rows, each ordered as
      published_date, review, rating, company, source.
    company_name: Prefix of the output file name.

  An existing file with the same name is overwritten.
  """
  column_names = ['published_date', 'review', 'rating', 'company', 'source']
  target_path = company_name + '_reviews.csv'
  with open(target_path, mode='w', newline='', encoding='utf-8') as out_file:
    writer = csv.writer(out_file)
    # Header first, then one line per review
    writer.writerow(column_names)
    for record in extracted_data:
      writer.writerow(record)

In [None]:
# Writes dominos_tp_reviews.csv
save_csv(extracted_data=dominos_reviews_tp, company_name="dominos_tp")

csv written


In [None]:
# Writes papa_johns_tp_reviews.csv
save_csv(extracted_data=papa_johns_reviews_tp, company_name="papa_johns_tp")

csv written


In [None]:
# Writes dominos_ca_reviews.csv
save_csv(extracted_data=dominos_reviews_ca, company_name="dominos_ca")

csv written


In [None]:
# Writes papa_johns_ca_reviews.csv
save_csv(extracted_data=papa_johns_reviews_ca, company_name="papa_johns_ca")

csv written


# **Sentiment Analysis**

In [None]:
# Tab-separated NRC word/emotion lexicon, loaded into three named columns.
# NOTE(review): skiprows=45 presumably skips the preamble of this file version - confirm.
nrc_lexicon = pd.read_csv(
  'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
  sep='\t',
  skiprows=45,
  names=['word', 'emotion', 'association'],
)

In [None]:
def word_cloud(review):
  """Return the sorted, distinct NRC-lexicon words that occur in a review.

  The review is tokenized, English stopwords and tokens shorter than three
  characters are removed, the rest is lower-cased and lemmatized, and the
  result is looked up in the ``word`` column of the module-level
  ``nrc_lexicon`` DataFrame.

  Args:
    review: Review text. A non-string value (for example the NaN that pandas
      produces for an empty CSV cell) is treated as "no text".

  Returns:
    Sorted list of the distinct lexicon words found in the review; an empty
    list when there is no text or no word matches.
  """
  if not isinstance(review, str):
    # word_tokenize() only accepts strings; a missing review has no words.
    return []

  tokens = word_tokenize(review)

  # Remove stopwords and very short tokens, lower-casing what is kept
  stop_words = set(stopwords.words('english'))
  tokens = [word.lower() for word in tokens if word.lower() not in stop_words and len(word) >= 3]

  # Lemmatize
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(word) for word in tokens]

  # Keep the lexicon entries whose word appears in the review. A word can
  # have several lexicon rows (one per emotion), hence unique().
  matched_words = nrc_lexicon.loc[nrc_lexicon['word'].isin(tokens), 'word']
  return sorted(matched_words.unique())

In [None]:
def sentimental_analysis(csv_files):
  """Run TextBlob sentiment analysis over review CSVs and save one combined CSV.

  Each input file is read with pandas and must provide the columns
  ``published_date``, ``review``, ``rating``, ``company`` and ``source``.
  For every row the polarity, subjectivity, POS tags, noun phrases and words
  reported by TextBlob are stored together with the lexicon words returned by
  ``word_cloud``. The combined result is written to
  ``sentiment_analysis_results.csv`` in the working directory.

  Args:
    csv_files: Iterable of paths to review CSV files.

  Returns:
    None.
  """
  sentiments = []
  # NLTK data packages, downloaded up front.
  # NOTE(review): presumably these back the tokenizer, stop-word list,
  # lemmatizer and TextBlob's tagger / noun-phrase extractor - confirm.
  for package in ('punkt', 'averaged_perceptron_tagger', 'brown', 'stopwords', 'wordnet'):
    nltk.download(package)
  for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    # Perform sentiment analysis on each row
    for index, row in df.iterrows():
      raw_review = row['review']
      # An empty review cell is read back as NaN. Use an empty string so the
      # literal text "nan" is not analysed and word_cloud() always gets a str.
      text = "" if pd.isna(raw_review) else str(raw_review)
      blob = TextBlob(text)
      sentiment_polarity = blob.sentiment.polarity
      sentiment_subjectivity = blob.sentiment.subjectivity
      pos_tags = blob.tags
      nouns = ', '.join(blob.noun_phrases)
      sentence_words = ', '.join(blob.words)

      cleaned_text = word_cloud(text)
      sentiments.append({'published_date': row['published_date'], 'review': row['review'], 'rating': row['rating'], 'company': row['company'],
                         'source': row['source'], 'Text': text, 'Sentiment Polarity': sentiment_polarity,
                         'Sentiment Subjectivity': sentiment_subjectivity, 'pos_tags': pos_tags, 'nouns': nouns, 'sentence_words': sentence_words,
                         'cleaned_text': cleaned_text})

  # Create a new DataFrame with sentiment analysis results
  sentiments_df = pd.DataFrame(sentiments)

  # Save the results to a new CSV file
  output_csv_path = 'sentiment_analysis_results.csv'
  sentiments_df.to_csv(output_csv_path, encoding='utf-8')

In [None]:
# Analyse all four review files; the result lands in sentiment_analysis_results.csv
sentimental_analysis(csv_files=[
  "papa_johns_tp_reviews.csv",
  "dominos_ca_reviews.csv",
  "dominos_tp_reviews.csv",
  "papa_johns_ca_reviews.csv",
])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
