# Data Collection

In [46]:
# external libraries
# Please use ' pip install requirements.txt ' on terminal. (Creating a venv is prefered)

import requests
from bs4 import BeautifulSoup
import pandas as pd
from textblob import TextBlob
from symspellpy.symspellpy import SymSpell
import emoji


In [47]:
# Parsing "https://www.trustpilot.com/review/tesla.com", UNSTRUCTURED DATA

URL1 = "https://www.trustpilot.com/review/tesla.com"
CONTENT_CLASS_1 = "CDS_Typography_appearance-default__96c1da CDS_Typography_prettyStyle__96c1da CDS_Typography_body-l__96c1da"

# Comment Section of website contains 27 pages. Using for loop to access each of them.

for i in range(1, 28):
    res1 = requests.get(f"{URL1}?page={i}")

    soup1 = BeautifulSoup(res1.content, 'html.parser')

    content = soup1.find_all('p', class_=CONTENT_CLASS_1)

    if content:
        with open('content1.txt', 'a', encoding='utf-8') as m:
            for para in content:
                m.write(para.text.strip() + '\n\n')
    else:
        print("content not found")

In [48]:
# Converting raw txt file to dataframe for data preprocessing.

with open('content1.txt', 'r', encoding='utf-8') as n:
    Reviews = n.read().strip().split('\n\n')

data = pd.DataFrame(Reviews, columns=['Reviews'])
data

Unnamed: 0,Reviews
0,Absolutely unacceptable customer service from ...
1,I have 2 Powerwalls series 2. they worked well...
2,Over all happy with the car. the ghost braking...
3,I chose Tesla after a random test drive and I ...
4,Denmark
...,...
658,Denmark
659,"1,706 total"
660,Best CEO. Best company. Best products.
661,"horrible, horrible customer service! I ordered..."


In [49]:
# Removing unncessesy and repeating columns.

unwanted = ['Denmark', '1,703 total', '1,706 total']
pattern = '|'.join(unwanted)

raw_data = data[~data['Reviews'].str.contains(pattern)].reset_index(drop=True)
raw_data

Unnamed: 0,Reviews
0,Absolutely unacceptable customer service from ...
1,I have 2 Powerwalls series 2. they worked well...
2,Over all happy with the car. the ghost braking...
3,I chose Tesla after a random test drive and I ...
4,Avoid Tesla Energy — Extremely Unprofessional ...
...,...
599,Over all happy with the car. the ghost braking...
600,I chose Tesla after a random test drive and I ...
601,Best CEO. Best company. Best products.
602,"horrible, horrible customer service! I ordered..."


# Data Preprocessing

#### Basic cleaning

In [50]:
# Basic cleaning includes removal of urls, emojies, spelling corrections.

# Spelling correction: Using textblob library

# TextBlob took 15m 41.2s to complete task. Too slow for larger data sets.
# def spell_check(column):
#     text = TextBlob(column)
#     return str(text.correct())

# raw_data['Reviews'] = raw_data['Reviews'].apply(spell_check)

text = SymSpell()
text.load_dictionary("frequency_dictionary_en_82_765.txt", 0, 1)

def spell_check(inp):
    row = str(inp) if inp is not None else ""
    suggetions = text.lookup_compound(row, max_edit_distance=2)
    return str(suggetions[0].term) if suggetions else row

# removing emoji.

def remove_emoji(inp):
    return emoji.demojize(inp)


raw_data['Reviews'] = raw_data['Reviews'].apply(spell_check, remove_emoji)


  raw_data['Reviews'] = raw_data['Reviews'].apply(spell_check, remove_emoji)


In [51]:
raw_data

Unnamed: 0,Reviews
0,absolutely unacceptable customer service from ...
1,i have a power walls series a they worked well...
2,over all happy with they car they ghost brakin...
3,i chose tesla after a random test drive and i ...
4,avoid tesla energy extremely unprofessional an...
...,...
599,over all happy with they car they ghost brakin...
600,i chose tesla after a random test drive and i ...
601,best co best company best products
602,horrible horrible customer service i ordered m...


#### Basic preprocessing.