In [1]:
import re
import time
import requests
import numpy as np
import pandas as pd
from pprint import pprint
from textblob import TextBlob
from nltk.corpus import stopwords
from scrapy.http import TextResponse
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Listing page of reviews that serves as the starting point for scraping.
url = "https://www.rogerebert.com/reviews"

In [3]:
errors = []  # URLs of listing pages that could not be scraped


def review_info_scraper(url):
    """Scrape the review listing page at ``url``.

    Parameters
    ----------
    url : str
        Address of the review listing page to download.

    Returns
    -------
    dict
        Keys ``"title"``, ``"author"``, ``"year"``, ``"rating"`` and
        ``"link"``, each mapping to a list of values extracted from the page.
        If the download or the parsing fails, every list is empty and ``url``
        is appended to the module-level ``errors`` list.
    """
    # Start from an empty result so a failure still returns a usable dict
    # instead of raising UnboundLocalError at the return statement.
    movie_info = {"title": [], "author": [], "year": [], "rating": [], "link": []}
    try:
        # The request lives inside the try so network failures are recorded too;
        # the timeout keeps a stalled connection from hanging the notebook.
        page = requests.get(url, timeout=30)
        page.raise_for_status()
        response = TextResponse(url=page.url, body=page.text, encoding="utf-8")

        title = response.css("h5.title > a::text").extract()
        author = response.css("figure[class='movie review'] > p.byline::text").extract()
        # Keep only the digits of the release-year text.
        year = response.css("span.release-year::text").re(r"\d+")
        # Rating = occurrences of "full" plus half the occurrences of "half"
        # in the markup of each star-rating span.
        rating = [
            markup.count("full") + 0.5 * markup.count("half")
            for markup in response.css("span.star-rating").extract()
        ]
        # The hrefs are prefixed with the site root to build full URLs.
        link = [
            "https://www.rogerebert.com" + href
            for href in response.css("h5.title > a::attr(href)").extract()
        ]
        movie_info = {"title": title, "author": author, "year": year, "rating": rating, "link": link}
    except Exception:
        errors.append(url)
    return movie_info

In [4]:
# Scrape the listing page; the result is a dict of parallel lists
# (title, author, year, rating, link).
movie_data = review_info_scraper(url)

In [20]:
errors  # URLs that failed to scrape; an empty list means there were no errors

[]

In [6]:
def get_desc(movie_url):
    """Download one review page and return its body text as a single string.

    Parameters
    ----------
    movie_url : str
        Address of the individual review page.

    Returns
    -------
    str
        All text nodes found under the paragraphs of the
        ``div[itemprop='reviewBody']`` element, joined with single spaces.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or the server answers with an HTTP
        error status. Callers that wrap this function in ``try``/``except``
        can then record the failing URL instead of storing an empty review.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    page = requests.get(movie_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a review.
    page.raise_for_status()
    response = TextResponse(url=page.url, body=page.text, encoding="utf-8")
    desc_list = response.css("div[itemprop='reviewBody'] > p ::text").extract()
    return " ".join(desc_list)

In [7]:
# Download the text of every review found on the listing page.
all_descriptions = []
error_list = []  # review URLs whose download or parsing failed
for review_url in movie_data["link"]:
    try:
        all_descriptions.append(get_desc(review_url))
    except Exception:
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during this slow loop.
        error_list.append(review_url)
        # Keep one entry per link so the descriptions stay aligned with
        # movie_data["title"] when the two are combined later.
        all_descriptions.append("")
    time.sleep(2)  # pause between requests to avoid hammering the site

In [8]:
# Clean the textual data

# Typical steps: lowercase, singularize, remove stopwords, remove punctuation, lemmatize, stem

In [9]:
# English stopword list from NLTK, used below to filter the review words.
sw = stopwords.words("english")

In [10]:
# Loop version of the cleaning step: lowercase every review, then keep only
# alphanumeric words that are not stopwords.
all_desc = []
for description in all_descriptions:
    kept_words = [
        word
        for word in TextBlob(description).words.lower()
        if word not in sw and word.isalnum()
    ]
    # Join with a space: plain concatenation would glue all the words of a
    # review into one unusable token.
    all_desc.append(" ".join(kept_words))

In [11]:
# One cleaned string per review: lowercased words, with stopwords and
# non-alphanumeric tokens removed, joined back together with spaces.
all_desc_clean = [
    " ".join(
        [
            token
            for token in TextBlob(review_text).words.lower()
            if token not in sw and token.isalnum()
        ]
    )
    for review_text in all_descriptions
]

In [12]:
# Learn the vocabulary from the cleaned reviews and turn them into a TF-IDF
# matrix in a single step.
tf_idf = TfidfVectorizer()
tfidf_matrix = tf_idf.fit_transform(all_desc_clean)

In [13]:
# Column labels: the vocabulary learned by the vectorizer.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on old installations.
if hasattr(tf_idf, "get_feature_names_out"):
    words = list(tf_idf.get_feature_names_out())
else:
    words = tf_idf.get_feature_names()
# Dense copy of the TF-IDF matrix, wrapped in a DataFrame with one column per word.
data = tfidf_matrix.toarray()
df = pd.DataFrame(data, columns=words)

In [21]:
df.shape  # rows are reviews, columns are words, values are the TF-IDF weight of each word

(24, 5216)

In [15]:
# Preview the first rows of the TF-IDF table.
df.head()

Unnamed: 0,100,106,10s,11,15,16,16mm,17,18,1950s,...,zealots,zealous,zeros,zhao,zhou,zimbardo,zingers,zip,zoe,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051184,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091871,...,0.0,0.034603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.034951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Group the reviews into 3 clusters.
# random_state makes the cluster assignment reproducible across runs;
# n_init is pinned because its default differs between scikit-learn versions.
algo = KMeans(n_clusters=3, n_init=10, random_state=42)

In [17]:
# Fit on the raw TF-IDF array instead of `df`: once the "cluster" and "names"
# columns have been added, re-running this cell on `df` would feed them back
# into the clustering (and fail on the string column).
df["cluster"] = algo.fit_predict(data)

In [18]:
# Attach the movie titles so the clusters can be read by name.
# NOTE(review): assumes one row in `df` per title, in the same order — confirm
# that no review was dropped during scraping.
# NOTE(review): if "names" (or "cluster") is also a word in the vocabulary,
# this assignment overwrites that feature column — verify.
df["names"] = movie_data["title"]

In [19]:
# Titles of the movies that were assigned to cluster 1.
df.loc[df["cluster"] == 1, "names"]

1            The Mountain
2    Mike Wallace Is Here
Name: names, dtype: object