In [1]:
import requests
from bs4 import BeautifulSoup
#importing beautiful soap for scrapping the data

## Dependencies

`BBeautifulSoup` :    pip install BeautifulSoup

In [2]:

import pandas as pd
import numpy as np
import itertools #to create efficent looping to fetch more data in a go
import re 
import random 

### Creating BS4 Functions for scrapping

  Movies are cateogirsed into seven and each category is processed by indivitual team members.
- category 1: [1940 to 1980 200 movie listing with rating=10000](https://www.imdb.com/search/title/?title_type=feature&release_date=1940-01-01,1980-12-31&num_votes=10000,&count=200) 
- category 2: [2020 to 2021 200 movie listing with rating=20000](https://www.imdb.com/search/title/?title_type=feature&release_date=2020-01-01,2021-12-31&num_votes=20000,&count=200) 
- category 3: [2000 to 2021 200 movie listing with rating=60000](https://www.imdb.com/search/title/?title_type=feature&release_date=2000-01-01,2021-12-31&num_votes=60000,&count=200) 
- category 4: [1940 to 1980 200 movie listing with rating=10000](https://www.imdb.com/search/title/?title_type=feature&release_date=2005-01-01,2015-12-31&num_votes=30000,&count=200) 
- category 5: [1980 to 2019 200 movie listing with rating=500000](https://www.imdb.com/search/title/?title_type=feature&release_date=1980-01-01,2019-12-31&num_votes=500000,&count=200) 
- category 6: [1980 to 2019 200 movie listing with rating=80000](https://www.imdb.com/search/title/?title_type=tv_series&release_date=1980-01-01,2019-12-31&num_votes=80000,&count=200) 
- category 7: [2005 to 2010 200 movie listing with rating=5000](https://www.imdb.com/search/title/?title_type=feature&release_date=2005-01-01,2010-12-31&num_votes=5000,&count=200) 

#Sample files are put into Data_scrapped folder.

In [3]:
url = "https://www.imdb.com/search/title/?title_type=feature&release_date=2009-01-01,2011-12-31&num_votes=2000,&count=200" #2000 - 2021 6000votes filter 200 titles (imdb not letting to filter >200 titles in a go)

def getSoup(url):
    """
    Utility function this get soup function will fetch the above url which stored in url var.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

def getReviews(soup):
    '''Function returns all reviews including postive and negative..'''
    
    # get a list of user ratings
    user_review_ratings = [tag.previous_element for tag in 
                           soup.find_all('span', attrs={'class': 'point-scale'})]      
                             #can search div by inspect elementor
    
    
    # get the review tags
    user_review_list = soup.find_all('a', attrs={'class':'title'})
    ans = []
    for i in range(5):
        ans.append(user_review_list[random.randint(0, len(user_review_list) -1)])
    links = ["https://www.imdb.com" + tag['href'] for tag in ans]
    return links


def getReviewText(review_url):
    '''Returns the user review text given the review url.'''
    # get the review_url's soup
    soup = getSoup(review_url)
    # find div tags with class text show-more__control
    tag = soup.find('div', attrs={'class': 'text show-more__control'})
    return tag.getText()

def getMovieTitle(review_url):
    '''Returns the movie title from the review url.'''
    # get the review_url's soup
    soup = getSoup(review_url)
    # find h1 tag
    tag = soup.find('h1')
    return list(tag.children)[1].getText()

def getNounChunks(user_review):
    # create the doc object
    doc = nlp(user_review)
    # get a list of noun_chunks
    noun_chunks = list(doc.noun_chunks)
    # convert noun_chunks from span objects to strings, otherwise it won't pick
    noun_chunks_strlist = [chunk.text for chunk in noun_chunks]
    return noun_chunks_strlist
movies_soup = getSoup(url)

## Filtering the movie tags

In [8]:
movie_tags = movies_soup.find_all('a', attrs={'class': None})

# filter the a-tags to get just the titles
movie_tags = [tag.attrs['href'] for tag in movie_tags 
              if tag.attrs['href'].startswith('/title') & tag.attrs['href'].endswith('/')]

# remove duplicate links
movie_tags = list(dict.fromkeys(movie_tags))

print("There are a total of " + str(len(movie_tags)) + " movie titles")
print("Displaying 10 titles")
movie_tags[:2]

There are a total of 200 movie titles
Displaying 10 titles


['/title/tt1262416/', '/title/tt0491152/']

## Filtering the movie URL's

In [7]:

base_url = "https://www.imdb.com"
movie_links = [base_url + tag + 'reviews' for tag in movie_tags]
print("There are a total of " + str(len(movie_links)) + " movie user reviews")
print("Displaying 20 user reviews links")
movie_links[:2]

There are a total of 200 movie user reviews
Displaying 20 user reviews links


['https://www.imdb.com/title/tt1262416/reviews',
 'https://www.imdb.com/title/tt0491152/reviews']

In [9]:
movie_soups = [getSoup(link) for link in movie_links]

# get all movie review links from the 200 listing
movie_review_list = [getReviews(movie_soup) for movie_soup in movie_soups]


In [10]:
#Checking how many movie review were able to filter.
movie_review_list = list(itertools.chain(*movie_review_list))
print(len(movie_review_list))

print("There are a total of " + str(len(movie_review_list)) + " individual movie reviews")
print("Displaying 10 reviews")
movie_review_list[:5]

1000
There are a total of 1000 individual movie reviews
Displaying 10 reviews


['https://www.imdb.com/review/rw2503689/',
 'https://www.imdb.com/review/rw6051819/',
 'https://www.imdb.com/review/rw2416020/',
 'https://www.imdb.com/review/rw5243143/',
 'https://www.imdb.com/review/rw5243143/']

## Converting into the Pandas Data Frame

In [12]:
review_texts = [getReviewText(url) for url in movie_review_list]

# get movie name from the review link
movie_titles = [getMovieTitle(url) for url in movie_review_list]

# Filtering the dataframe with only User_reviews by avoiding links and title

# construct a dataframe
df = pd.DataFrame({
             'user_review': review_texts })

In [13]:
df.head(5) #displaying the resultant data frame

Unnamed: 0,user_review
0,That wasn't really I question for my review. B...
1,Despite there being no Scream movies for a dec...
2,"The ""Scream"" film franchise carved a memorable..."
3,Fifteen years have passed since the original W...
4,Fifteen years have passed since the original W...


## The data frame need to remove index and filter the limit review length by 250 words

In [14]:
text_list = [m for m in df['user_review']]
#text_list

In [15]:
#calculating the length of the text
text_list_length = [len(m.split()) for m in text_list]     
df['length'] = text_list_length
df

Unnamed: 0,user_review,length
0,That wasn't really I question for my review. B...,208
1,Despite there being no Scream movies for a dec...,337
2,"The ""Scream"" film franchise carved a memorable...",957
3,Fifteen years have passed since the original W...,352
4,Fifteen years have passed since the original W...,352
...,...,...
995,"I watched Film 2009 with Jonathan Ross, and re...",316
996,Kind of a tame CGI fest Hebrew Exorcist regard...,506
997,This is an astonishing story about a vengeful ...,430
998,Well it's now a few years since The Unborn was...,248


In [16]:
df = df[df['length'] < 250]  #limiting the df by 250 in length
df

Unnamed: 0,user_review,length
0,That wasn't really I question for my review. B...,208
5,Rachel (Ginnifer Goodwin) and Darcy (Kate Huds...,169
7,Overall I can see why this movie got bad revie...,135
8,Surprisingly not that bad. Very predictable an...,11
9,23 December 2016. Credit can be given to the p...,135
...,...,...
984,Another Film that plays the Same Scenes again ...,154
985,I would love to hate Richard Gere. He's too go...,243
987,"An Akita Inu puppy, transplanted from a Japane...",232
989,"Based on the true story of a faithful Akita, t...",198


In [17]:
df.drop('length', axis=1, inplace=True)
df
#dropping the len row

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,user_review
0,That wasn't really I question for my review. B...
5,Rachel (Ginnifer Goodwin) and Darcy (Kate Huds...
7,Overall I can see why this movie got bad revie...
8,Surprisingly not that bad. Very predictable an...
9,23 December 2016. Credit can be given to the p...
...,...
984,Another Film that plays the Same Scenes again ...
985,I would love to hate Richard Gere. He's too go...
987,"An Akita Inu puppy, transplanted from a Japane..."
989,"Based on the true story of a faithful Akita, t..."


In [18]:
#converting only reviews to CSV & removing the index
df.to_csv('data_scrapped/data.csv', index=False) 

## Splitting the csv file to the indivitual text files

In [27]:
import csv

with open("data_scrapped/data.csv", "r") as f:
        reader = csv.reader(f)
        rownumber = 2639
        for row in reader:
             g=open(str(rownumber)+".txt","w")
             g.write(str(row))
             rownumber = rownumber + 1
             g.close()

## <------------------------------------------------------EOL----------------------------------------------------------------->

## Final Dataset

Here is the Link to Final Dataset: [Drive Link](https://drive.google.com/file/d/1sTNAeuy-99Hao0V5AOVznLXyDJC2zuFn/view?usp=sharing)