# Data Acquisition

In [1]:
import urllib.request
import os

# Make sure the local data folder exists (a no-op when it is already there).
os.makedirs('./data', exist_ok=True)

# Download the Kaggle movie metadata CSV from the URL that hosts it,
# unless a local copy is already present.
kaggle_url = 'https://github.com/sundeepblue/movie_rating_prediction/raw/master/movie_metadata.csv'
kaggle_path = './data/kaggle_dataset.csv'

# The guard has to test the same relative path that urlretrieve writes to.
# Testing the absolute path '/data/kaggle_dataset.csv' never matched the saved
# file, so the CSV was downloaded again on every run.
if not os.path.exists(kaggle_path):
    data = urllib.request.urlretrieve(kaggle_url, kaggle_path)

 

In [2]:
import gzip

import shutil

# Fetch IMDB's plain-text lists, which are distributed as gzip (.gz) archives,
# and unpack each one next to its archive in ./data.
imdb_url_prefix = 'https://ftp.funet.fi/pub/mirrors/ftp.imdb.com/pub/frozendata/'
imdb_files_list = ['genres.list.gz', 'ratings.list.gz']

for name in imdb_files_list:
    gz_path = './data/' + name
    list_path = gz_path[:-3]  # same path without the trailing '.gz'

    # Step 1: download the archive only when it is not on disk yet.
    if not os.path.exists(gz_path):
        data = urllib.request.urlretrieve(imdb_url_prefix + name, gz_path)
        # Original author's note: without this cleanup urllib failed to
        # download the second file from this source.
        urllib.request.urlcleanup()

    # Step 2: unpack independently of step 1, so that an archive which was
    # downloaded earlier but never extracted still gets extracted on re-run.
    if not os.path.exists(list_path):
        # Both sides are opened in binary mode; copyfileobj streams the data
        # in chunks instead of holding the whole decompressed file in memory.
        with gzip.open(gz_path, 'rb') as comp_file, open(list_path, 'wb') as reg_file:
            shutil.copyfileobj(comp_file, reg_file)

# NOTE(review): an earlier note in this cell said it had to be run twice because
# both files would not download in one pass - confirm whether that still happens.

In [3]:
imdb_url = 'https://anaconda.org/BigGorilla/datasets/1/download/imdb_dataset.csv'

# Skip the download entirely when the CSV is already on disk.
if not os.path.exists('./data/imdb_dataset.csv'):
    # A browser-like User-Agent is sent because the plain request was
    # rejected with "HTTP error 403: Forbidden" (original author's note).
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = urllib.request.Request(url=imdb_url, headers=headers)

    # Use the response as a context manager so the HTTP connection is closed
    # even if reading fails part-way through.
    with urllib.request.urlopen(req) as response:
        data = response.read()

    # The payload is bytes, so the target file is opened in binary mode.
    with open("./data/imdb_dataset.csv", "wb") as f:
        f.write(data)

# Data Extraction

The “Kaggle 5000 Movie Dataset” is stored in a .csv file which is already structured and ready to use. On the other hand, the “IMDB Plain Text Data” is a collection of semi-structured text files that need to be processed to extract the data. A quick look at the first few lines of each file shows that the files use different formats and have to be handled separately.

**Contents of "ratings.list" file**

In [4]:
# Peek at ratings.list: pull its first 38 lines, then show only lines 28-37,
# because the 28 lines before them are descriptive header text.
with open("./data/ratings.list") as f:
    head = [next(f) for _ in range(38)]
print(*head[28:38], sep='')

      0000000125  1888533   9.2  The Shawshank Redemption (1994)
      0000000125  1289428   9.2  The Godfather (1972)
      0000000124  889607   9.0  The Godfather: Part II (1974)
      0000000124  1864164   9.0  The Dark Knight (2008)
      0000000133  518449   8.9  12 Angry Men (1957)
      0000000133  971107   8.9  Schindler's List (1993)
      0000000123  1477112   8.9  Pulp Fiction (1994)
      0000000124  1349449   8.9  The Lord of the Rings: The Return of the King (2003)
      0000000123  559468   8.8  Il buono, il brutto, il cattivo (1966)
      0000000133  1513600   8.8  Fight Club (1999)



**Contents of "genres.list" file**

In [5]:
# Peek at genres.list: the first 382 lines are a descriptive header, so read
# 392 lines in total and print just the final ten of them.
with open('./data/genres.list') as f:
    head = [next(f) for _ in range(392)]
print(''.join(head[-10:]))  # head holds exactly 392 lines here, i.e. lines 382-391


"!Next?" (1994)						Documentary
"#1 Single" (2006)					Reality-TV
"#15SecondScare" (2015)					Horror
"#15SecondScare" (2015)					Short
"#15SecondScare" (2015)					Thriller
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Drama
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Horror
"#15SecondScare" (2015) {Who Wants to Play with the Rabbit? (#1.2)}	Short



**Extracting info from "genres.list"**

In [10]:
import re
import pandas as pd

# Parse genres.list into a DataFrame with one row per (movie, year, genre).
with open("./data/genres.list") as genres_file:
    raw_content = genres_file.readlines()

# Compiled once, outside the loop. The three capture groups are:
#   1. the title, with an optional pair of surrounding double quotes left out
#   2. the four characters inside the parentheses, each a digit or '?'
#      (an optional '/suffix' after them is matched but not captured)
#   3. the last whitespace-separated token, made of word characters or '-'
genre_line_re = re.compile(r'"?(.*[^"])"? \(((?:\d|\?){4})(?:/\w*)?\).*\s((?:\w|-)+)')

content = raw_content[384:]  # data rows start after the first 384 lines
genres_list = []             # one [movie, year, genre] list per parsed row
unparsed_lines = []          # non-blank rows the pattern could not match

for line in content:
    m = genre_line_re.match(line.strip())
    if m is None:
        # A failed match used to crash with AttributeError on m.group();
        # keep the offending row for inspection and move on instead.
        if line.strip():
            unparsed_lines.append(line)
        continue
    genres_list.append([m.group(1), m.group(2), m.group(3)])

if unparsed_lines:
    print('Skipped {} line(s) that did not match the expected format'.format(len(unparsed_lines)))

genres_data = pd.DataFrame(genres_list, columns=['movie', 'year', 'genre'])

In [12]:
# Display the parsed table; the bare expression is rendered as the cell output.
genres_data

Unnamed: 0,movie,year,genre
0,!Next?,1994,Documentary
1,#1 Single,2006,Reality-TV
2,#15SecondScare,2015,Horror
3,#15SecondScare,2015,Short
4,#15SecondScare,2015,Thriller
...,...,...,...
2658936,überRICH,2017,Comedy
2658937,überRICH,2017,Short
2658938,üç,2012,Adventure
2658939,üç,2012,Comedy
