In [1]:
"""
CONFIGURATION:

In this cell we read our data and do all the necessary imports.
"""

import pandas as pd
import numpy as np

# Critic reviews, one row per review (tab-separated file).
# NOTE(review): the displayed frames show an "Unnamed: 0" column, which looks
# like a saved index — consider index_col=0 if nothing relies on it.
reviews = pd.read_csv(
    "reviews.tsv",
    sep="\t",
)

# Per-movie metadata (tab-separated file).
movie_info = pd.read_csv(
    "movie_info.tsv",
    sep="\t",
)

In [2]:
# Preview the first rows of the reviews table.
reviews.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


In [3]:
# Preview the first rows of the movie metadata table.
movie_info.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


In [4]:
"""
PRELIMINARY EXPLORATORY DATA ANALYSIS
"""
# Distinct movie ids that have at least one review
reviews_ids = list(set(reviews['id'].tolist()))

# Keep only the movies that were actually reviewed
movie_info = movie_info.loc[movie_info['id'].isin(reviews_ids)]

# TODO: we can drop every row that has neither a rating nor a review.
# As a first step we can drop every row without a rating;
# we could use text mining on the review to infer the rating, but it is not necessary.
# Count the reviews that have no rating
print("Number of reviews with no rating: ", len(reviews[reviews['rating'].isna()]))

# Count the reviews missing both the rating and the review text
# (possible ternary scale: watched and liked, watched and disliked, watched)
print("Number of reviews with no rating and review:", len(reviews[reviews[['rating', 'review']].isna().all(axis=1)]))

# Keep only the reviews that carry a rating
reviews = reviews.loc[reviews['rating'].notna()]

Number of reviews with no rating:  13517
Number of reviews with no rating and review: 27


In [5]:
# Sanity check: how many reviews remain, and how many of them still lack a rating
print(reviews.shape[0])
print(len(reviews[reviews['rating'].isna()]))

40915
0


In [6]:
"""
TEST Cell
To be removed
"""
# Distinct rating strings present in the data (set ordering is arbitrary)
reviews_ratings = list(set(reviews['rating'].tolist()))
print(reviews_ratings)

# Letter-style ratings: purely alphabetic, or containing a "-" / "+" sign
rating_alpha = list(filter(
    lambda s: s.isalpha() or "-" in s or "+" in s,
    reviews_ratings,
))
# Bare numbers: integers, or decimals that are not part of a fraction
rating_num = list(filter(
    lambda s: s.isdigit() or ("." in s and "/" not in s),
    reviews_ratings,
))
# "value/max" style ratings
rating_outof = list(filter(lambda s: "/" in s, reviews_ratings))

print(rating_alpha)
# Compare the number of distinct ratings with the combined size of the three buckets
print(len(reviews_ratings))
print(sum(len(bucket) for bucket in (rating_alpha, rating_outof, rating_num)))

['4.0/5', '0.5/4', '4.9', '2.6/5', '1/2', '3.1', '2/5', '2.3/4', '3.0', '8.1/10', '4.8', '5.5/10', '2/2', '6.7', '0.5/5', '1.0/4', '4.4/5', '2.2', '1', '7.3', '5.0/10', '8.9/10', 'T', '3.7', '7.2/10', '7.5/10', '7.4', '8', '8.7/10', '3/2', '9.2/10', '6.9/10', '6.2/10', '2.5', '3.0/5', 'A+', '3.5/10', '2.5/10', '1.7', '3.5/4', '1/10', '4.2', '3.6/5', '3.2', '2.7/5', '7.9', '4', '8.3/10', '5.5/5', '7.3/10', '4/10', '6.0/10', '6.2', '5.9/10', '5.8/10', '5/4', '2.4/5', '1.9/5', '6/8', '2.5/5', '3/10', 'A', '6.8/10', '9.6/10', '4.7', '4.3/10', '7.0/10', '3.3', '4.2/5', '5.9', '2/4', '3.5', '7.4/10', 'C+', '4.5/5', '6.5/10', '7/10', '2.2/5', '8.5/10', '4.2/10', '7.7/10', '8.8/10', '1.5', '0/5', 'B+', '8.0/10', '2/6', '1/5', '6/10', '5.8', '2.0/4', '9.5/10', 'B-', '7.8/10', '4.5', '9.8', '4.1', '2/10', '3/6', '3.0/4', '4.0/10', '9.2', '3.3/5', 'C-', '1.5/4', '4.0', '1.5/10', '1/4', '3', '0', '0/4', '9.0', '8.4', '2.5/4', 'B', '1.0/5', '4.0/4', '2.6/6', '3.8/10', '8.2', '7', '2.1/2', '3.4', '2

In [9]:
"""
Converting all ratings to one single scale /10
For any v/m the new rating will be (v * 10)/m
Note: We won't be able to convert single numbers like ['1', '6', ...] since we don't know the corresponding scale.
"""
# Letter grades mapped onto a 0-12 point scale ('F' = 0 ... 'A+' = 12).
grade_letters = {
    'A+': 12, 'A': 11, 'A-': 10,
    'B+': 9, 'B': 8, 'B-': 7,
    'C+': 6, 'C': 5, 'C-': 4,
    'D+': 3, 'D': 2, 'D-': 1,
    'F': 0,
}


def convert_rating(rating):
    """Convert a raw rating to a score out of 10 when its scale is known.

    Args:
        rating: The raw rating value, e.g. '3/5', 'B+' or '4.9'.

    Returns:
        float: ``value * 10 / max`` for a 'value/max' rating without spaces,
        or ``points * 10 / 12`` for a letter grade found in ``grade_letters``.
        Any other input is returned unchanged: bare numbers (unknown scale),
        ratings containing a space, malformed fractions such as '3/' or
        '1/2/3', and fractions whose denominator is zero.
    """
    text = str(rating)

    if '/' in text and ' ' not in text:
        numerator, _, denominator = text.partition('/')
        try:
            value = float(numerator)
            max_value = float(denominator)
        except ValueError:
            # Not a plain "number/number" pair; leave it for manual inspection.
            return rating
        if max_value == 0:
            # A zero-point scale cannot be rescaled; avoid ZeroDivisionError.
            return rating
        return value * 10 / max_value

    if text in grade_letters:
        return grade_letters[text] * 10 / 12

    return rating

# Build a new frame rather than assigning a column into `reviews`: it is a
# filtered selection of the loaded data, and column assignment on such a
# selection can raise pandas' SettingWithCopyWarning.
reviews = reviews.assign(rating=reviews['rating'].apply(convert_rating))
reviews.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,6.0,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
6,3,"Quickly grows repetitive and tiresome, meander...",4.16667,rotten,Eric D. Snider,0,EricDSnider.com,"July 17, 2013"
7,3,Cronenberg is not a director to be daunted by ...,4.0,rotten,Matt Kelemen,0,Las Vegas CityLife,"April 21, 2013"
11,3,"While not one of Cronenberg's stronger films, ...",5.83333,fresh,Emanuel Levy,0,EmanuelLevy.Com,"February 3, 2013"
12,3,Robert Pattinson works mighty hard to make Cos...,5.0,rotten,Christian Toto,0,Big Hollywood,"January 15, 2013"
