# Conexión con BigQuery

In [2]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="E:/Desktop/Python Top Gun Lab/final_project/centering-valve-358203-eb0bab19cd78.json"


In [3]:
from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client()

query = """
    SELECT * 
    FROM `bigquery-public-data.imdb.reviews` as reviews
    INNER JOIN `bigquery-public-data.imdb.title_basics` as titles
    ON reviews.movie_id = titles.tconst
"""
df = client.query(query).to_dataframe()  # Make an API request.


# Análisis de datos
El dataset que se esocogio fue el de IMDB, del cual seleccionamos dos tablas: reviews y title_basics las cuales se relacionan por medio de la columna movie_id y tconst.
Como resultado tenemos un dataset de 94674 filas y 16 columnas previo a su limpieza.
Respecto a cada columna tenemos:
1. Reviews: hace referencia a la review de cierta pelicula/Serie/show/etc realizada por un usuario
2. split: etiquta la fila para pertenecer a un subconjuto de datos: train y test, para posteriormente ser entrenada en un modelo de machine learning
3. label: Etiqueta que le asigna la polaridad de la review realizada(Positiva o negativa)
4. movie_id: caracter alfanumerico que identifica el producto audiovisual
5. reviewer_rating: El rating que le dio la persona que realizó la review 
6. movie_url: url para ver la información de la película en imdb 
7. title: titulo de la pelicula 
8. tconst: caracter alfanumerico que identifica el producto audiovisual
9. title_type: tipo de producto audiovisual: serie, pelicula, short, tvshow, etc
10. primary_title: El titulo mas popular.
11. original_title: El titulo original en el idioma original
12. is_adult: Si la pellicula es para adultos o no
13. start_year: Año de lanzamiento
14. end_year: año en que termino en caso de ser una serie o show
15. runtime_minutes: tiempo de duración
16. genres: Generos a losque pertenece la pelicula 
 

In [4]:
#Los tipos de datos se encuentran a continuacion donde object hace referencia a string, int a entero
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94674 entries, 0 to 94673
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   review           94674 non-null  object
 1   split            94674 non-null  object
 2   label            94674 non-null  object
 3   movie_id         94674 non-null  object
 4   reviewer_rating  47285 non-null  Int64 
 5   movie_url        94674 non-null  object
 6   title            94673 non-null  object
 7   tconst           94674 non-null  object
 8   title_type       94674 non-null  object
 9   primary_title    94674 non-null  object
 10  original_title   94674 non-null  object
 11  is_adult         94674 non-null  Int64 
 12  start_year       94669 non-null  Int64 
 13  end_year         4834 non-null   Int64 
 14  runtime_minutes  93779 non-null  Int64 
 15  genres           94628 non-null  object
dtypes: Int64(5), object(11)
memory usage: 12.0+ MB


In [7]:
#diferentes tipos de formatos audiovisuales, solo nos interesan las peliculas
df['title_type'].value_counts()

movie           74754
tvMovie          5788
video            4835
tvSeries         4104
short            1675
tvMiniSeries     1502
tvEpisode        1241
videoGame         385
tvSpecial         334
tvShort            56
Name: title_type, dtype: int64

In [5]:
#Dimensión del dataset luego de solo elegir las peliculas
df_movies = df.loc[df['title_type'] == 'movie'].reset_index(drop=True)
df_movies.shape

(74754, 16)

In [6]:
#Columnas a eliminar: tconst, movie_url, title_type, end_year
df_movies.head()

Unnamed: 0,review,split,label,movie_id,reviewer_rating,movie_url,title,tconst,title_type,primary_title,original_title,is_adult,start_year,end_year,runtime_minutes,genres
0,"I don't know where to begin, so I'll begin wit...",train,Negative,tt0331834,4,http://www.imdb.com/title/tt0331834/,Alive,tt0331834,movie,Alive,Alive,0,2002,,119,"Action,Drama,Horror"
1,"I'll keep this short, as I know I don't need t...",train,Negative,tt0331834,3,http://www.imdb.com/title/tt0331834/,Alive,tt0331834,movie,Alive,Alive,0,2002,,119,"Action,Drama,Horror"
2,Having seen Versus previously I had high hopes...,train,Negative,tt0331834,2,http://www.imdb.com/title/tt0331834/,Alive,tt0331834,movie,Alive,Alive,0,2002,,119,"Action,Drama,Horror"
3,I really tried to like this movie but in the e...,train,Negative,tt0331834,4,http://www.imdb.com/title/tt0331834/,Alive,tt0331834,movie,Alive,Alive,0,2002,,119,"Action,Drama,Horror"
4,I've seen most of Ryuhei Kitamura's work and I...,train,Negative,tt0331834,4,http://www.imdb.com/title/tt0331834/,Alive,tt0331834,movie,Alive,Alive,0,2002,,119,"Action,Drama,Horror"


In [7]:
df_movies.drop(['tconst', 'movie_url', 'title_type', 'end_year', 'title', 'original_title'], axis=1, inplace=True)
df_movies.head()

Unnamed: 0,review,split,label,movie_id,reviewer_rating,primary_title,is_adult,start_year,runtime_minutes,genres
0,"I don't know where to begin, so I'll begin wit...",train,Negative,tt0331834,4,Alive,0,2002,119,"Action,Drama,Horror"
1,"I'll keep this short, as I know I don't need t...",train,Negative,tt0331834,3,Alive,0,2002,119,"Action,Drama,Horror"
2,Having seen Versus previously I had high hopes...,train,Negative,tt0331834,2,Alive,0,2002,119,"Action,Drama,Horror"
3,I really tried to like this movie but in the e...,train,Negative,tt0331834,4,Alive,0,2002,119,"Action,Drama,Horror"
4,I've seen most of Ryuhei Kitamura's work and I...,train,Negative,tt0331834,4,Alive,0,2002,119,"Action,Drama,Horror"


In [9]:
df_movies['label'].value_counts()
    

Unsupervised    37883
Positive        18710
Negative        18161
Name: label, dtype: int64

In [32]:
df_movies['split'].value_counts()

train    56666
test     18088
Name: split, dtype: int64

# Machine Learning model

In [1]:
import pandas as pd
df_movies = pd.read_csv('clean_imdb.csv')

In [2]:
data_model = df_movies[['review','label','split']]

In [13]:
data_model.head()

Unnamed: 0,review,label,split
0,"I don't know where to begin, so I'll begin wit...",Negative,train
1,"I'll keep this short, as I know I don't need t...",Negative,train
2,Having seen Versus previously I had high hopes...,Negative,train
3,I really tried to like this movie but in the e...,Negative,train
4,I've seen most of Ryuhei Kitamura's work and I...,Negative,train


In [15]:
data_model['label'].value_counts()

Unsupervised    37883
Positive        18710
Negative        18161
Name: label, dtype: int64

In [3]:
from MLModelPreprocess import DataPreprocess

[nltk_data] Downloading package stopwords to C:\Users\Pablo
[nltk_data]     Tamayo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
preprocess = DataPreprocess(data_model)

clean_text = preprocess.convert_data()

In [5]:
clean_text[1]

'keep this short know don need say much aliv strang littl film that obvious appeal some but found shock bland from almost the veri begin the film did veri littl make ani the charact likabl and the stori time becam convolut that complet lost interest said know other enjoy but found kitamura aliv anyth but lame extrem bore drama disguis thought provok action sci flick felt like was sucker into watch this film base it intrigu premis and uber excit cover art suggest pass for kitamura far more enjoy freshman effort versus his riot godzilla final war and don get wrong alway for good think man film but this certain wasn there was nari moment that actual care about singl event take place this over preachi dialogu heavi movi you wanna talk about someth talk about bore'

In [6]:
vectorizer, text_features = preprocess.vectroize_data(clean_text)

In [10]:
#division del dataset en entrenamiento y prueba: El dataset por defecto traia el mismo numero de train y test en la 
#columna split, por lo que ignoraremos esa notación y crearemos nuestro propio conjunto de entrenamiento y prueba
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(text_features, data_model['label'], test_size=0.2, random_state=0)

In [11]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [14]:
predictions = nb.predict(X_train)

# Ahora calculamos el score de accuracy enviando las predicciónes y los valores reales de polaridad.
print(accuracy_score(y_train, predictions))


0.5048241727003662
