In [1]:
#Add Dependencies
import pandas as pd
import psycopg2 as pg
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

### Connection to the Postgres database and Retrive data from the tables

In [2]:
#Read Data from the PostgressSql
conn = pg.connect(user = "postgres",
                                  password = "akshaj10",
                                  host = "127.0.0.1",
                                  port = "5432",
                                  database = "movie_data")
sql1 = "Select * From movies;"
sql2 = "Select * From sherlock_ratings;"
movie_df = pd.read_sql_query(sql1, conn)
rating_df = pd.read_sql_query(sql2, conn)
conn = None


In [3]:
#Load Data into Pandas Data frame and Display the dataframe.
movie_df.shape

(6051, 37)

#### Data Clean Up ,Preprocessing and Filtering

In [4]:
# Drop null rows
movie_df = movie_df.dropna(axis = 0,how= 'any')
movie_df.shape

(237, 37)

In [5]:
# Filter all rows for US as the country
 
movie_us_df = movie_df[movie_df['country'] == 'United States'] 
movie_us_df.head(5)

Unnamed: 0,index,wikipedia_url,year,imdb_link,based_on,starring,cinematography,country,director,distributor,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
92,90,https://en.wikipedia.org/wiki/The_Hunt_for_Red...,1990,https://www.imdb.com/title/tt0099810/,"{""The Hunt for Red October"",by,""Tom Clancy""}","{""Sean Connery"",""Alec Baldwin"",""Scott Glenn"",""...",Jan de Bont,United States,John McTiernan,Paramount Pictures,...,1990-03-02,199200000.0,134.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Invisible. Silent. Stolen.,The Hunt for Red October,False,7.2,971.0
662,653,https://en.wikipedia.org/wiki/Jurassic_Park_(f...,1993,https://www.imdb.com/title/tt0107290/,"{""Jurassic Park"",""by Michael Crichton""}","{""Sam Neill"",""Laura Dern"",""Jeff Goldblum"",""Ric...",Dean Cundey,United States,Steven Spielberg,Universal Pictures,...,1993-06-11,920100000.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,An adventure 65 million years in the making.,Jurassic Park,False,7.6,4956.0
1324,1309,https://en.wikipedia.org/wiki/Escape_from_L.A.,1996,https://www.imdb.com/title/tt0116225/,"{""Characters created by"",""John Carpenter"",""Nic...","{""Kurt Russell"",""Stacy Keach"",""Steve Buscemi"",...",Gary B. Kibbe,United States,John Carpenter,Paramount Pictures,...,1996-08-09,42277365.0,97.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Snake Is Back.,Escape from L.A.,False,5.6,381.0
1425,1415,https://en.wikipedia.org/wiki/Mission:_Impossi...,1996,https://www.imdb.com/title/tt0117060/,"{""Mission: Impossible"",by,""Bruce Geller""}","{""Tom Cruise"",""Jon Voight"",""Henry Czerny"",""Emm...",Stephen H. Burum,United States,Brian De Palma,Paramount Pictures,...,1996-05-22,457696359.0,110.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Expect the Impossible.,Mission: Impossible,False,6.7,2677.0
1481,1472,https://en.wikipedia.org/wiki/Star_Trek:_First...,1996,https://www.imdb.com/title/tt0117731/,"{""Star Trek"",by,""Gene Roddenberry""}","{""Patrick Stewart"",""Jonathan Frakes"",""Brent Sp...",Matthew F. Leonetti,United States,Jonathan Frakes,Paramount Pictures,...,1996-11-21,150000000.0,111.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Resistance is futile.,Star Trek: First Contact,False,7.0,671.0


In [6]:
# Generate our categorical variable list for movie dataset
movie_cat = movie_us_df.dtypes[movie_us_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
movie_us_df[movie_cat].nunique()

wikipedia_url            186
imdb_link                186
based_on                 141
starring                 185
cinematography           115
country                    1
director                 144
distributor               43
editors                  158
composers                 93
producers                147
writers                  164
imdb_id                  186
belongs_to_collection    115
genres                   114
homepage                 170
original_language          1
original_title           186
overview                 186
poster_path              186
production_companies     172
production_countries      25
spoken_languages          48
status                     1
tagline                  184
title                    186
dtype: int64

In [7]:
#Data Clean rating dataset
rating_df = rating_df.dropna(axis = 0,how= 'any')
rating_df.shape

(486, 42)

In [8]:
# Filter all rows for US as the country
rating_us_df = rating_df[rating_df['country'] == 'United States'] 
rating_us_df.shape

(386, 42)

In [9]:
# Generate our categorical variable list for movie dataset
rating_cat = rating_us_df.dtypes[rating_us_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
rating_us_df[rating_cat].nunique()

imdb_id                  386
title                    385
original_title           385
tagline                  383
belongs_to_collection    220
wikipedia_url            386
imdb_link                386
genres                   200
original_language          1
overview                 386
spoken_languages          80
country                    1
production_companies     331
production_countries      34
distributor               72
producers                308
director                 278
starring                 384
cinematography           211
editors                  309
writers                  335
composers                157
based_on                 291
dtype: int64

In [11]:
#Data Clean movie rating dataset
rating_us_df.dtypes

index                             int64
imdb_id                          object
kaggle_id                         int64
title                            object
original_title                   object
tagline                          object
belongs_to_collection            object
wikipedia_url                    object
imdb_link                        object
runtime                         float64
budget                          float64
revenue                         float64
release_date             datetime64[ns]
popularity                      float64
vote_average                    float64
vote_count                      float64
genres                           object
original_language                object
overview                         object
spoken_languages                 object
country                          object
production_companies             object
production_countries             object
distributor                      object
producers                        object


In [15]:
rating_us_df['release_date']

8      1990-06-08
27     1990-11-09
44     1990-08-24
47     1990-07-02
54     1990-08-17
          ...    
5974   2017-04-12
5986   2017-04-19
5999   2017-05-30
6010   2017-06-21
6021   2017-07-11
Name: release_date, Length: 386, dtype: datetime64[ns]

In [16]:
#Convert the release date to year
rating_us_df['release_date'] = pd.DatetimeIndex(rating_us_df['release_date']).year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
rating_us_df['release_date']

8       1990
27      1990
44      1990
47      1990
54      1990
        ... 
5974    2017
5986    2017
5999    2017
6010    2017
6021    2017
Name: release_date, Length: 386, dtype: int64

In [18]:
rating_us_df.dtypes

index                      int64
imdb_id                   object
kaggle_id                  int64
title                     object
original_title            object
tagline                   object
belongs_to_collection     object
wikipedia_url             object
imdb_link                 object
runtime                  float64
budget                   float64
revenue                  float64
release_date               int64
popularity               float64
vote_average             float64
vote_count               float64
genres                    object
original_language         object
overview                  object
spoken_languages          object
country                   object
production_companies      object
production_countries      object
distributor               object
producers                 object
director                  object
starring                  object
cinematography            object
editors                   object
writers                   object
composers 

In [None]:
columns =[]

In [None]:
df = df.loc[:, columns].copy()

In [None]:
#Classification (RandomForestClassification)

In [None]:
# Naive Bayes Classifier Model for Movie Review Sentiment Analysis