[View in Colaboratory](https://colab.research.google.com/github/ShriPunta/Netflix-Graph-Dataset-Project/blob/master/ParsingTheCombinedFile.ipynb)

In [0]:
!pip install -U -q PyDrive


**Setup all the Authentication for PyDrive**

In [0]:
import pandas as pd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import io
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.discovery import build
import numpy as np



In [0]:
# 1. Authenticate and create the PyDrive client.
# Authenticate the current Colab user first, so that the application-default
# credentials requested two lines below are available.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): `drive` is not referenced again in the visible notebook; the
# download helper builds its own Drive v3 service. Confirm before removing.
drive = GoogleDrive(gauth)


---


**_Method_: Read the contents of a Google Drive file into a variable**



In [0]:
def read_drive_file_into_variable(file_id_to_read):
  """Download a Google Drive file and return its text as a list of lines.

  Args:
    file_id_to_read: Drive file id handed to the Drive v3 ``files().get_media`` call.

  Returns:
    list of str: the UTF-8 decoded file content, split on line boundaries.
  """
  service = build('drive', 'v3')
  media_request = service.files().get_media(fileId=file_id_to_read)
  buffer = io.BytesIO()
  chunk_downloader = MediaIoBaseDownload(buffer, media_request)

  #Fetch the file chunk by chunk, printing the integer percentage after each chunk
  finished = False
  while finished is False:
    progress, finished = chunk_downloader.next_chunk()
    print("Download " + str(int(progress.progress() * 100)))

  #Everything downloaded is held in the buffer as bytes:
  #decode it to text and break the continuous string into one entry per line
  return buffer.getvalue().decode(encoding="utf-8").splitlines()
  
  

---

**Read the main two files into variables**

In [0]:
#Google Drive file ids of the inputs read by this notebook
#Movie key file (parsed below into MovieId, Year, Name and genre columns)
movie_Title_file_id = '0B8qgJkz0ynl8czBBNG9qZ2JPeW9RaWVuZktobUE1b29qaER3'
#Combined ratings file
rating_file_id = '1iABtudmoCPxcFYiYQ0cQwfcQCZtfHsHd'
#NOTE(review): only used by a commented-out read in the visible notebook
test_file_id = '11QhP0HwV7x6huJX3-JqJzGv7shMiZVPH'

In [6]:
key_file_list = read_drive_file_into_variable(movie_Title_file_id)


Download 100


In [7]:
rating_file_list = read_drive_file_into_variable(rating_file_id)

Download 21
Download 42
Download 63
Download 84
Download 100


In [0]:
#test_file_list = read_drive_file_into_variable(test_file_id)

---

**Convert the key file into a dataframe**

In [9]:
#Read the key file into a Panda dataframe
#Every line is split on each comma, so a quoted genre list such as
#"Animation,Documentary,Family" is spread over Genre1..Genre3 (keeping its stray
#double quotes) and movies with fewer genres get None in the unused columns.
#NOTE(review): a movie name containing a comma would shift the columns - confirm the key file has none.
key_file_df = pd.DataFrame([sub.split(",") for sub in key_file_list],columns = ["MovieId","Year","Name","Genre1","Genre2","Genre3"])

key_file_df.head()

Unnamed: 0,MovieId,Year,Name,Genre1,Genre2,Genre3
0,1,2003,Dinosaur Planet,"""Animation",Documentary,"Family"""
1,3,1997,Character,"""Crime",Drama,"Mystery"""
2,6,1997,Sick,Drama,,
3,7,1992,8 Man,"""Action","Sci-Fi""",
4,10,2001,Fighter,Documentary,,


In [10]:
key_file_df.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11267 entries, 0 to 11266
Data columns (total 6 columns):
MovieId    11267 non-null object
Year       11267 non-null object
Name       11267 non-null object
Genre1     11267 non-null object
Genre2     8381 non-null object
Genre3     4876 non-null object
dtypes: object(6)
memory usage: 3.8 MB


**Clean the dataframe**

In [0]:
#key_file_df['Genre'] = key_file_df[['Genre1','Genre2','Genre3']].apply(lambda x: ''.join(x), axis=1)
# key_file_df.head()

def clean_the_movie_key_df(key_file_df):
  """Tidy the raw movie key dataframe into one row per valid movie.

  The raw frame must have the columns MovieId, Year, Name, Genre1, Genre2 and
  Genre3, with unused genre columns holding None/NaN.

  Steps:
    * merge Genre1..Genre3 into a single comma separated ``Genre`` column,
      stripping double quotes and skipping blank parts, so a movie with one
      genre yields ``Drama`` rather than ``Drama,,``;
    * convert MovieId and Year to numbers (unparseable values become NaN);
    * drop every row that still has a missing value - this happens BEFORE
      MovieId becomes the index, so rows with an invalid MovieId are removed
      instead of surviving under a NaN index label;
    * use the MovieId (as int64) as the index.

  The input dataframe is not modified.

  Args:
    key_file_df: raw movie key dataframe.

  Returns:
    pandas.DataFrame indexed by integer MovieId with columns Year, Name, Genre.
  """
  genre_columns = ['Genre1', 'Genre2', 'Genre3']

  #Convert NaN/None to blank; fillna returns a new frame so the caller's data is untouched
  cleaned = key_file_df.fillna('')

  #Combine the Genres into a single column: remove the double quotes and
  #ignore blank parts so that no dangling commas are left behind
  genre_rows = cleaned[genre_columns].astype(str).values.tolist()
  cleaned['Genre'] = [
      ','.join(part for part in (raw.replace('"', '') for raw in row) if part)
      for row in genre_rows
  ]

  cleaned = (
      cleaned
      #Drop the unnecessary columns
      .drop(genre_columns, axis=1)
      .assign(
          #Convert to numeric; anything unparseable becomes NaN
          MovieId=lambda df: pd.to_numeric(df['MovieId'], errors='coerce'),
          Year=lambda df: pd.to_numeric(df['Year'], errors='coerce'),
          #Convert to String
          Name=lambda df: df['Name'].astype('str'),
      )
      #Drop any rows which have NaN or NULL (must run while MovieId is still a column)
      .dropna(axis=0, how='any')
      #The coerced ids are floats whenever a NaN was present; restore integers
      .astype({'MovieId': 'int64'})
  )

  #Set MovieId as the Index
  return cleaned.set_index('MovieId')


In [0]:
key_file_df = clean_the_movie_key_df(key_file_df)

In [13]:
#Gives detailing on how many null or na there are
key_file_df.isnull().sum()

Year     0
Name     0
Genre    0
dtype: int64

In [14]:
key_file_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11265 entries, 1 to 17770
Data columns (total 3 columns):
Year     11265 non-null float64
Name     11265 non-null object
Genre    11265 non-null object
dtypes: float64(1), object(2)
memory usage: 1.7 MB


**We store the index (the movie ids) as an array, to be used as a reference later**

In [15]:
key_movie_values = key_file_df.index.values
key_movie_values


array([    1,     3,     6, ..., 17768, 17769, 17770])

---

**---> PARSING and SAMPLING <---**

**SAMPLING: Takes the ~24 million rating lines and keeps only a limited number of ratings per movie (controlled by the `limiter` variable).**

**We only want ratings for movies whose genre we know, so movies that are not present in the movie title file are weeded out.**

**----**
  
**PARSING: Appends the movie id to each rating line as an extra comma-separated field (the line is split later to create a dataframe).**

In [0]:
def do_sampling(list_to_iterate, limiter=100, valid_movie_ids=None):
  """Sample the combined ratings lines, keeping the movie header lines.

  The input alternates between movie header lines containing a ':' (for
  example ``1:``) and rating lines. For every movie whose id is in
  ``valid_movie_ids`` the result contains the bare movie id (header with the
  ':' removed) followed by at most ``limiter`` rating lines, each with
  ``,<movie id>`` appended. Movies with an unknown or non-numeric id, rating
  lines seen before the first header, and blank lines are all skipped.

  Args:
    list_to_iterate: iterable of str lines from the ratings file.
    limiter: maximum number of rating lines kept per movie.
    valid_movie_ids: iterable of integer movie ids to keep. Defaults to the
      notebook-level ``key_movie_values``.

  Returns:
    list of str: the sampled header and rating lines, in input order.
  """
  if valid_movie_ids is None:
    #Fall back to the notebook-level array of movie ids taken from the key file
    valid_movie_ids = key_movie_values
  #A set makes every membership test constant-time instead of a scan over the array
  wanted_ids = set(valid_movie_ids)

  #variable to be returned
  refined_list = []
  #False until a header for a wanted movie has been seen
  keep_current_movie = False
  count = 0
  movieId_to_search = ''
  for ele in list_to_iterate:
    #If the element has a ':' then its a movie id
    if ':' in ele:
      count = 1
      #Remove the ':' so that the id can later be converted with pd.to_numeric
      movieId_to_search = ele.replace(':','').strip()
      try:
        #Compare as an integer: the reference ids are numeric, so the raw text never matches
        keep_current_movie = int(movieId_to_search) in wanted_ids
      except ValueError:
        #A header that is not a number cannot belong to a known movie
        keep_current_movie = False
      if keep_current_movie:
        refined_list.append(movieId_to_search)
    elif keep_current_movie and count <= limiter and ele.strip():
      #Add a comma separated value of the movieId to split and form a column later
      refined_list.append(ele + ',' + movieId_to_search)
      count += 1
  return refined_list

In [0]:
def do_sampling1(list_to_iterate, limiter=100, valid_movie_ids=None):
  """Sample the combined ratings lines, returning rating lines only.

  The input alternates between movie header lines containing a ':' (for
  example ``1:``) and rating lines. For every movie whose id is in
  ``valid_movie_ids`` at most ``limiter`` rating lines are kept, each with
  ``,<movie id>`` appended so the id can become a dataframe column later.
  Header lines themselves are not returned. Movies with an unknown or
  non-numeric id, rating lines seen before the first header, and blank lines
  are all skipped.

  Args:
    list_to_iterate: iterable of str lines from the ratings file.
    limiter: maximum number of rating lines kept per movie.
    valid_movie_ids: iterable of integer movie ids to keep. Defaults to the
      notebook-level ``key_movie_values``.

  Returns:
    list of str: the sampled rating lines, in input order.
  """
  if valid_movie_ids is None:
    #Fall back to the notebook-level array of movie ids taken from the key file
    valid_movie_ids = key_movie_values
  #A set makes every membership test constant-time instead of a scan over the array
  wanted_ids = set(valid_movie_ids)

  refined_list = []
  #False until a header for a wanted movie has been seen, so that stray
  #leading rating lines are not attributed to a non-existent movie
  keep_current_movie = False
  count = 0
  key_to_search = ''
  for ele in list_to_iterate:
    if ':' in ele:
      #Header line: restart the per-movie counter and decide whether to keep the movie
      count = 1
      key_to_search = ele.split(':')[0].strip()
      try:
        keep_current_movie = int(key_to_search) in wanted_ids
      except ValueError:
        #A header that is not a number cannot belong to a known movie
        keep_current_movie = False
    elif keep_current_movie and count <= limiter and ele.strip():
      refined_list.append(ele + ',' + key_to_search)
      count += 1
  return refined_list

In [0]:
refined_list = do_sampling1(rating_file_list)


---
**Get a judgement of the size of the earlier list and size after sampling**


In [51]:
from sys import getsizeof
#print("Total number of elements -->",len(rating_file_list),"  Size they occupy in bytes -->",getsizeof(rating_file_list))
#NOTE: getsizeof on a list measures only the list object itself (its array of references),
#not the strings it refers to, so the byte figures below understate the real memory footprint.

print("Total number of Elements in rating_file_list list -->",len(rating_file_list),"  Size they occupy in bytes -->",getsizeof(rating_file_list))
print("Total number of Elements in refined_list list -->",len(refined_list),"  Size they occupy in bytes -->",getsizeof(refined_list))

Total number of Elements in rating_file_list list --> 24058263   Size they occupy in bytes --> 209183992
Total number of Elements in refined_list list --> 277420   Size they occupy in bytes --> 2380488


---

**Convert rating File into dataframe**

In [52]:
#Split every sampled rating line into its four fields and load them into a Panda dataframe
#(the last field is the movie id that the sampling step appended to each line)
test_file_df = pd.DataFrame([sub.split(",") for sub in refined_list],columns = ["UserId","Rating","YearWatched","MovieId"])

#Need memory optimization, convert the object (string) columns to numeric.
#Each column is converted exactly once; a non-numeric value raises here instead of being silently kept.
test_file_df[['UserId','Rating','MovieId']] = test_file_df[['UserId','Rating','MovieId']].apply(pd.to_numeric)

#Convert the column to datetime and keep only the year
test_file_df['YearWatched'] = pd.to_datetime(test_file_df['YearWatched']).dt.year

#The original author reports that these dtype changes brought memory usage from 5.4 GB to 2 GB
#(presumably measured on the unsampled data - TODO confirm)
test_file_df.info(memory_usage = 'deep')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277420 entries, 0 to 277419
Data columns (total 4 columns):
UserId         277420 non-null int64
Rating         277420 non-null int64
YearWatched    277420 non-null int64
MovieId        277420 non-null int64
dtypes: int64(4)
memory usage: 8.5 MB




---

**BELOW 2 blocks have been deemed deprecated due to this statement**

In [0]:
#Drop the rows that have fewer than 2 non-null values.
#Only `thresh` is passed: recent pandas versions raise a TypeError when `how` and `thresh`
#are both given, and where both were accepted `thresh` was the one that took effect.
test_file_df.dropna(axis=0, thresh=2, inplace=True)


In [54]:
test_file_df.head(2)

Unnamed: 0,UserId,Rating,YearWatched,MovieId
0,1488844,3,2005,1
1,822109,5,2005,1




---





---



In [63]:
#Count the NaN/null values in every column of test_file_df.
#NOTE: despite its name this is a per-column Series of counts (isnull().sum()), not a True/False matrix.
the_NaN_matrix = test_file_df.isnull().sum()
the_NaN_matrix

UserId         0
Rating         0
YearWatched    0
MovieId        0
Genre          0
dtype: int64

In [0]:
#Ultra fast method to get the indexes of the rows we need to drop; Reduces time from a minute to a second
#For records which only had the movie id and nothing else, will have NaN for columns other than the first column
#fancy_list = test_file_df[the_NaN_matrix['Rating'] == True].index.values



In [0]:
#type(fancy_list[2])

---
**This below block is deprecated, due to the faster and optimized ways.**

In [0]:
# #We will store the rows to drop in this
# rows_to_drop = []
# currentMovieId = 0
# list_movieId = []


# #Iterate over the dataframe to split it
# for index, row in test_file_df.iterrows():
#   if the_NaN_matrix.iloc[index,2]:
#     #If entered it means that there is a colon on the row and it is a movie id
#     #row gives the first character on that row
#     #currentMovieId = row['UserId']
#     rows_to_drop.append(index)
#   else:
#     list_movieId.append(currentMovieId)
#     #print("Not")
#     #Assign the movie id to the consecutive rows
#     #test_file_df.iloc[index,3] = currentMovieId

# #Create a series object from the list
# #MovieIdSeries = pd.Series(list_movieId)  



---



In [0]:
#Drop the rows which had the movie Id and the columns to get a seamless dataframe
#test_file_df.drop(test_file_df.index[fancy_list],inplace=True)




---





---



In [0]:
#Copy the Genre Column from key file pandas dataframe:
#each rating's MovieId is looked up in key_file_df (which is indexed by MovieId).
#No blank column has to be created first - the assignment creates it.
test_file_df['Genre'] = test_file_df['MovieId'].map(key_file_df['Genre'])

In [59]:
test_file_df.head(20)

Unnamed: 0,UserId,Rating,YearWatched,MovieId,Genre
0,1488844,3,2005,1,"Animation,Documentary,Family"
1,822109,5,2005,1,"Animation,Documentary,Family"
2,885013,4,2005,1,"Animation,Documentary,Family"
3,30878,4,2005,1,"Animation,Documentary,Family"
4,823519,3,2004,1,"Animation,Documentary,Family"
5,893988,3,2005,1,"Animation,Documentary,Family"
6,124105,4,2004,1,"Animation,Documentary,Family"
7,1248029,3,2004,1,"Animation,Documentary,Family"
8,1842128,4,2004,1,"Animation,Documentary,Family"
9,2238063,3,2005,1,"Animation,Documentary,Family"


---
**_Method_: To see the GPU usage**

In [0]:
def checkGPU():
  # memory footprint support libraries/code
  !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
  !pip install gputil
  !pip install psutil
  !pip install humanize
  import psutil
  import humanize
  import os
  import GPUtil as GPU
  GPUs = GPU.getGPUs()
  # XXX: only one GPU on Colab and isn’t guaranteed
  gpu = GPUs[0]
  process = psutil.Process(os.getpid())
  print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " I Proc size: " + humanize.naturalsize( process.memory_info().rss))
  print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
  


---

In [61]:
checkGPU()

Looking in indexes: https://pypi.org/simple, https://legacy.pypi.org/simple
Looking in indexes: https://pypi.org/simple, https://legacy.pypi.org/simple
Looking in indexes: https://pypi.org/simple, https://legacy.pypi.org/simple
Gen RAM Free: 8.7 GB  I Proc size: 4.2 GB
GPU RAM Free: 11439MB | Used: 0MB | Util   0% | Total 11439MB
