In [0]:
!pip install -U -q PyDrive


**Setup all the Authentication for PyDrive**

In [0]:
import pandas as pd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import io
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.discovery import build


In [0]:
# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


---


** *Method*: To read the contents of a local Google File**



In [0]:
def read_drive_file_into_variable(file_id_to_read):
  drive_service = build('drive', 'v3')
  request = drive_service.files().get_media(fileId=file_id_to_read)
  downloaded = io.BytesIO()
  downloader = MediaIoBaseDownload(downloaded, request)
  done = False
  while done is False:
    # _ is a placeholder for a progress object that we ignore.
    # (Our file is small, so we skip reporting progress.)
    status, done = downloader.next_chunk()
    prog = int(status.progress() * 100)
    print("Download "+str(prog))
  
  #Set the pointer to the start
  downloaded.seek(0)
  #print('Downloaded file contents are: {}'.format(downloaded.read()))
  
  #Read Everthing into a variable called "View", its in a "Bytes" datatype
  view = downloaded.read1(-1)
  #len(view)
  
  #decode Bytes to String format
  decoded = view.decode(encoding="utf-8")
  #type(decoded)
  
  #Convert this continous string to List for each new line
  variable_to_set = decoded.splitlines()
  #type(splitted)
  
  return variable_to_set
  
  

---

**Read the main two files into variables**

In [0]:
movie_Title_file_id = '0B8qgJkz0ynl8czBBNG9qZ2JPeW9RaWVuZktobUE1b29qaER3'
rating_file_id = '1iABtudmoCPxcFYiYQ0cQwfcQCZtfHsHd'
test_file_id = '11QhP0HwV7x6huJX3-JqJzGv7shMiZVPH'

In [6]:
key_file_list = read_drive_file_into_variable(movie_Title_file_id)


Download 100


In [0]:
#rating_file_list = read_drive_file_into_variable(rating_file_id)

In [8]:
test_file_list = read_drive_file_into_variable(test_file_id)

Download 100


---

**Convert the key file into a dataframe**

In [9]:
#Read the key file into a Panda dataframe
key_file_df = pd.DataFrame([sub.split(",") for sub in key_file_list],columns = ["MovieId","Year","Name","Genre1","Genre2","Genre3"])
key_file_df.head()

Unnamed: 0,MovieId,Year,Name,Genre1,Genre2,Genre3
0,1,2003,Dinosaur Planet,"""Animation",Documentary,"Family"""
1,3,1997,Character,"""Crime",Drama,"Mystery"""
2,6,1997,Sick,Drama,,
3,7,1992,8 Man,"""Action","Sci-Fi""",
4,10,2001,Fighter,Documentary,,


**Clean the dataframe**

In [10]:
# key_file_df['Genre'] = key_file_df[['Genre1','Genre2','Genre3']].apply(lambda x: ''.join(x), axis=1)
# key_file_df.head()
#Combine the Genres into a single column

key_file_df['Genre'] = key_file_df['Genre1'] + ',' + key_file_df['Genre2'] + ','+ key_file_df['Genre3']

#Clean the column by removing double quotes
for i, col in enumerate(key_file_df.columns):
    key_file_df.iloc[:,6] = key_file_df.iloc[:,6].str.replace('"', '')
    
#Drop the unnecessary columns
key_file_df.drop(['Genre1','Genre2','Genre3'], axis=1, inplace=True)

#Set MovieId as the Index
key_file_df.set_index('MovieId',inplace=True)

key_file_df.head()


Unnamed: 0_level_0,Year,Name,Genre
MovieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2003,Dinosaur Planet,"Animation,Documentary,Family"
3,1997,Character,"Crime,Drama,Mystery"
6,1997,Sick,
7,1992,8 Man,
10,2001,Fighter,


---

**Convert rating File into dataframe**

In [0]:
#Read the key file into a Panda dataframe
test_file_df = pd.DataFrame([sub.split(",") for sub in test_file_list],columns = ["UserId","Rating","DateWatched"])

#Initialize a blank column called MovieId ; to be filled latter
test_file_df["MovieId"] = ''

test_file_df.head()


---

In [0]:
#We will store the rows to drop in this
rows_to_drop = []
currentMovieId = 0

#Iterate over the dataframe to split it
for index, row in test_file_df.iterrows():
  if row['UserId'].find(':') != -1:
    #If entered it means that there is a colon on the row and it is a movie id
    #row[0][0] gives the first character on that row
    currentMovieId = int(row[0][0])
    rows_to_drop.append(index)
  else:
    #print("Not")
    #Assign the movie id to the consecutive rows
    test_file_df.iloc[index][3] = currentMovieId

In [34]:
#We get a dataframe with the movie id generated, now we need to filter out the rows
test_file_df.head()

Unnamed: 0,UserId,Rating,DateWatched,MovieId
0,1:,,,
1,1488844,3.0,2005-09-06,1.0
2,822109,5.0,2005-05-13,1.0
3,885013,4.0,2005-10-19,1.0
4,30878,4.0,2005-12-26,1.0
5,2:,,,
6,1488844,3.0,2005-09-06,2.0
7,822109,5.0,2005-05-13,2.0
8,885013,4.0,2005-10-19,2.0
9,30878,4.0,2005-12-26,2.0


In [40]:
#Drop the rows which had the movie Id and the columns to get a seamless dataframe
test_file_df.drop(test_file_df.index[rows_to_drop],inplace=True)
test_file_df.head()

Unnamed: 0,UserId,Rating,DateWatched,MovieId
1,1488844,3,2005-09-06,1
2,822109,5,2005-05-13,1
3,885013,4,2005-10-19,1
4,30878,4,2005-12-26,1
6,1488844,3,2005-09-06,2
7,822109,5,2005-05-13,2
8,885013,4,2005-10-19,2
9,30878,4,2005-12-26,2
11,1488844,3,2005-09-06,3
12,822109,5,2005-05-13,3


In [0]:
#Convert the column to datetime and keep only the year
test_file_df['YearWatched'] = pd.to_datetime(test_file_df['DateWatched']).dt.year

In [45]:
test_file_df

Unnamed: 0,UserId,Rating,DateWatched,MovieId,YearWatched
1,1488844,3,2005-09-06,1,2005
2,822109,5,2005-05-13,1,2005
3,885013,4,2005-10-19,1,2005
4,30878,4,2005-12-26,1,2005
6,1488844,3,2005-09-06,2,2005
7,822109,5,2005-05-13,2,2005
8,885013,4,2005-10-19,2,2005
9,30878,4,2005-12-26,2,2005
11,1488844,3,2005-09-06,3,2005
12,822109,5,2005-05-13,3,2005


---
 ** *Method*: To see the GPU usage**

In [0]:
def checkGPU():
  # memory footprint support libraries/code
  !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
  !pip install gputil
  !pip install psutil
  !pip install humanize
  import psutil
  import humanize
  import os
  import GPUtil as GPU
  GPUs = GPU.getGPUs()
  # XXX: only one GPU on Colab and isn’t guaranteed
  gpu = GPUs[0]
  process = psutil.Process(os.getpid())
  print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " I Proc size: " + humanize.naturalsize( process.memory_info().rss))
  print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
  


---

In [14]:
checkGPU()

[33mYou are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Gen RAM Free: 12.9 GB  I Proc size: 151.7 MB
GPU RAM Free: 11439MB | Used: 0MB | Util   0% | Total 11439MB
