# Data Preparation

#### Here we prepare the data for our search function. The data is currently stored in three separate CSV files:

<ul>
    <li>movies.csv</li>
    <li>links.csv</li>
    <li>ratings.csv</li>
</ul>

In [2]:
import pandas as pd
from collections import defaultdict
from os import getcwd

### Defining the paths to the data files

In [3]:
# Locate the three CSV files inside the directory the notebook is run from.
_data_dir = getcwd()
path_links = _data_dir + "/links.csv"
path_movies = _data_dir + "/movies.csv"
path_ratings = _data_dir + "/ratings.csv"

## Data Engineering

In [4]:
# Load movies.csv into a dataframe and show which columns it provides.
df_movies = pd.read_csv(path_movies)
movies_table_columns = df_movies.columns.to_list()
print(f"COLUMNS : {movies_table_columns}")

COLUMNS : ['movieId', 'title', 'genres']


In [5]:
# Load links.csv into a dataframe and show which columns it provides.
df_links = pd.read_csv(path_links)
links_table_columns = df_links.columns.to_list()
print(f"COLUMNS : {links_table_columns}")

COLUMNS : ['movieId', 'imdbId', 'tmdbId']


In [6]:
# Load ratings.csv into a dataframe and show which columns it provides.
df_ratings = pd.read_csv(path_ratings)

# Named after the table it describes, like movies_table_columns and
# links_table_columns.
ratings_table_columns = df_ratings.columns.tolist()

# Backward-compatible alias: this list used to be called path_table_columns.
path_table_columns = ratings_table_columns

print(f"COLUMNS : {ratings_table_columns}")

COLUMNS : ['userId', 'movieId', 'rating', 'timestamp']


In [7]:
# Check, per table, whether the candidate key column holds a distinct value in every row.
movies_ids_unique = df_movies["movieId"].is_unique
links_ids_unique = df_links["movieId"].is_unique
rating_users_unique = df_ratings["userId"].is_unique

print(f"It is {movies_ids_unique}  that the column 'movieId' has unique values for all entries in movies dataframe.")
print(f"It is {links_ids_unique}  that the column 'movieId' has unique values for all entries in links dataframe.")
print(f"It is {rating_users_unique} that the column 'userId'  has unique values for all entries in ratings dataframe.")

# Order the movies table by movieId (its values are checked for uniqueness above).
df_movies_sorted = df_movies.sort_values("movieId")

# Order the links table by movieId in the same way.
df_links_sorted = df_links.sort_values("movieId")

It is True  that the column 'movieId' has unique values for all entries in movies dataframe.
It is True  that the column 'movieId' has unique values for all entries in links dataframe.
It is False that the column 'userId'  has unique values for all entries in ratings dataframe.


In [9]:
# From the movies dataframe.
# All three lists are read from the *sorted* frame so that position i refers
# to the same movie in each of them.
movieIds = df_movies_sorted["movieId"].tolist()
movieTitles = df_movies_sorted["title"].tolist()
# "genres" is a single '|'-separated string per movie; split it into a list.
movieGenres = [genre.split("|") for genre in df_movies_sorted["genres"].tolist()]

# From the links dataframe (also sorted by movieId).
imdbId = df_links_sorted["imdbId"].tolist()
tmdbId = df_links_sorted["tmdbId"].tolist()

# The movie and link lists are paired by position when the records are built,
# so both tables must list exactly the same movieIds in the same order.
assert movieIds == df_links_sorted["movieId"].tolist(), (
    "movies.csv and links.csv do not contain the same movieIds; "
    "pairing their columns by position would attach links to the wrong movie"
)

In [10]:
# movieDict:             movieId -> {"genre": [...], "links": {"imdb": ..., "tmdb": ...}}
# global_secondaryIndex: title   -> movieId
# If two movies share a title, the later one overwrites the earlier index entry.
movieDict = {}
global_secondaryIndex = {}
for pos in range(len(movieIds)):
    current_id = movieIds[pos]
    link_info = {"imdb" : imdbId[pos], "tmdb" : tmdbId[pos]}
    movieDict[current_id] = {"genre" : movieGenres[pos], "links" : link_info}
    global_secondaryIndex[movieTitles[pos]] = current_id

In [12]:
# userId is not unique in the ratings dataframe, so the rows are grouped by
# movieId instead:
#     ratings[movieId] -> [{"userId": ..., "rating": ...}, ...]
userIds = df_ratings["userId"].tolist()
# NOTE: this rebinds movieIds (previously the movie-table ids) to the
# per-rating movieId column; the name is kept for compatibility.
movieIds = df_ratings["movieId"].tolist()
user_ratings = df_ratings["rating"].tolist()

ratings = {}
for mId, userId, rating in zip(movieIds, userIds, user_ratings):
    # setdefault creates the list the first time a movie is seen, so every
    # rating row is stored exactly once.
    ratings.setdefault(mId, []).append({"userId" : userId, "rating" : rating})

# Attach the grouped ratings to each movie record; a movie that nobody rated
# gets an empty list. The list is copied so movieDict does not share objects
# with the ratings lookup.
for mId, movie in movieDict.items():
    movie["user_rating"] = list(ratings.get(mId, []))

In [14]:
import json

# Each entry: (progress message, output file name, object to serialise).
_json_outputs = (
    ("[INFO] Writing movie Data into the disk...", 'dataFinal.json', movieDict),
    ("[INFO] Writing Global Secondary Index Data into the disk...", 'dataFinal_GIS.json', global_secondaryIndex),
)

for message, filename, payload in _json_outputs:
    print(message)
    # Keys are sorted and the output is indented by 4 spaces.
    with open(filename, 'w') as out_file:
        json.dump(payload, out_file, sort_keys=True, indent=4)

[INFO] Writing movie Data into the disk...
[INFO] Writing Global Secondary Index Data into the disk...


<h6>Our data is now prepared for searching.</h6>