# Creating Cleaned Movies dataset for Data Exploration

In [1]:
#Importing required packages
import pandas as pd
import datetime
import numpy as np
#Show up to 23 columns when a DataFrame is displayed
pd.options.display.max_columns = 23

In [2]:
#Load the movies table from the Data folder (path is relative to this notebook)
movies_csv_path = '../Data/movies.csv'
movies = pd.read_csv(movies_csv_path)

In [3]:
#Preview the first five rows (head() defaults to n=5)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


We see that the genres of each movie are combined into a single column. To explore how genre and ratings are related, we split the genres into separate columns and insert 1/0 for each movie.

We implemented this in 2 ways and used the more time-efficient method.

In [4]:
#Method 1 (vectorized):
#Step 1: Collect the genre names in order of first appearance
#Step 2: One-hot encode the '|'-separated genres column and add one 1.0/0.0 column per genre

print(datetime.datetime.now())
#Genre names in first-appearance order, so the new columns keep the same layout
#as when they were created row by row
genre_names = list(dict.fromkeys(
    genre
    for genre_string in movies['genres'].dropna()
    for genre in genre_string.split("|")
))
#str.get_dummies builds every indicator column in one pass instead of
#one .at write per (movie, genre) pair
genre_flags = (
    movies['genres']
    .str.get_dummies(sep="|")
    .reindex(columns=genre_names, fill_value=0)
    .astype(float)  #keep the 1.0/0.0 float values the genre columns had before
)
#Assign column by column: re-running the cell overwrites the flags instead of
#duplicating columns. No blanket fillna(0) is needed, so missing values in the
#non-genre columns (e.g. title) are left untouched.
for genre in genre_names:
    movies[genre] = genre_flags[genre]
print(datetime.datetime.now())

2018-08-06 20:13:15.432476
2018-08-06 20:13:17.698147


#Time consuming method
#Method 2:
#Step 1: Add a new column for every genre to the movies dataframe and insert 0 for all movies
#Step 2: Insert 1 for a specific genre for the specific movie.

print(datetime.datetime.now())

for i,row in movies.iterrows():
    for x in row['genres'].split("|"):
        movies[x]=0
        
for i,row in movies.iterrows():
    for x in row['genres'].split("|"):
        movies.at[i,x]=1
        
print(datetime.datetime.now())

#Takes 10 seconds more for 15 million records

In [5]:
#Allow up to 23 columns in the display, then preview the first five rows
pd.options.display.max_columns = 23
movies.head()

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#The genre indicator columns are every column to the right of 'genres'.
#Locating them by name avoids the hard-coded position 3, which silently selects
#the wrong columns if the CSV layout changes (e.g. an extra leading index column).
first_genre_pos = movies.columns.get_loc('genres') + 1
col_list = movies.columns.values[first_genre_pos:]
col_list

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'Documentary', 'War', 'Musical',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [7]:
#Number of genres per movie = row-wise sum of the genre indicator columns.
#'(no genres listed)' is a placeholder, not a genre, so it is left out of the sum;
#otherwise movies without any genre would be reported with Genre_Count 1.
genre_cols = [col for col in col_list if col != '(no genres listed)']
movies['Genre_Count'] = movies[genre_cols].sum(axis=1)

In [8]:
#Write the cleaned movies table to the Output folder
#(the DataFrame index is written as the first, unnamed CSV column)
cleaned_movies_path = '../Output/CleanedMovies.csv'
movies.to_csv(cleaned_movies_path)

# Create Ratings Sample

In [9]:
#Load the ratings table from the Data folder (path is relative to this notebook)
ratings_csv_path = '../Data/ratings.csv'
ratings = pd.read_csv(ratings_csv_path)

In [10]:
#Downsampling size: the row count of the least frequent rating value
#(rows are counted through their non-null userId entries)
ratings_per_value = ratings.groupby("rating")['userId'].count()
sampleSize = ratings_per_value.min()
sampleSize

239125

In [11]:
#Distinct rating values, in order of first appearance, as a numpy array
value = pd.unique(ratings['rating'])
value

array([3.5, 4. , 3. , 4.5, 5. , 2. , 1. , 2.5, 0.5, 1.5])

In [12]:
#Start from an empty DataFrame that will hold the balanced ratings sample
RatingSampledf = pd.DataFrame(data=None)

In [13]:
#Draw sampleSize rows for every rating value so each rating is equally represented.
#A seeded RandomState makes the sample reproducible on every run of this cell.
sample_rng = np.random.RandomState(42)
#Collect the per-rating samples and concatenate once: calling pd.concat inside
#the loop re-copies all previously collected rows on every iteration, and
#appending to an existing frame doubles the sample if the cell is run twice.
rating_samples = [
    ratings.loc[ratings['rating'] == ratingValue].sample(n=sampleSize, random_state=sample_rng)
    for ratingValue in value
]
RatingSampledf = pd.concat(rating_samples, axis=0)
print("Sample created")

Sample created


In [14]:
#Total number of rows in the sample
RatingSampledf.shape[0]

2391250

In [15]:
#Write the balanced ratings sample to the Output folder
#(the DataFrame index is written as the first, unnamed CSV column)
sample_ratings_path = "../Output/SampleRatings.csv"
RatingSampledf.to_csv(sample_ratings_path)

# Create a WordCloud for genres of top 100 movies with maximum ratings

In [16]:
#Count the rating rows per movie, then keep the 100 movies with the highest 'rating' count
ratings_per_movie = ratings.groupby('movieId').count()
top100ratings = ratings_per_movie.nlargest(100, 'rating')

In [17]:
#Inner-join the per-movie rating counts with the movies table on movieId
movie_rating = top100ratings.merge(movies, how='inner', on='movieId')

In [18]:
#Pull the 'genres' column out as a plain Python list of strings
Genres = movie_rating['genres'].tolist()

In [19]:
#Concatenate the per-movie genre strings into one comma-separated string.
#(Using the result as the separator of a pair of empty strings returns it unchanged,
#so a single join is sufficient.)
Genres = ",".join(Genres)

In [20]:
#Put every genre on its own line for the CSV export:
#turn each '|' into ',' and then follow every ',' with a newline
GenresString = Genres.replace('|', ',').replace(',', ',\n')

In [21]:
#Write the genre text to GenreString.csv; the with-block closes the file even if write() raises
with open('../Output/GenreString.csv', 'w') as f:
    #In text mode Python converts each '\n' to the platform line separator on write
    f.write(GenresString)

# Check sparsity

We tried to compute the sparsity of the user vs. movie rating matrix for all the movies, using the methods below.

In [22]:
#Method 1:
#Count the distinct movies and users, then compute the figure arithmetically:
#the number of rating rows as a percentage of all movie x user combinations
movielen = movies['movieId'].nunique(dropna=False)
userlen = ratings['userId'].nunique(dropna=False)
total_cells = movielen * userlen
sparsity = 100 * len(ratings) / total_cells
print(sparsity)

0.5294139230357805


#Method 2: <br />
#Step 1: Create a dataframe of size len(users) X len(movie) with 0 <br />
#Step 2: Add the rating for a user in the corresponding movie column <br />
#Step 3: Find sparsity by counting the non-zero entries, dividing this count by the size of the user X movie matrix and multiplying the result by 100 <br />
 <br />
sparsedf = np.zeros((userlen, movielen)) <br />
for row in ratings.itertuples(): <br />
    sparsedf[row[1]-1, row[2]-1] = row[3] <br />
<br />
sparsity = float(len(sparsedf.nonzero()[0])) <br />
sparsity /= (sparsedf.shape[0] * sparsedf.shape[1]) <br />
sparsity *= 100 <br />
print('Sparsity: {:4.2f}%'.format(sparsity)) <br />
 <br />
#Error encountered: <br />
#movieId values are not contiguous, so the largest movieId exceeds the number of distinct movies and indexing by movieId leads to an index out of bounds error

#Method 3: <br />
#Step 1: Create a dataframe and insert 1 in the cell for each (movie, user) pair that has a rating
 <br />
newratings=pd.DataFrame() <br />
for i,row in ratings.iterrows():  <br />
    newratings.at[row['movieId'],row['userId']]=1  <br />
 <br />
newratings=newratings.fillna(0) <br />
<br />
#Error Encountered:
#Takes more than 100 minutes to execute for 20 million observations