In [34]:
# Libraries for data file processing 
import os

# Libraries for special visualization
import wordcloud
from wordcloud import STOPWORDS
from wordcloud import WordCloud

# Essential libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Data Preprocessing

**Summary:**
These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 
made by 6,040 MovieLens users who joined MovieLens in 2000.

In [35]:
# Name of the raw ratings file stored in the dat_files/ folder
rating_dat_file = 'ratings.dat'

# Parse the '::'-delimited ratings file and label its four columns
ratings_data = pd.read_csv(
    os.path.join("dat_files", rating_dat_file),
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    sep='::',
    encoding='latin-1',
    engine='python',
)

# Report how many rating rows were read
print("Loading.. ", ratings_data.shape[0])

Loading..  1000209


In [36]:
# Name of the raw movies file stored in the dat_files/ folder
movies_dat_file = 'movies.dat'

# Parse the '::'-delimited movies file and label its three columns
movies_data = pd.read_csv(
    os.path.join("dat_files", movies_dat_file),
    names=["movie_id", "title", "genres"],
    sep='::',
    encoding='latin-1',
    engine='python',
)

# Report how many movie rows were read
print("Loading.. ", movies_data.shape[0])

Loading..  3883


In [37]:
# Name of the raw users file stored in the dat_files/ folder
users_dat_file = 'users.dat'

# Parse the '::'-delimited users file and label its five columns
users_data = pd.read_csv(
    os.path.join("dat_files", users_dat_file),
    names=["user_id", "gender", "age", "occupation", "zipcode"],
    sep='::',
    encoding='latin-1',
    engine='python',
)

# Report how many user rows were read, followed by a blank separator line
print("Loading.. ", users_data.shape[0])
print()
# Preview the first five rows, still holding the raw numeric codes
print("Current Users dataframe that is numerically encoded: ")
users_data.head(n=5)

Loading..  6040

Current Users dataframe that is numerically encoded: 


Unnamed: 0,user_id,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


It can be seen from the users dataframe above that the occupation feature is numerically encoded and that age is stored as a category code, as described below:

- Age is chosen from the following ranges:

	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"
- Occupation is chosen from the following choices:

	*  0:  "other" or not specified
	*  1:  "academic/educator"
	*  2:  "artist"
	*  3:  "clerical/admin"
	*  4:  "college/grad student"
	*  5:  "customer service"
	*  6:  "doctor/health care"
	*  7:  "executive/managerial"
	*  8:  "farmer"
	*  9:  "homemaker"
	* 10:  "K-12 student"
	* 11:  "lawyer"
	* 12:  "programmer"
	* 13:  "retired"
	* 14:  "sales/marketing"
	* 15:  "scientist"
	* 16:  "self-employed"
	* 17:  "technician/engineer"
	* 18:  "tradesman/craftsman"
	* 19:  "unemployed"
	* 20:  "writer"

In [38]:
# Lookup tables translating the numeric codes stored in users_data into the
# human-readable labels listed in the code tables above (code -> label).
age_category = { 1: "Under 18", 
                18: "18-24", 
                25: "25-34", 
                35: "35-44", 
                45: "45-49", 
                50: "50-55", 
                56: "56+"}

occupation_category = {0: "other or not specified",
                       1: "academic/educator",
                       2: "artist", 
                       3: "clerical/admin",
                       4: "college/grad student", 
                       5: "customer service", 
                       6: "doctor/health care",
                       7: "executive/managerial", 
                       8: "farmer", 
                       9: "homemaker", 
                       10: "K-12 student", 
                       11: "lawyer",
                       12: "programmer", 
                       13: "retired", 
                       14: "sales/marketing", 
                       15: "scientist", 
                       16: "self-employed",
                       17: "technician/engineer", 
                       18: "tradesman/craftsman", 
                       19: "unemployed", 
                       20: "writer"}

# Add a readable age bracket next to the raw age code. Series.map does the
# dictionary lookup in one vectorized call instead of a per-row lambda.
users_data['age_group'] = users_data['age'].map(age_category)

# Replace the occupation code with its label. The column is overwritten, so
# only remap while it still holds the raw numeric codes; otherwise re-running
# this cell would look up the already-decoded strings and fail.
if pd.api.types.is_numeric_dtype(users_data['occupation']):
    users_data['occupation'] = users_data['occupation'].map(occupation_category)

# Series.map yields NaN for a code missing from the lookup table; fail loudly
# rather than letting unlabeled rows slip into the analysis.
assert users_data[['age_group', 'occupation']].notna().all().all(), \
    "users_data contains an age or occupation code with no label"

# Resultant dataframe
print("Processed dataframe: ")
users_data.head()

Processed dataframe: 


Unnamed: 0,user_id,gender,age,occupation,zipcode,age_group
0,1,F,1,K-12 student,48067,Under 18
1,2,M,56,self-employed,70072,56+
2,3,M,25,scientist,55117,25-34
3,4,M,45,executive/managerial,2460,45-49
4,5,M,25,writer,55455,25-34


#### Saving data into respective csv files

In [39]:
# Write the ratings table to ratings_data.csv as a tab-separated file,
# exporting the four listed columns with a header row
ratings_data.to_csv("ratings_data.csv", sep='\t', header=True, encoding='latin-1', 
               columns=['user_id', 'movie_id', 'rating', 'timestamp'])

# Confirmation message; it names the file that was actually written above
print("ratings_data.csv saved in directory..")

rating_data.csv saved in directory..


In [40]:
# Write the movies table to movies_data.csv as a tab-separated file,
# exporting the three listed columns with a header row
movies_data.to_csv(
    "movies_data.csv",
    columns=["movie_id", "title", "genres"],
    header=True,
    sep='\t',
    encoding='latin-1',
)

# Confirmation message
print("movies_data.csv saved in directory..")

movies_data.csv saved in directory..


In [41]:
# Write the users table to users_data.csv as a tab-separated file with a
# header row. Only the five listed columns are exported; age_group is not
# in the list and is therefore not written.
users_data.to_csv(
    "users_data.csv",
    columns=["user_id", "gender", "age", "occupation", "zipcode"],
    header=True,
    sep='\t',
    encoding='latin-1',
)

# Confirmation message
print("users_data.csv saved in directory..")

users_data.csv saved in directory..


# Exploratory Data Analysis

### Movie dataset

In [42]:
# Preview the first five rows of the movies table
movies_data.head(n=5)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [45]:
# Summarize the movies table: column names, non-null counts and dtypes
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genres    3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


### Ratings dataset 

In [43]:
# Drop the timestamp column, then preview the first five ratings.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution finds no timestamp
# column and simply leaves the frame unchanged instead of raising KeyError.
ratings_data = ratings_data.drop(columns=["timestamp"], errors="ignore")
ratings_data.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [46]:
# Summarize the ratings table: column names, non-null counts and dtypes
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 3 columns):
 #   Column    Non-Null Count    Dtype
---  ------    --------------    -----
 0   user_id   1000209 non-null  int64
 1   movie_id  1000209 non-null  int64
 2   rating    1000209 non-null  int64
dtypes: int64(3)
memory usage: 22.9 MB


### Users dataset

In [44]:
# Preview the first five rows of the users table
users_data.head(n=5)

Unnamed: 0,user_id,gender,age,occupation,zipcode,age_group
0,1,F,1,K-12 student,48067,Under 18
1,2,M,56,self-employed,70072,56+
2,3,M,25,scientist,55117,25-34
3,4,M,45,executive/managerial,2460,45-49
4,5,M,25,writer,55455,25-34


In [47]:
# Summarize the users table: column names, non-null counts and dtypes
users_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     6040 non-null   int64 
 1   gender      6040 non-null   object
 2   age         6040 non-null   int64 
 3   occupation  6040 non-null   object
 4   zipcode     6040 non-null   object
 5   age_group   6040 non-null   object
dtypes: int64(2), object(4)
memory usage: 283.2+ KB
