In [114]:
# Libraries for data file processing 
import os

# Essential librarires
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Data Preprocessing

**Summary:**
These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 
made by 6,040 MovieLens users who joined MovieLens in 2000.

In [115]:
# DAT file declaration - Ratings data 
rating_dat_file = 'ratings.dat'

# Read the Ratings File
ratings_data = pd.read_csv(os.path.join("dat_files", rating_dat_file), sep='::', engine='python', encoding='latin-1',
                    names=['user_id', 'movie_id', 'rating', 'timestamp'])

print("Loading.. ", len(ratings_data))

Loading..  1000209


In [116]:
# DAT file declaration - Movies data 
movies_dat_file = 'movies.dat'

# Read the Movies File
movies_data = pd.read_csv(os.path.join("dat_files", movies_dat_file), sep='::', engine='python', encoding='latin-1',
                    names=["movie_id", "title", "genres"])

print("Loading.. ", len(movies_data))

Loading..  3883


In [117]:
# DAT file declaration - Users data 
users_dat_file = 'users.dat'

# Read the Users File
users_data = pd.read_csv(os.path.join("dat_files", users_dat_file), sep='::', engine='python', encoding='latin-1',
                    names=["user_id", "gender", "age", "occupation", "zipcode"])

print("Loading.. ", len(users_data))
print()
# Displaying the dataset
print("Current Users dataframe that is numerically encoded: ")
users_data.head()

Loading..  6040

Current Users dataframe that is numerically encoded: 


Unnamed: 0,user_id,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


It can be seen from the above users dataframe that the occupation feature is numericaly encoded and the age is categorized as mentioned below:

- Age is chosen from the following ranges:

	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"
- Occupation is chosen from the following choices:

	*  0:  "other" or not specified
	*  1:  "academic/educator"
	*  2:  "artist"
	*  3:  "clerical/admin"
	*  4:  "college/grad student"
	*  5:  "customer service"
	*  6:  "doctor/health care"
	*  7:  "executive/managerial"
	*  8:  "farmer"
	*  9:  "homemaker"
	* 10:  "K-12 student"
	* 11:  "lawyer"
	* 12:  "programmer"
	* 13:  "retired"
	* 14:  "sales/marketing"
	* 15:  "scientist"
	* 16:  "self-employed"
	* 17:  "technician/engineer"
	* 18:  "tradesman/craftsman"
	* 19:  "unemployed"
	* 20:  "writer"

In [118]:
# Categorizing the age and occupation columns based on the information provided above, dictinary data structure is used 
# for key value pairs
age_category = { 1: "Under 18", 
                18: "18-24", 
                25: "25-34", 
                35: "35-44", 
                45: "45-49", 
                50: "50-55", 
                56: "56+"}

occupation_category = {0: "other or not specified",
                       1: "academic/educator",
                       2: "artist", 
                       3: "clerical/admin",
                       4: "college/grad student", 
                       5: "customer service", 
                       6: "doctor/health care",
                       7: "executive/managerial", 
                       8: "farmer", 
                       9: "homemaker", 
                       10: "K-12 student", 
                       11: "lawyer",
                       12: "programmer", 
                       13: "retired", 
                       14: "sales/marketing", 
                       15: "scientist", 
                       16: "self-employed",
                       17: "technician/engineer", 
                       18: "tradesman/craftsman", 
                       19: "unemployed", 
                       20: "writer"}

# Creating new cloumns with updated values/category info using lambda function on each datapoint for both features
users_data['age_group'] = users_data['age'].apply(lambda x: age_category[x])
users_data['occupation'] = users_data['occupation'].apply(lambda x: occupation_category[x])

# Resultant dataframe 
print("Processed dataframe: ")
users_data.head()

Processed dataframe: 


Unnamed: 0,user_id,gender,age,occupation,zipcode,age_group
0,1,F,1,K-12 student,48067,Under 18
1,2,M,56,self-employed,70072,56+
2,3,M,25,scientist,55117,25-34
3,4,M,45,executive/managerial,2460,45-49
4,5,M,25,writer,55455,25-34


#### Saving data into respective csv files

In [119]:
# Creating a csv file to save rating.csv
ratings_data.to_csv("ratings_data.csv", sep='\t', header=True, encoding='latin-1', 
               columns=['user_id', 'movie_id', 'rating', 'timestamp'])

print("rating_data.csv saved in directory..")

rating_data.csv saved in directory


In [120]:
# Creating a csv file to save movies.csv
movies_data.to_csv("movies_data.csv", sep='\t', header=True, encoding='latin-1', 
               columns=["movie_id", "title", "genres"])

print("movies_data.csv saved in directory..")

movies_data.csv saved in directory


In [121]:
# Creating a csv file to save users.csv
users_data.to_csv("users_data.csv", sep='\t', header=True, encoding='latin-1', 
               columns=["user_id", "gender", "age", "occupation", "zipcode"])

print("users_data.csv saved in directory..")

users_data.csv saved in directory
