In [1]:
import numpy as np
import pandas as pd
import chardet as ct

In [2]:
# needed as encodings of different files can be different and can cause problems
with open('ml-1m/movies.dat','rb') as rawdata:
    result=ct.detect(rawdata.read())
    print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [3]:
movies=pd.read_csv('ml-1m/movies.dat',sep='::',engine='python',header=None,names=['Index','Movie_name','Genre'],index_col='Index',encoding=result['encoding'])

In [4]:
movies.nunique()

Movie_name    3883
Genre          301
dtype: int64

In [5]:
movies.tail()

Unnamed: 0_level_0,Movie_name,Genre
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
3948,Meet the Parents (2000),Comedy
3949,Requiem for a Dream (2000),Drama
3950,Tigerland (2000),Drama
3951,Two Family House (2000),Drama
3952,"Contender, The (2000)",Drama|Thriller


In [6]:
movies.head()

Unnamed: 0_level_0,Movie_name,Genre
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


In [7]:
movies['Year']=movies['Movie_name'].str.extract(r'([0-9]{4})')
# Why the extra parenthesis after: "[0-9]{4}"? as extract requires a capture grp
# This capture grp is identified inside the parenthesis only, else it doesn't understand
# The regex syntax is right in itself, its the extract method that causes issues

movies['Movie_name']=movies['Movie_name'].str.replace(r'\((\d{4})\)','',regex=True)
# Notice I used different regex but both are basically the same
# Also notice, replace doesn't need a capture grp like extract(no paranthesis to end the regex)

In [8]:
movies.sample(10)

Unnamed: 0_level_0,Movie_name,Genre,Year
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1039,Synthetic Pleasures,Documentary,1995
795,Somebody to Love,Drama,1994
2231,Rounders,Crime|Drama,1998
3699,Starman,Adventure|Drama|Romance|Sci-Fi,1984
1862,Species II,Horror|Sci-Fi,1998
1476,Private Parts,Comedy|Drama,1997
1921,Pi,Sci-Fi|Thriller,1998
498,Mr. Jones,Drama|Romance,1993
3417,"Crimson Pirate, The",Adventure|Comedy|Sci-Fi,1952
3644,Dark Command,Western,1940


In [9]:
# Lets turn "Hobbit, The" type names to "The Hobbit" type
movies['Movie_name']=movies['Movie_name'].str.replace(r'^(.*), (The|An|A)',r'\2 \1',regex=True)
# Here \2 and \1 represents the 2 capture group from the earlier regex
# These are called backreferences, where \2 represents 2nd capture grp

In [10]:
movies.sample(10)

Unnamed: 0_level_0,Movie_name,Genre,Year
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3904,An Uninvited Guest,Drama,2000
102,Mr. Wrong,Comedy,1996
2015,The Absent Minded Professor,Children's|Comedy|Fantasy,1961
3129,Sweet and Lowdown,Comedy|Drama,1999
841,Eyes Without a Face,Horror,1959
747,The Stupids,Comedy,1996
928,Rebecca,Romance|Thriller,1940
278,Miami Rhapsody,Comedy,1995
1344,Cape Fear,Film-Noir|Thriller,1962
3234,Train Ride to Hollywood,Comedy,1978


In [11]:
movies_new=movies.copy()

In [12]:
# Now for seperating genres in a new dataframe
movies_new['Genre']=movies_new['Genre'].str.split('|')

# Explode needs a list of string that's why we split here

In [13]:
movies_new.sample(10)

Unnamed: 0_level_0,Movie_name,Genre,Year
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2349,Mona Lisa,"[Comedy, Thriller]",1986
3180,Play it to the Bone,"[Comedy, Drama]",1999
3897,Almost Famous,"[Comedy, Drama]",2000
514,The Ref,[Comedy],1994
1926,The Broadway Melody,[Musical],1929
518,The Road to Wellville,[Comedy],1994
2227,The Lodger,[Thriller],1926
3857,Bless the Child,[Thriller],2000
1864,Sour Grapes,[Comedy],1998
3271,Of Mice and Men,[Drama],1992


In [14]:
exploded=movies_new.explode('Genre')
"""
Use of explode:
It takes a single row with a list-like value (e.g., a list of genres) and 
creates a new row for each item in that list. 
The data in the other columns is duplicated.
"""

'\nUse of explode:\nIt takes a single row with a list-like value (e.g., a list of genres) and \ncreates a new row for each item in that list. \nThe data in the other columns is duplicated.\n'

In [15]:
exploded.head(10)

Unnamed: 0_level_0,Movie_name,Genre,Year
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story,Animation,1995
1,Toy Story,Children's,1995
1,Toy Story,Comedy,1995
2,Jumanji,Adventure,1995
2,Jumanji,Children's,1995
2,Jumanji,Fantasy,1995
3,Grumpier Old Men,Comedy,1995
3,Grumpier Old Men,Romance,1995
4,Waiting to Exhale,Comedy,1995
4,Waiting to Exhale,Drama,1995


In [24]:
with open('ml-1m/ratings.dat','rb') as rat:
    result=ct.detect(rat.read())
    # print(result)

head=['UserID','Movie_id','Ratings','Timestamp']
ratings=pd.read_csv('ml-1m/ratings.dat',sep='::',names=head,index_col='Movie_id',encoding=result['encoding'])

  ratings=pd.read_csv('ml-1m/ratings.dat',sep='::',names=head,index_col='Movie_id',encoding=result['encoding'])


In [25]:
ratings.head()

Unnamed: 0_level_0,UserID,Ratings,Timestamp
Movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1193,1,5,978300760
661,1,3,978302109
914,1,3,978301968
3408,1,4,978300275
2355,1,5,978824291


In [26]:
with open('ml-1m/users.dat','rb') as user:
    result=ct.detect(user.read())
    # print(result)
# UserID::Gender::Age::Occupation::Zip-code
head=['UserID','Gender','Age','Occupation','Zip-code']
users=pd.read_csv('ml-1m/users.dat',sep='::',names=head,index_col='UserID',encoding=result['encoding'])

  users=pd.read_csv('ml-1m/users.dat',sep='::',names=head,index_col='UserID',encoding=result['encoding'])


In [27]:
users.head()

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455
