In [31]:
import numpy as np
import pandas as pd
import chardet as ct

In [32]:
# needed as encodings of different files can be different and can cause problems
with open('ml-1m/movies.dat','rb') as rawdata:
    result=ct.detect(rawdata.read())
    print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [33]:
movies=pd.read_csv('ml-1m/movies.dat',sep='::',engine='python',header=None,names=['Index','Movie_name','Genre'],index_col='Index',encoding=result['encoding'])

In [34]:
movies.nunique()

Movie_name    3883
Genre          301
dtype: int64

In [35]:
movies.tail()

Unnamed: 0_level_0,Movie_name,Genre
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
3948,Meet the Parents (2000),Comedy
3949,Requiem for a Dream (2000),Drama
3950,Tigerland (2000),Drama
3951,Two Family House (2000),Drama
3952,"Contender, The (2000)",Drama|Thriller


In [36]:
movies.head()

Unnamed: 0_level_0,Movie_name,Genre
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


In [37]:
movies['Year']=movies['Movie_name'].str.extract(r'([0-9]{4})')
# Why the extra parenthesis after: "[0-9]{4}"? as extract requires a capture grp
# This capture grp is identified inside the parenthesis only, else it doesn't understand
# The regex syntax is right in itself, its the extract method that causes issues

movies['Movie_name']=movies['Movie_name'].str.replace(r'\((\d{4})\)','',regex=True)
# Notice I used different regex but both are basically the same
# Also notice, replace doesn't need a capture grp like extract(no paranthesis to end the regex)

In [38]:
movies.sample(10)

Unnamed: 0_level_0,Movie_name,Genre,Year
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
940,"Adventures of Robin Hood, The",Action|Adventure,1938
1541,Addicted to Love,Comedy|Romance,1997
2904,Rain,Drama,1932
2187,Stage Fright,Mystery|Thriller,1950
2,Jumanji,Adventure|Children's|Fantasy,1995
350,"Client, The",Drama|Mystery|Thriller,1994
2397,Mass Appeal,Drama,1984
1121,Glory Daze,Drama,1996
3644,Dark Command,Western,1940
1713,Mouse Hunt,Children's|Comedy,1997


In [39]:
# Lets turn "Hobbit, The" type names to "The Hobbit" type
movies['Movie_name']=movies['Movie_name'].str.replace(r'^(.*), (The|An|A)',r'\2 \1',regex=True)
# Here \2 and \1 represents the 2 capture group from the earlier regex
# These are called backreferences, where \2 represents 2nd capture grp

In [None]:
movies.index.name="Movie_ID" # Renamed from index to Movie_ID
movies.sample(10)

Unnamed: 0_level_0,Movie_name,Genre,Year
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1906,Mr. Jealousy,Comedy|Romance,1997
3014,Bustin' Loose,Comedy,1981
2918,Ferris Bueller's Day Off,Comedy,1986
187,Party Girl,Comedy,1995
3109,The River,Drama,1984
2024,The Rapture,Drama|Mystery,1991
1852,Love Walked In,Drama|Thriller,1998
2118,The Dead Zone,Horror|Thriller,1983
618,Two Much,Comedy|Romance,1996
820,"Death in the Garden (Mort en ce jardin, La)",Drama,1956


In [41]:
movies_new=movies.copy()

In [42]:
# Now for seperating genres in a new dataframe
movies_new['Genre']=movies_new['Genre'].str.split('|')

# Explode needs a list of string that's why we split here

In [43]:
movies_new.sample(10)

Unnamed: 0_level_0,Movie_name,Genre,Year
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2239,Swept Away (Travolti da un insolito destino ne...,"[Comedy, Drama]",1975
29,The City of Lost Children,"[Adventure, Sci-Fi]",1995
3829,Mad About Mambo,"[Comedy, Romance]",2000
2195,Dirty Work,[Comedy],1998
166,The Doom Generation,"[Comedy, Drama]",1995
1811,"Niagara, Niagara",[Drama],1997
2871,Deliverance,"[Adventure, Thriller]",1972
2093,Return to Oz,"[Adventure, Children's, Fantasy, Sci-Fi]",1985
3209,The Loves of Carmen,[Drama],1948
2917,Body Heat,"[Crime, Thriller]",1981


In [44]:
exploded=movies_new.explode('Genre')
"""
Use of explode:
It takes a single row with a list-like value (e.g., a list of genres) and 
creates a new row for each item in that list. 
The data in the other columns is duplicated.
"""

'\nUse of explode:\nIt takes a single row with a list-like value (e.g., a list of genres) and \ncreates a new row for each item in that list. \nThe data in the other columns is duplicated.\n'

In [45]:
exploded.head(10)

Unnamed: 0_level_0,Movie_name,Genre,Year
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story,Animation,1995
1,Toy Story,Children's,1995
1,Toy Story,Comedy,1995
2,Jumanji,Adventure,1995
2,Jumanji,Children's,1995
2,Jumanji,Fantasy,1995
3,Grumpier Old Men,Comedy,1995
3,Grumpier Old Men,Romance,1995
4,Waiting to Exhale,Comedy,1995
4,Waiting to Exhale,Drama,1995


In [46]:
with open('ml-1m/ratings.dat','rb') as rat:
    result=ct.detect(rat.read())
    # print(result)

head=['UserID','Movie_id','Ratings','Timestamp']
ratings=pd.read_csv('ml-1m/ratings.dat',sep='::',names=head,index_col='Movie_id',encoding=result['encoding'])

  ratings=pd.read_csv('ml-1m/ratings.dat',sep='::',names=head,index_col='Movie_id',encoding=result['encoding'])


In [47]:
ratings.head()

Unnamed: 0_level_0,UserID,Ratings,Timestamp
Movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1193,1,5,978300760
661,1,3,978302109
914,1,3,978301968
3408,1,4,978300275
2355,1,5,978824291


In [48]:
# Currently the timestamp is in Unix timestamp also known as the epoch timestamp
ratings['Timestamp']=pd.to_datetime(ratings['Timestamp'],unit='s')
# 's' specifies seconds

In [56]:
ratings.index.name="Movie_ID"
ratings.head()
# Notice now timestamp has both date and time, lets seperate them out

Unnamed: 0_level_0,UserID,Ratings,Time,Date
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1193,1,5,22:12:40,2000-12-31
661,1,3,22:35:09,2000-12-31
914,1,3,22:32:48,2000-12-31
3408,1,4,22:04:35,2000-12-31
2355,1,5,23:38:11,2001-01-06


In [None]:
# ratings['Date']=ratings['Timestamp'].str.extract(r'(\d{4}-\d{2}-\d{2})')
"""Can't do the above as a datetime object can't be converted to string type"""

ratings['Date']=ratings['Timestamp'].dt.date
ratings['Timestamp']=ratings['Timestamp'].dt.time
ratings.rename(columns={'Timestamp':'Time'},inplace=True)
ratings.head()

Unnamed: 0_level_0,UserID,Ratings,Time,Date
Movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1193,1,5,22:12:40,2000-12-31
661,1,3,22:35:09,2000-12-31
914,1,3,22:32:48,2000-12-31
3408,1,4,22:04:35,2000-12-31
2355,1,5,23:38:11,2001-01-06


In [65]:
movies.index.max()

np.int64(3952)

In [63]:
ratings.index.max()

3952

#### both have the same number of entries BUT the index datatype is different

In [None]:
ratings['UserID'].unique()

6040

In [51]:
with open('ml-1m/users.dat','rb') as user:
    result=ct.detect(user.read())
    # print(result)
# UserID::Gender::Age::Occupation::Zip-code
head=['UserID','Gender','Age','Occupation','Zip-code']
users=pd.read_csv('ml-1m/users.dat',sep='::',names=head,index_col='UserID',encoding=result['encoding'])

  users=pd.read_csv('ml-1m/users.dat',sep='::',names=head,index_col='UserID',encoding=result['encoding'])


In [52]:
users.head()

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455
