In [1]:
import pandas as pd
import numpy as np
import random
from tqdm.notebook import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
cd "/content/drive/MyDrive/Fall'22/17645 B - MLP/Assignment 4"

/content/drive/MyDrive/Fall'22/17645 B - MLP/Assignment 4


# Load User Data

In [4]:
user_data = pd.read_csv("./data/cleaned_user_data.csv",header=None,names=["User_ID","Age","Occupation","Gender"],skiprows=1)
user_data.head()

Unnamed: 0,User_ID,Age,Occupation,Gender
0,2,33,college/grad student,M
1,3,29,scientist,M
2,4,30,other or not specified,M
3,5,26,scientist,M
4,6,27,college/grad student,F


In [5]:
user_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999999 entries, 0 to 999998
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   User_ID     999999 non-null  int64 
 1   Age         999999 non-null  int64 
 2   Occupation  999999 non-null  object
 3   Gender      999999 non-null  object
dtypes: int64(2), object(2)
memory usage: 38.1+ MB


In [6]:
user_data.Occupation.value_counts()

college/grad student      241685
executive/managerial      193153
sales/marketing           132826
scientist                  94787
other or not specified     66787
self-employed              66133
academic/educator          38205
K-12 student               35844
homemaker                  27847
artist                     21633
retired                    17177
clerical/admin             16916
technician/engineer        14037
programmer                 10652
tradesman/craftsman         5962
writer                      4192
lawyer                      3657
doctor/health care          3432
customer service            2770
unemployed                  1859
farmer                       445
Name: Occupation, dtype: int64

In [7]:
user_data.Age.value_counts()

26    84253
32    83755
28    83637
29    83597
31    83530
      ...  
65      514
76      509
73      504
88      496
57      477
Name: Age, Length: 83, dtype: int64

In [8]:
user_data.Gender.value_counts()

M    829959
F    170040
Name: Gender, dtype: int64

# Load Movie Data

In [9]:
movieinfo_data = pd.read_csv("./data/flattened_movie_data_from_functions.csv",index_col=0)
movieinfo_data.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,id,original_title,budget,genres,original_language,overview,popularity,production_companies,production_countries,release_date,...,production_countries_15,production_countries_16,production_countries_17,production_countries_18,production_countries_19,production_countries_20,production_countries_21,production_countries_22,production_countries_23,production_countries_24
0,next+friday+2000,Next Friday,11000000,"[{'id': 35, 'name': 'Comedy'}]",en,"Ice Cube returns as Craig Jones, a streetwise ...",10.06545,"[{'name': 'New Line Cinema', 'id': 12}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-01-12,...,,,,,,,,,,
1,the+goonies+1985,The Goonies,19000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 35, '...",en,A young teenager named Mikey Walsh finds an ol...,14.280703,"[{'name': 'Amblin Entertainment', 'id': 56}, {...","[{'iso_3166_1': 'US', 'name': 'United States o...",1985-06-06,...,,,,,,,,,,
2,inception+2010,Inception,160000000,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",en,"Cobb, a skilled thief who commits corporate es...",29.108149,"[{'name': 'Legendary Pictures', 'id': 923}, {'...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2010-07-14,...,,,,,,,,,,
3,true+lies+1994,True Lies,115000000,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",en,Harry Tasker is a secret agent for the United ...,11.396099,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1994-07-14,...,,,,,,,,,,
4,the+rock+1996,The Rock,75000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",en,A group of renegade marine commandos seizes a ...,13.249824,"[{'name': 'Hollywood Pictures', 'id': 915}, {'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1996-06-06,...,,,,,,,,,,


In [10]:
movieinfo_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26667 entries, 0 to 26666
Columns: 147 entries, id to production_countries_24
dtypes: float64(2), int64(4), object(141)
memory usage: 30.1+ MB


In [11]:
movieinfo_data.genre_0.unique()

array(["'Comedy'", "'Adventure'", "'Action'", "'Drama'", "'Science",
       "'Animation'", "'Music'", "'Crime'", "'Romance'", "'Thriller'",
       "'Mystery'", "'History'", "'War'", "'Fantasy'", "'Family'",
       "'Documentary'", "'Horror'", "'Western'", nan, "'TV", "'Foreign'"],
      dtype=object)

In [12]:
genre_cols = ['genre_0']
movieinfo_genre_data = movieinfo_data[["id"] + ["original_title"]+ genre_cols]  
movieinfo_genre_data['genre_0'] = movieinfo_genre_data['genre_0'].str.replace("'",'')
movieinfo_genre_data = movieinfo_genre_data.dropna()
movieinfo_genre_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,original_title,genre_0
0,next+friday+2000,Next Friday,Comedy
1,the+goonies+1985,The Goonies,Adventure
2,inception+2010,Inception,Action
3,true+lies+1994,True Lies,Action
4,the+rock+1996,The Rock,Action


In [13]:
movieinfo_genre_data.genre_0.unique()

array(['Comedy', 'Adventure', 'Action', 'Drama', 'Science', 'Animation',
       'Music', 'Crime', 'Romance', 'Thriller', 'Mystery', 'History',
       'War', 'Fantasy', 'Family', 'Documentary', 'Horror', 'Western',
       'TV', 'Foreign'], dtype=object)

In [14]:
movieinfo_genre_data.genre_0.value_counts()

Drama          7597
Comedy         5398
Action         2897
Documentary    1850
Horror         1519
Crime          1088
Adventure      1013
Thriller        805
Romance         626
Animation       568
Fantasy         457
Science         350
Mystery         298
Music           271
Western         242
War             238
Family          227
History         160
TV              138
Foreign          37
Name: genre_0, dtype: int64

# Load Ratings Data

In [15]:
ratings_data =pd.read_csv("./data/cleaned_rating_data.csv",header=None,
                          names=["DateTime","User_ID","Movie_ID","Rating"],skiprows=1)

ratings_data.head()

Unnamed: 0,DateTime,User_ID,Movie_ID,Rating
0,2022-10-28T03:25:44,256889,the+cave+of+the+golden+rose+1991,4
1,2022-10-28T03:25:44,532887,conception+2011,4
2,2022-10-28T03:25:44,570513,the+godfather+1972,4
3,2022-10-28T03:25:45,85424,chinatown+1974,5
4,2022-10-28T03:25:45,621913,12+angry+men+1957,4


In [16]:
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1999819 entries, 0 to 1999818
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   DateTime  object
 1   User_ID   int64 
 2   Movie_ID  object
 3   Rating    int64 
dtypes: int64(2), object(2)
memory usage: 76.3+ MB


In [17]:
ratings_data['DateTime'] = pd.to_datetime(ratings_data['DateTime'],errors='coerce')
dateNaNs = ratings_data[ratings_data['DateTime'].isna()].index 
ratings_data = ratings_data.drop(index=dateNaNs).set_index(keys = "DateTime")
ratings_data.info()



<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1999732 entries, 2022-10-28 03:25:44 to 2022-11-02 02:07:09
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   User_ID   int64 
 1   Movie_ID  object
 2   Rating    int64 
dtypes: int64(2), object(1)
memory usage: 61.0+ MB


In [18]:
ratings_data.User_ID.nunique()

649268

In [19]:
ratings_data.Movie_ID.nunique()

26708

# Join Ratings Data with User and Movie Genre Info

In [20]:
combined_data = pd.merge(ratings_data,user_data)
combined_data.head()

Unnamed: 0,User_ID,Movie_ID,Rating,Age,Occupation,Gender
0,256889,the+cave+of+the+golden+rose+1991,4,33,college/grad student,F
1,256889,sister+my+sister+1994,4,33,college/grad student,F
2,256889,cat+on+a+hot+tin+roof+1958,4,33,college/grad student,F
3,256889,better+off+dead...+1985,4,33,college/grad student,F
4,256889,into+the+woods+1991,3,33,college/grad student,F


In [21]:
combined_data.User_ID.nunique()

649261

In [22]:
combined_data = pd.merge(combined_data,movieinfo_genre_data,left_on="Movie_ID",right_on="id")
combined_data = combined_data.drop(columns=["id","original_title"]).rename(columns={"genre_0":"Genre"})
combined_data = pd.concat([combined_data.drop(columns="Rating"),combined_data["Rating"]],axis=1)
combined_data.head()

Unnamed: 0,User_ID,Movie_ID,Age,Occupation,Gender,Genre,Rating
0,256889,the+cave+of+the+golden+rose+1991,33,college/grad student,F,Adventure,4
1,704061,the+cave+of+the+golden+rose+1991,29,sales/marketing,M,Adventure,3
2,743279,the+cave+of+the+golden+rose+1991,33,executive/managerial,M,Adventure,4
3,657376,the+cave+of+the+golden+rose+1991,32,college/grad student,M,Adventure,4
4,504412,the+cave+of+the+golden+rose+1991,23,college/grad student,M,Adventure,5


In [23]:
combined_data.Movie_ID.nunique()

25506

In [24]:
combined_data.User_ID.nunique()

646971

In [25]:
combined_data.Gender.value_counts()

M    1634478
F     335529
Name: Gender, dtype: int64

In [26]:
combined_data.Occupation.value_counts()

college/grad student      476261
executive/managerial      377211
sales/marketing           262251
scientist                 186867
other or not specified    132238
self-employed             132208
academic/educator          75054
K-12 student               70444
homemaker                  55618
artist                     42466
clerical/admin             33638
retired                    33608
technician/engineer        27823
programmer                 21668
tradesman/craftsman        11382
writer                      8155
lawyer                      6885
doctor/health care          6313
customer service            5436
unemployed                  3721
farmer                       760
Name: Occupation, dtype: int64

In [27]:
combined_data.Genre.value_counts()

Drama          738860
Comedy         272916
Adventure      189568
Documentary    148115
Action         147106
Crime          126304
Animation       78939
Thriller        50377
Mystery         44800
Fantasy         42966
Horror          40694
Science         18535
Western         17866
Romance         15991
Family          13935
War              7535
History          6438
Music            5382
TV               3149
Foreign           531
Name: Genre, dtype: int64

In [28]:
combined_data.Rating.value_counts()

4    1048094
5     475974
3     408921
2      35732
1       1286
Name: Rating, dtype: int64


# Clean and Prepare data for the ML Model

In [29]:
combined_data["User_Movie"] = combined_data["User_ID"].astype(str) + "-" + combined_data["Movie_ID"]
combined_data.head()

Unnamed: 0,User_ID,Movie_ID,Age,Occupation,Gender,Genre,Rating,User_Movie
0,256889,the+cave+of+the+golden+rose+1991,33,college/grad student,F,Adventure,4,256889-the+cave+of+the+golden+rose+1991
1,704061,the+cave+of+the+golden+rose+1991,29,sales/marketing,M,Adventure,3,704061-the+cave+of+the+golden+rose+1991
2,743279,the+cave+of+the+golden+rose+1991,33,executive/managerial,M,Adventure,4,743279-the+cave+of+the+golden+rose+1991
3,657376,the+cave+of+the+golden+rose+1991,32,college/grad student,M,Adventure,4,657376-the+cave+of+the+golden+rose+1991
4,504412,the+cave+of+the+golden+rose+1991,23,college/grad student,M,Adventure,5,504412-the+cave+of+the+golden+rose+1991


In [30]:
combined_data_cleaned = combined_data.drop(columns=["User_ID","Movie_ID"]).set_index(keys="User_Movie")
combined_data_cleaned.head()

Unnamed: 0_level_0,Age,Occupation,Gender,Genre,Rating
User_Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
256889-the+cave+of+the+golden+rose+1991,33,college/grad student,F,Adventure,4
704061-the+cave+of+the+golden+rose+1991,29,sales/marketing,M,Adventure,3
743279-the+cave+of+the+golden+rose+1991,33,executive/managerial,M,Adventure,4
657376-the+cave+of+the+golden+rose+1991,32,college/grad student,M,Adventure,4
504412-the+cave+of+the+golden+rose+1991,23,college/grad student,M,Adventure,5


In [31]:
combined_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1970007 entries, 256889-the+cave+of+the+golden+rose+1991 to 434863-death+ship+1980
Data columns (total 5 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   Age         int64 
 1   Occupation  object
 2   Gender      object
 3   Genre       object
 4   Rating      int64 
dtypes: int64(2), object(3)
memory usage: 90.2+ MB


## Checking Duplicates

In [32]:
combined_data_cleaned[combined_data_cleaned.duplicated()].sort_values(by=combined_data_cleaned.columns.to_list())

Unnamed: 0_level_0,Age,Occupation,Gender,Genre,Rating
User_Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
416576-the+aggression+scale+2012,8,K-12 student,F,Action,2
788446-the+twilight+samurai+2002,8,K-12 student,F,Action,3
184347-men+in+black+ii+2002,8,K-12 student,F,Action,3
6258-dead+end+drive-in+1986,8,K-12 student,F,Action,3
437627-repo+men+2010,8,K-12 student,F,Action,3
...,...,...,...,...,...
211898-schindlers+list+1993,90,writer,M,Drama,5
76942-pather+panchali+1955,90,writer,M,Drama,5
661956-sanjuro+1962,90,writer,M,Drama,5
210499-grand+illusion+1937,90,writer,M,Drama,5


In [33]:
combined_data_cleaned.duplicated()

User_Movie
256889-the+cave+of+the+golden+rose+1991          False
704061-the+cave+of+the+golden+rose+1991          False
743279-the+cave+of+the+golden+rose+1991          False
657376-the+cave+of+the+golden+rose+1991          False
504412-the+cave+of+the+golden+rose+1991          False
                                                 ...  
192074-missing+in+action+2+the+beginning+1985     True
505222-it+runs+in+the+family+2003                 True
834805-the+pirate+movie+1982                      True
405895-stag+1997                                  True
434863-death+ship+1980                            True
Length: 1970007, dtype: bool

In [34]:
combined_data_cleaned.drop_duplicates()

Unnamed: 0_level_0,Age,Occupation,Gender,Genre,Rating
User_Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
256889-the+cave+of+the+golden+rose+1991,33,college/grad student,F,Adventure,4
704061-the+cave+of+the+golden+rose+1991,29,sales/marketing,M,Adventure,3
743279-the+cave+of+the+golden+rose+1991,33,executive/managerial,M,Adventure,4
657376-the+cave+of+the+golden+rose+1991,32,college/grad student,M,Adventure,4
504412-the+cave+of+the+golden+rose+1991,23,college/grad student,M,Adventure,5
...,...,...,...,...,...
745489-honey+i+blew+up+the+kid+1992,53,scientist,M,Adventure,1
699213-deadly+friend+1986,38,sales/marketing,F,Horror,3
527143-torque+2004,39,writer,M,Action,1
826252-losin+it+1983,30,self-employed,M,Comedy,1


## Assigning Ratings of 4 and above as High i.e. 1 and below 4 as Low or 0

In [35]:
combined_data_cleaned["Rating_Level"]=  np.where(combined_data_cleaned["Rating"] >= 4,1,0)
combined_data_cleaned.head()

Unnamed: 0_level_0,Age,Occupation,Gender,Genre,Rating,Rating_Level
User_Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
256889-the+cave+of+the+golden+rose+1991,33,college/grad student,F,Adventure,4,1
704061-the+cave+of+the+golden+rose+1991,29,sales/marketing,M,Adventure,3,0
743279-the+cave+of+the+golden+rose+1991,33,executive/managerial,M,Adventure,4,1
657376-the+cave+of+the+golden+rose+1991,32,college/grad student,M,Adventure,4,1
504412-the+cave+of+the+golden+rose+1991,23,college/grad student,M,Adventure,5,1


In [36]:
combined_data_cleaned.Rating.value_counts()

4    1048094
5     475974
3     408921
2      35732
1       1286
Name: Rating, dtype: int64

In [37]:
combined_data_cleaned.Rating_Level.value_counts()

1    1524068
0     445939
Name: Rating_Level, dtype: int64

## Dropping Duplicates

In [38]:
combined_data_cleaned_new = combined_data_cleaned.drop(columns="Rating").drop_duplicates()
combined_data_cleaned_new.head()

Unnamed: 0_level_0,Age,Occupation,Gender,Genre,Rating_Level
User_Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
256889-the+cave+of+the+golden+rose+1991,33,college/grad student,F,Adventure,1
704061-the+cave+of+the+golden+rose+1991,29,sales/marketing,M,Adventure,0
743279-the+cave+of+the+golden+rose+1991,33,executive/managerial,M,Adventure,1
657376-the+cave+of+the+golden+rose+1991,32,college/grad student,M,Adventure,1
504412-the+cave+of+the+golden+rose+1991,23,college/grad student,M,Adventure,1


In [39]:
combined_data_cleaned_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42074 entries, 256889-the+cave+of+the+golden+rose+1991 to 648565-hercules+in+new+york+1969
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Age           42074 non-null  int64 
 1   Occupation    42074 non-null  object
 2   Gender        42074 non-null  object
 3   Genre         42074 non-null  object
 4   Rating_Level  42074 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.9+ MB


In [40]:
combined_data_cleaned_new.Occupation.value_counts()

executive/managerial      3797
retired                   3448
academic/educator         3446
other or not specified    3038
sales/marketing           2894
self-employed             2716
scientist                 2664
college/grad student      2595
homemaker                 2430
artist                    2328
clerical/admin            2077
K-12 student              1953
writer                    1526
doctor/health care        1284
technician/engineer       1224
programmer                1017
lawyer                    1000
customer service           859
tradesman/craftsman        798
unemployed                 748
farmer                     232
Name: Occupation, dtype: int64

## Dropping Unwanted Values

In [41]:
combined_data_cleaned_new = combined_data_cleaned_new[combined_data_cleaned_new.Occupation != "other or not specified"]
len(combined_data_cleaned_new)

39036

In [42]:
combined_data_cleaned_new.Occupation.unique()

array(['college/grad student', 'sales/marketing', 'executive/managerial',
       'scientist', 'K-12 student', 'self-employed', 'clerical/admin',
       'retired', 'customer service', 'academic/educator',
       'technician/engineer', 'artist', 'homemaker', 'doctor/health care',
       'unemployed', 'tradesman/craftsman', 'lawyer', 'programmer',
       'writer', 'farmer'], dtype=object)

In [43]:
combined_data_cleaned_new.Gender.value_counts()

M    22530
F    16506
Name: Gender, dtype: int64

# Making Columns into Categorical for Classifier

In [44]:
combined_data_cleaned_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39036 entries, 256889-the+cave+of+the+golden+rose+1991 to 699213-deadly+friend+1986
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Age           39036 non-null  int64 
 1   Occupation    39036 non-null  object
 2   Gender        39036 non-null  object
 3   Genre         39036 non-null  object
 4   Rating_Level  39036 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.8+ MB


In [45]:
combined_data_cleaned_new_new = combined_data_cleaned_new.copy()
combined_data_cleaned_new_new["Occupation"] = combined_data_cleaned_new["Occupation"].astype("category")
combined_data_cleaned_new_new["Gender"] = combined_data_cleaned_new["Gender"].astype("category")
combined_data_cleaned_new_new["Genre"] = combined_data_cleaned_new["Genre"].astype("category")
combined_data_cleaned_new_new

Unnamed: 0_level_0,Age,Occupation,Gender,Genre,Rating_Level
User_Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
256889-the+cave+of+the+golden+rose+1991,33,college/grad student,F,Adventure,1
704061-the+cave+of+the+golden+rose+1991,29,sales/marketing,M,Adventure,0
743279-the+cave+of+the+golden+rose+1991,33,executive/managerial,M,Adventure,1
657376-the+cave+of+the+golden+rose+1991,32,college/grad student,M,Adventure,1
504412-the+cave+of+the+golden+rose+1991,23,college/grad student,M,Adventure,1
...,...,...,...,...,...
213441-rec+genesis+2012,82,academic/educator,M,Horror,0
86844-pharaohs++army+1995,68,retired,F,Western,0
742558-the+crocodile+hunter+collision+course+2002,43,scientist,M,Family,0
270882-lawnmower+man+2+beyond+cyberspace+1996,42,customer service,M,Action,0


In [46]:
combined_data_cleaned_new_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39036 entries, 256889-the+cave+of+the+golden+rose+1991 to 699213-deadly+friend+1986
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Age           39036 non-null  int64   
 1   Occupation    39036 non-null  category
 2   Gender        39036 non-null  category
 3   Genre         39036 non-null  category
 4   Rating_Level  39036 non-null  int64   
dtypes: category(3), int64(2)
memory usage: 1.0+ MB


In [47]:
combined_data_cleaned_new = combined_data_cleaned_new_new.copy()

# Train-Test Split

In [48]:
X, y = combined_data_cleaned_new.iloc[:,:-1], combined_data_cleaned_new["Rating_Level"]

In [49]:
X.shape

(39036, 4)

In [50]:
y.shape

(39036,)

Intentionally no stratification done.No Train, Val, Test split as that is not the aim of the demo

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=X["Age"],
                                                    random_state=42)

In [52]:
X_train.shape

(29277, 4)

In [53]:
X_test.shape

(9759, 4)

# Setup Preprocessing ( for OHE of Cat Columns ) and Classification Pipeline

In [54]:
categorical_features = ["Occupation", "Gender", "Genre"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [55]:
preprocessor = ColumnTransformer(
    transformers=
        [("cat", categorical_transformer, categorical_features)],
        remainder="drop")

In [56]:
model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier())])

# Train a Random Forect Classifier with Gridsearch

In [57]:
param_grid = { 
    'classifier__n_estimators': [100,200],
    'classifier__max_features': ['sqrt', 'log2',None],
    'classifier__max_depth' : [3,5,10]
}

In [59]:
grid = GridSearchCV(estimator=model_pipeline, param_grid=param_grid, cv= 5,n_jobs=-1,verbose=3)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['Occupation',
                                                                          'Gender',
                                                                          'Genre'])])),
                                       ('classifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'classifier__max_depth': [3, 5, 10],
                         'classifier__max_features': ['sqrt', 'log2', None],
                         'classifier__n_estimators': [100, 200]},
             verbose=3)

In [60]:
print("Best params:")
print(grid.best_params_)

Best params:
{'classifier__max_depth': 3, 'classifier__max_features': 'sqrt', 'classifier__n_estimators': 100}


In [61]:
print(f"Internal CV score: {grid.best_score_:.3f}")

Internal CV score: 0.570


In [62]:
best_model = grid.best_estimator_
best_model

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Occupation', 'Gender',
                                                   'Genre'])])),
                ('classifier',
                 RandomForestClassifier(max_depth=3, max_features='sqrt'))])

In [72]:
print(
    (
        "best classification from grid search: %.2f"
        % best_model.score(X_test, y_test)
    )
)

best classification from grid search: 0.57


In [73]:
y_pred = best_model.predict(X_test)
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [65]:
y_pred.shape

(9759,)

In [67]:
best_model['classifier'].feature_importances_

array([0.05141235, 0.00838458, 0.00616121, 0.00439756, 0.00322696,
       0.02736733, 0.08503067, 0.03573986, 0.01647904, 0.00684822,
       0.00956428, 0.0038775 , 0.00758283, 0.03183586, 0.00122686,
       0.00198019, 0.007774  , 0.00068036, 0.00264803, 0.03776809,
       0.12236642, 0.14735025, 0.00362724, 0.00436939, 0.00920209,
       0.01304828, 0.01707312, 0.00841505, 0.0745933 , 0.01524129,
       0.02907703, 0.00310745, 0.00526403, 0.01990225, 0.01676194,
       0.09624409, 0.00707905, 0.00220129, 0.00928575, 0.01886511,
       0.00292162, 0.02401819])

In [68]:
best_model['preprocessor']

ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['Occupation', 'Gender', 'Genre'])])

# Write to CSV

In [69]:
score_df = pd.DataFrame(y_pred,columns=["score"],index=X_test.index)
label_df = pd.DataFrame(y_test).rename(columns={"Rating_Level":"label_value"}) 

In [70]:
final_df = pd.concat([score_df,label_df,X_test],axis=1)
final_df

Unnamed: 0_level_0,score,label_value,Age,Occupation,Gender,Genre
User_Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
333449-a+better+tomorrow+iii+love+and+death+in+saigon+1989,1,0,59,retired,F,War
520321-into+the+woods+1991,1,1,50,executive/managerial,M,Drama
438331-seven+samurai+1954,1,0,26,sales/marketing,F,Action
25685-north+by+northwest+1959,1,1,88,clerical/admin,M,Mystery
741056-frozen+planet+2011,1,1,28,farmer,M,Documentary
...,...,...,...,...,...,...
530555-no+greater+love+2009,1,0,54,doctor/health care,F,Documentary
516885-the+kingdom+ii+1997,1,1,33,homemaker,F,Horror
408947-committed+2000,1,0,75,executive/managerial,F,Comedy
135766-the+tragedy+of+macbeth+1971,1,0,25,writer,F,War


In [71]:
final_df.to_csv(path_or_buf="./output.csv")