In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from category_encoders.m_estimate import MEstimateEncoder

from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

In [2]:
# Read the MovieLens-1M ratings table, then try to store every column as uint8;
# with errors='ignore' a column whose cast fails keeps its original dtype.
# NOTE(review): a numeric column holding values above 255 may wrap silently
# under this cast — confirm no such column exists in the CSV.
df = pd.read_csv('./data/movielens1m.csv').astype(np.uint8, errors='ignore')
print(df.head(), df.shape, sep='\n')

  Movie ID  Action  Adventure  Animation  Children  Comedy  Crime  \
0  b'3107'       1          0          0         0       0      0   
1  b'2114'       0          0          0         0       0      0   
2   b'256'       0          0          0         0       1      0   
3  b'1389'       1          0          0         0       0      0   
4  b'3635'       1          0          0         0       0      0   

   Documentary  Drama  Fantasy  ...  Unknown  War  Western  \
0            0      1        0  ...        0    0        0   
1            0      1        0  ...        0    0        0   
2            0      0        0  ...        0    0        0   
3            0      0        0  ...        0    0        0   
4            0      0        0  ...        0    0        0   

   (no genres listed)  User ID  Age  Gender  Occupation   Zipcode  Rating  
0                   0   b'130'   35       1          18  b'50021'       5  
1                   0  b'3829'   25       0           0  b'2

In [3]:
# Keep 25% of the rows (X_encode) for fitting the target encoder; the other
# 75% (X_trin) are used for training. The fixed random_state makes the split,
# and everything derived from it, reproducible across kernel restarts.
X_encode, X_trin = train_test_split(df, test_size=0.75, random_state=0)

In [4]:
# Detach the 'Rating' target from each partition. pop() removes the column
# from the feature frame in place, so running this cell a second time fails
# because 'Rating' is no longer present.
y_encode = X_encode.pop('Rating')
y_train = X_trin.pop('Rating')

In [5]:
# Sanity check: the feature frame and its target must have the same row count.
print(X_encode.shape, y_encode.shape, sep='\n')

(250052, 27)
(250052,)


In [6]:
# Peek at the first few targets of the encoding split (index labels are the
# shuffled row labels of the original frame).
y_encode.head()

977111    4
28645     1
97114     2
104316    4
279818    5
Name: Rating, dtype: uint8

In [7]:
# Target-encode 'Zipcode' with an m-estimate encoder. m is the smoothing
# weight: a larger m shrinks each category's encoding harder toward the
# overall target mean. The encoder is fit on the encoding split only.
encoder = MEstimateEncoder(cols=["Zipcode"], m=5.0)
encoder.fit(X_encode, y_encode)

  elif pd.api.types.is_categorical(cols):


MEstimateEncoder(cols=['Zipcode'], m=5.0)

In [9]:
# Replace 'Zipcode' in the training split with the encoding fitted above.
# NOTE(review): this overwrites X_trin, so re-running the cell would feed
# already-encoded values back into the encoder — restart the kernel first.
# NOTE(review): every row displayed below carries the same Zipcode value;
# confirm the encoder is not falling back to one default for all categories.
X_trin = encoder.transform(X_trin)
X_trin.head()

Unnamed: 0,Movie ID,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,Thriller,Unknown,War,Western,(no genres listed),User ID,Age,Gender,Occupation,Zipcode
584982,b'587',0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,b'1329',18,1,14,3.583871
673833,b'1978',0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,b'3648',18,1,17,3.583871
303404,b'733',1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,b'3213',25,0,0,3.583871
347612,b'1210',1,1,0,0,0,0,0,0,0,...,0,0,1,0,0,b'3777',18,1,17,3.583871
98747,b'2837',0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,b'5284',25,1,16,3.583871


In [10]:

def make_mi_scores(X, y):
    """Rank the columns of ``X`` by mutual information with the target ``y``.

    Columns of dtype ``object`` or ``category`` are label-encoded on a private
    copy of ``X``. Every integer-typed column is then flagged as discrete when
    calling ``mutual_info_regression`` (with ``random_state=0``).

    Returns a Series named "MI Scores", indexed by column name and sorted from
    the highest score to the lowest.
    """
    features = X.copy()
    categorical_names = features.select_dtypes(["object", "category"]).columns
    for name in categorical_names:
        codes, _uniques = features[name].factorize()
        features[name] = codes
    # After label encoding, an integer dtype is what marks a feature as discrete.
    is_discrete = list(map(pd.api.types.is_integer_dtype, features.dtypes))
    raw_scores = mutual_info_regression(
        features, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return ranked.sort_values(ascending=False)


def score_dataset(X, y, model=None):
    """Return the cross-validated RMSLE of ``model`` on ``(X, y)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Columns of dtype ``object`` or ``category`` are
        label-encoded on a private copy; the caller's frame is left untouched.
    y : array-like
        Target values, handed unchanged to ``cross_val_score``.
    model : estimator, optional
        Estimator to evaluate. ``None`` (the default) builds a fresh
        ``XGBRegressor()`` on every call instead of sharing one instance
        created when the function was defined.

    Returns
    -------
    float
        Square root of the mean, over 5 folds, of the mean squared log error.
    """
    if model is None:
        model = XGBRegressor()
    # Copy first so the label encoding below never leaks into the caller's frame.
    X = X.copy()
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # The scorer returns the negated MSLE for each fold; flip the sign of the
    # fold average and take the root to get RMSLE (Root Mean Squared Log Error).
    neg_msle = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    return np.sqrt(-1 * neg_msle.mean())