In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error

# Column names for the ml-100k files; they are passed via `names=` because the
# files are read without a header row.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
i_cols = ['movie_id', 'movie_title', 'release_date', 'video_release_date',
          'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
          'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
          'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
u_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

# Single place to change when the dataset lives somewhere else; every file
# below is resolved relative to this directory.
DATA_DIR = "/Users/sivanr/PycharmProjects/eCommerce/ml-100k"

# Ratings splits are tab-separated.
train_data = pd.read_csv(DATA_DIR + "/u1.base", delimiter='\t', names=r_cols)
test_data = pd.read_csv(DATA_DIR + "/u1.test", delimiter='\t', names=r_cols)
# Movie and user tables are pipe-separated; the movie table is decoded as latin-1.
data_i = pd.read_csv(DATA_DIR + '/u.item', delimiter='|', names=i_cols,
                     encoding='latin-1')
data_u = pd.read_csv(DATA_DIR + '/u.user', delimiter='|', names=u_cols)


In [None]:

# Columns removed after merging: identifiers, free text, dates, and the
# duplicated rating/timestamp columns (suffix '_y') created by the final merge.
columns_to_drop = ['movie_id', 'movie_title', 'user_id', 'rating_y', 'timestamp', 'timestamp_y', 'release_date',
                   'video_release_date', 'IMDb_URL', 'zip_code']


def merge_ratings_split(ratings):
    """Join one ratings split with the movie table and the user table.

    Parameters
    ----------
    ratings : pandas.DataFrame
        A ratings frame with the columns user_id, movie_id, rating, timestamp.

    Returns
    -------
    tuple of pandas.DataFrame
        (movies x ratings, ratings x users, combined frame with
        `columns_to_drop` removed).
    """
    movies_rating = data_i.merge(ratings, on='movie_id', how='inner')
    users_ratings = ratings.merge(data_u, on='user_id', how='inner')
    # Both sides carry `rating` and `timestamp`; the right-hand copies get the
    # '_y' suffix and are dropped below.
    combined = movies_rating.merge(users_ratings, on=['user_id', 'movie_id'], how='inner',
                                   suffixes=('', '_y'))
    # Return a new frame instead of mutating in place so the cell can be re-run.
    return movies_rating, users_ratings, combined.drop(columns=columns_to_drop)


movies_rating_train, users_ratings_train, movies_users_ratings_train = merge_ratings_split(train_data)
movies_rating_test, users_ratings_test, movies_users_ratings_test = merge_ratings_split(test_data)

print("train set shape: ", movies_users_ratings_train.shape)
print("test set shape: ", movies_users_ratings_test.shape)


In [None]:
# Categorical user attributes that are one-hot encoded below.
cat_vars = ['gender', 'occupation']

for var in cat_vars:
    train_dummies = pd.get_dummies(movies_users_ratings_train[var], prefix=var)
    # Encode the test split independently, then align it to the train columns:
    # a category absent from the test split becomes an all-zero column, and a
    # category unseen in training is discarded. Without this alignment the two
    # frames can end up with different columns, and selecting the train column
    # list from the test frame raises a KeyError.
    test_dummies = pd.get_dummies(movies_users_ratings_test[var], prefix=var)
    test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)

    movies_users_ratings_train = movies_users_ratings_train.join(train_dummies)
    movies_users_ratings_test = movies_users_ratings_test.join(test_dummies)


In [None]:
# Replace each raw age with the index of the bucket it falls into
# (0 for ages below 10, up to 4 for ages of 50 and above).
age_bin_edges = [10, 20, 35, 50]
for ratings_frame in (movies_users_ratings_train, movies_users_ratings_test):
    ratings_frame['age'] = np.digitize(ratings_frame['age'], bins=age_bin_edges)

movies_users_ratings_train


In [None]:
# Keep every column except the raw categorical ones listed in `cat_vars`.
columns = movies_users_ratings_train.columns.values.tolist()
to_keep = [col for col in columns if col not in cat_vars]

movies_users_ratings_train = movies_users_ratings_train[to_keep]
movies_users_ratings_test = movies_users_ratings_test[to_keep]

# Features: everything but the target column.
X_train = movies_users_ratings_train.drop(columns='rating')
X_test = movies_users_ratings_test.drop(columns='rating')
# Targets as 1-D Series: scikit-learn estimators expect y of shape
# (n_samples,) and warn when handed a single-column DataFrame.
y_train = movies_users_ratings_train['rating']
y_test = movies_users_ratings_test['rating']

print("train set shape: ", X_train.shape)
print("test set shape: ", X_test.shape)


In [None]:
# Baseline: logistic regression fitted on all engineered features, scored by
# the mean absolute error between predicted and true test ratings.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

mae_all_features = mean_absolute_error(y_test, y_pred)
print('MAE of logistic regression classifier on test set: {:.2f}'.format(mae_all_features))


In [None]:
# Refit using only the age bucket and the gender indicators.
# NOTE: this overwrites X_train / X_test, so later cells see only these columns.
# DataFrame.filter keeps just the listed labels that exist and silently skips
# any that are missing.
to_keep = ['age', 'gender_M', 'gender_F']
X_train, X_test = X_train.filter(items=to_keep), X_test.filter(items=to_keep)

logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

mae_demographics = mean_absolute_error(y_test, y_pred)
print('MAE of logistic regression classifier on test set: {:.2f}'.format(mae_demographics))

For every pair of features, we evaluated the model to determine which pair gives the lowest MAE, using the loop below:

In [None]:
# Fit one logistic regression per unordered pair of feature columns and report
# its test MAE. The inner loop starts at i + 1 so each pair is evaluated once;
# iterating over all i != j would refit the same two columns a second time in
# swapped order.
n_features = X_train.shape[1]
for i in range(n_features):
    for j in range(i + 1, n_features):
        temp_train = X_train.iloc[:, [i, j]]
        temp_test = X_test.iloc[:, [i, j]]
        logreg = LogisticRegression()
        logreg.fit(temp_train, y_train)

        y_pred = logreg.predict(temp_test)
        print("features: ", temp_train.columns.values.tolist(), ": ")
        print('MAE of logistic regression classifier on test set: {:.2f}'.format(mean_absolute_error(y_test, y_pred)))


We found that the differences between the models are very small.