In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the exploded ratings table and take a separate working copy of it.
# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, so each column gets one consistently inferred dtype.
master_dataset = pd.read_csv(
    'data_exploded.csv',
    low_memory=False,
    encoding='latin-1',
)
data = master_dataset.copy()

#dep_data = data.select_dtypes(include=["int64"])
#X_train, X_test, y_train, y_test = train_test_split(dep_data.drop("rating", axis=1), data["rating"], test_size=0.2, random_state=0)

In [3]:
# Preview the first five rows belonging to user 3.
master_dataset.loc[master_dataset['user_id'] == 3].head(5)

Unnamed: 0,movie_id,user_id,rating,age,gender,occupation,zip code,movie_title,release_date,video_release_date,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating_exists
3364,1,3,-1.0,23,M,writer,32067,Toy Story (1995),1995,01-Jan-1995,...,0,0,0,0,0,0,0,0,0,False
3365,2,3,-1.0,23,M,writer,32067,GoldenEye (1995),1995,01-Jan-1995,...,0,0,0,0,0,0,1,0,0,False
3366,3,3,-1.0,23,M,writer,32067,Four Rooms (1995),1995,01-Jan-1995,...,0,0,0,0,0,0,1,0,0,False
3367,4,3,-1.0,23,M,writer,32067,Get Shorty (1995),1995,01-Jan-1995,...,0,0,0,0,0,0,0,0,0,False
3368,5,3,-1.0,23,M,writer,32067,Copycat (1995),1995,01-Jan-1995,...,0,0,0,0,0,0,1,0,0,False


We can try several data-imputation techniques:
1. Impute each missing rating (stored as the placeholder -1) with the mean observed rating of the group that shares the same age, occupation and release_date.

In [4]:
# Mean observed rating for every (age, occupation, release_date) group.
# Rows whose rating equals the -1 placeholder are dropped before averaging.
age_mean_rating = (
    master_dataset
    .loc[master_dataset['rating'].ne(-1)]
    .groupby(['age', 'occupation', 'release_date'])['rating']
    .mean()
)
age_mean_rating.iloc[:5]  # evaluated but not rendered: only the last expression of a cell is displayed
# Spot-check a single group by its full (age, occupation, release_date) key.
age_mean_rating.loc[(23, 'student', '1993')]

3.3947368421052633

In [5]:
def impute_value(row, anchor, by=None):
    """Return the row's rating, imputing it from a lookup table when missing.

    A negative ``row['rating']`` is treated as "missing" and replaced by the
    entry of ``by`` addressed by the row's values in the ``anchor`` columns.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide ``'rating'`` and every key listed in ``anchor``.
    anchor : sequence of str
        Column names whose values, in order, form the lookup key into ``by``.
    by : pandas.Series, optional
        Lookup table indexed by the ``anchor`` values (one index level per
        anchor column). When omitted, the notebook-level ``age_mean_rating``
        is used; it is resolved at call time so that re-running the cell that
        builds the table is picked up without redefining this function.

    Returns
    -------
    The original rating when it is not negative; otherwise the looked-up
    value, or ``NaN`` when ``by`` has no entry for the row's key (callers
    should decide how to fill any remaining ``NaN``).
    """
    rating = row['rating']
    # `not rating < 0` (rather than `rating >= 0`) keeps NaN ratings untouched.
    if not rating < 0:
        return rating

    if by is None:
        by = age_mean_rating

    key_parts = [row[col] for col in anchor]
    # A MultiIndex must be addressed with a tuple: a list would be read as
    # "several labels" and yield a Series (or an error) instead of one value.
    key = key_parts[0] if len(key_parts) == 1 else tuple(key_parts)
    try:
        return by.loc[key]
    except KeyError:
        # No observed rating exists for this combination of anchor values.
        return np.nan

In [None]:
# Replace every rating row by row with the result of impute_value; apply()
# forwards the extra `anchor` keyword to impute_value on each call.
master_dataset['rating'] = master_dataset.apply(
    impute_value,
    axis=1,
    anchor=['age', 'occupation', 'release_date'],
)

In [None]:
# Show user 3's first five rows again to inspect the ratings after imputation.
master_dataset.loc[master_dataset['user_id'] == 3].head(5)

In [None]:
# Build the modelling frame as a new object derived from master_dataset.
# np.mean reduces each rating entry to a single number; presumably this guards
# against non-scalar entries left by the imputation step (TODO confirm).
data = master_dataset.assign(
    rating=master_dataset['rating'].apply(lambda entry: np.mean(entry))
)

# Only int64/float64 columns are used as model inputs; the target is split off.
dep_data = data.select_dtypes(include=["int64", "float64"])
X_train, X_test, y_train, y_test = train_test_split(
    dep_data.drop(columns="rating"),
    data["rating"],
    random_state=0,
    test_size=0.2,
)

In [None]:
# Fit the regression tree; fit() returns the estimator itself, so `dt` is the
# trained model.
dt = DecisionTreeRegressor(
    random_state=0,
    max_features="sqrt",
    min_samples_leaf=3,
).fit(X_train, y_train)

# Ten (importance, feature name) pairs with the largest importances, descending.
tuple(sorted(zip(dt.feature_importances_, X_train.columns), reverse=True)[:10])

In [None]:
# Ratings are compared as the integer classes 1-5, so both the targets and the
# regressor's continuous predictions are rounded to the nearest integer.
y_test = [int(round(ele)) for ele in y_test]
pred = [int(round(ele)) for ele in dt.predict(X_test)]

# confusion_matrix takes (y_true, y_pred): row i counts samples whose true
# rating is labels[i], column j counts samples predicted as labels[j].
# The arguments were previously swapped, which transposed the matrix.
cm = sklearn.metrics.confusion_matrix(y_test, pred, labels=[1, 2, 3, 4, 5])

In [None]:
# Draw the 5x5 confusion matrix as a heat map with square cells.
fig, ax = plt.subplots()
ax.set_aspect(1)
res = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.jet)

# Write each count into its cell: matrix entry [x][y] is drawn at image row x,
# column y, which is plot coordinate (y, x).
for x, y in np.ndindex(5, 5):
    ax.annotate(
        str(cm[x][y]),
        xy=(y, x),
        ha='center',
        va='center',
    )