In [1]:
import os

import pandas as pd
from d2l import mxnet as d2l
from mxnet import gluon, np

In [57]:
d2l.DATA_HUB["ml-1M"] = (
    "http://files.grouplens.org/datasets/movielens/ml-1m.zip",
    "c4d9eecfca2ab87c1945afe126590906",
)
data_dir1m = d2l.download_extract("ml-1M")

Downloading ..\data\ml-1m.zip from http://files.grouplens.org/datasets/movielens/ml-1m.zip...


In [84]:
# @save
def read_data_ml1m():
    iNames = ["user_id", "item_id", "rating", "timestamp"]
    ratings = pd.read_csv(
        os.path.join(data_dir1m, "ratings.dat"), "::", names=iNames, engine="python"
    )
    uNames = ["user_id", "gender", "age", "occupation", "zipcode"]
    users = pd.read_csv(
        os.path.join(data_dir1m, "users.dat"), "::", names=uNames, engine="python"
    )

    # MovieID::Title::Genres
    m_cols = ["item_id", "title", "generes"]
    movies = pd.read_csv(
        os.path.join(data_dir1m, "movies.dat"),
        sep="::",
        names=m_cols,
        usecols=range(3),
        encoding="latin-1",
        engine="python",
    )

    # create one merged DataFrame
    movie_ratings = pd.merge(movies, ratings)
    lens = pd.merge(movie_ratings, users)

    num_users = ratings.user_id.unique().shape[0]
    num_items = ratings.item_id.unique().shape[0]
    return lens, num_users, num_items


rating, num_users, num_items = read_data_ml1m()
rating

Unnamed: 0,item_id,title,generes,user_id,rating,timestamp,gender,age,occupation,zipcode
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268,F,1,10,48067
1,48,Pocahontas (1995),Animation|Children's|Musical|Romance,1,5,978824351,F,1,10,48067
2,150,Apollo 13 (1995),Drama,1,5,978301777,F,1,10,48067
3,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,1,4,978300760,F,1,10,48067
4,527,Schindler's List (1993),Drama|War,1,5,978824195,F,1,10,48067
...,...,...,...,...,...,...,...,...,...,...
1000204,3513,Rules of Engagement (2000),Drama|Thriller,5727,4,958489970,M,25,4,92843
1000205,3535,American Psycho (2000),Comedy|Horror|Thriller,5727,2,958489970,M,25,4,92843
1000206,3536,Keeping the Faith (2000),Comedy|Romance,5727,5,958489902,M,25,4,92843
1000207,3555,U-571 (2000),Action|Thriller,5727,3,958490699,M,25,4,92843


In [85]:
rating.where(items['gender'] == 'F')

Unnamed: 0,item_id,title,generes,user_id,rating,timestamp,gender,age,occupation,zipcode
0,1.0,Toy Story (1995),Animation|Children's|Comedy,1.0,5.0,978824268.0,F,1.0,10.0,48067
1,48.0,Pocahontas (1995),Animation|Children's|Musical|Romance,1.0,5.0,978824351.0,F,1.0,10.0,48067
2,150.0,Apollo 13 (1995),Drama,1.0,5.0,978301777.0,F,1.0,10.0,48067
3,260.0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,1.0,4.0,978300760.0,F,1.0,10.0,48067
4,527.0,Schindler's List (1993),Drama|War,1.0,5.0,978824195.0,F,1.0,10.0,48067
...,...,...,...,...,...,...,...,...,...,...
1000204,,,,,,,,,,
1000205,,,,,,,,,,
1000206,,,,,,,,,,
1000207,,,,,,,,,,


In [86]:
rating_by_gender = rating.groupby("gender").size().sort_values(ascending=False)[:2]
users_by_gender = rating.groupby("gender").agg({"user_id": pd.Series.nunique})

In [87]:
male_rating = rating[rating['gender'] == 'M']
female_rating = rating[rating['gender'] == 'F']

In [88]:
print(male_items)
print(female_items)

#male_items.to_csv

         item_id                         title                      generes  user_id  ...  gender  age occupation  zipcode
124            1              Toy Story (1995)  Animation|Children's|Comedy        8  ...       M   25         12    11413
125            4      Waiting to Exhale (1995)                 Comedy|Drama        8  ...       M   25         12    11413
126           14                  Nixon (1995)                        Drama        8  ...       M   25         12    11413
127           16                 Casino (1995)               Drama|Thriller        8  ...       M   25         12    11413
128           17  Sense and Sensibility (1995)                Drama|Romance        8  ...       M   25         12    11413
...          ...                           ...                          ...      ...  ...     ...  ...        ...      ...
1000204     3513    Rules of Engagement (2000)               Drama|Thriller     5727  ...       M   25          4    92843
1000205     3535

In [None]:
female_items.

In [73]:
rating_by_gender

gender
M    753769
F    246440
dtype: int64

In [4]:
sparsity = 1 - len(items) / (num_users * num_items)
print(f"number of users: {num_users}, number of movies: {num_items}")
print(f"number of rating: {len(items)}")
print(f"matrix sparsity: {sparsity:f}")
print("")
unum_users = users.user_id.unique().shape[0]
genders = users.gender.unique().shape[0]
print(f"number of users: {unum_users}, number of genders: {genders}")

print(items.head(5))
print(users.head(5))

number of users: 6040, number of movies: 3706
number of rating: 1000209
matrix sparsity: 0.955316

number of users: 6040, number of genders: 2
   user_id  item_id  rating  timestamp
0        1     1193       5  978300760
1        1      661       3  978302109
2        1      914       3  978301968
3        1     3408       4  978300275
4        1     2355       5  978824291
   user_id gender  age  occupation zipcode
0        1      F    1          10   48067
1        2      M   56          16   70072
2        3      M   25          15   55117
3        4      M   45           7   02460
4        5      M   25          20   55455


In [5]:
type(items["user_id"])

TypeError: _repr_latex_() missing 1 required positional argument: 'self'

In [4]:
d2l.DATA_HUB["ml-100k"] = (
    "http://files.grouplens.org/datasets/movielens/ml-100k.zip",
    "cd4dcac4241c8a4ad7badc7ca635da8a69dddb83",
)


data_dir = d2l.download_extract("ml-100k")
# names = ['user_id', 'item_id', 'rating', 'timestamp']
# data = pd.read_csv(os.path.join(data_dir, 'u.data'), '\t', names=names,
#                   engine='python')

# pass in column names for each CSV
u_cols = ["user_id", "age", "sex", "occupation", "zip_code"]
users = pd.read_csv(
    os.path.join(data_dir, "u.user"), sep="|", names=u_cols, encoding="latin-1"
)

r_cols = ["user_id", "movie_id", "rating", "unix_timestamp"]
ratings = pd.read_csv(
    os.path.join(data_dir, "u.data"), sep="\t", names=r_cols, encoding="latin-1"
)

# the movies file contains columns indicating the movie's genres
# let's only load the first five columns of the file with usecols
m_cols = ["movie_id", "title", "release_date", "video_release_date", "imdb_url"]
movies = pd.read_csv(
    os.path.join(data_dir, "u.item"),
    sep="|",
    names=m_cols,
    usecols=range(5),
    encoding="latin-1",
)

# create one merged DataFrame
movie_ratings = pd.merge(movies, ratings)
lens = pd.merge(movie_ratings, users)

In [5]:
most_rated = lens.groupby("title").size().sort_values(ascending=False)[:5]
most_rated

title
Star Wars (1977)             583
Contact (1997)               509
Fargo (1996)                 508
Return of the Jedi (1983)    507
Liar Liar (1997)             485
dtype: int64

In [6]:
lens.title.value_counts()[:5]
# SELECT title, count(1)
# FROM lens
# GROUP BY title
# ORDER BY 2 DESC
# LIMIT 25;

Star Wars (1977)             583
Contact (1997)               509
Fargo (1996)                 508
Return of the Jedi (1983)    507
Liar Liar (1997)             485
Name: title, dtype: int64

In [8]:
print(lens.user_id.unique().shape[0])
print(lens.movie_id.unique().shape[0])
print(lens.sex.unique().shape[0])
print(lens.occupation.unique().shape[0])
print(lens.zip_code.unique().shape[0])

##http://www.gregreda.com/2013/10/26/using-pandas-on-the-movielens-dataset/

943
1682
2
21
795


rating_by_gender = lens.groupby("sex").size().sort_values(ascending=False)[:2]
users_by_gender = lens.groupby("sex").agg({"user_id": pd.Series.nunique})
# .size().sort_values(ascending=False)[:2]

In [55]:
print(
    "users_by_gender",
    users_by_gender["user_id"]["M"] / users_by_gender.sum()[0],
    users_by_gender["user_id"]["F"] / users_by_gender.sum()[0],
)

print(
    "rating_by_gender",
    rating_by_gender["M"] / rating_by_gender.sum(),
    rating_by_gender["F"] / rating_by_gender.sum(),
)

users_by_gender 0.7104984093319194 0.28950159066808057
rating_by_gender 0.7426 0.2574


In [50]:
users_by_gender["user_id"].sum()

100000

In [52]:
users_by_gender.sum()[0]

943

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
from sklearn.model_selection import (
    GroupKFold,
    GroupShuffleSplit,
    KFold,
    ShuffleSplit,
    StratifiedKFold,
    StratifiedShuffleSplit,
    TimeSeriesSplit,
)

np.random.seed(1338)
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
n_splits = 4

In [None]:
y = np.array(lens["sex"])
groups = ["red", "blue"]


def visualize_groups(classes, groups, name):
    # Visualize dataset groups
    fig, ax = plt.subplots()
    ax.scatter(
        range(len(groups)),
        [0.5] * len(groups),
        c=groups,
        marker="_",
        lw=50,
        cmap=cmap_data,
    )
    ax.scatter(
        range(len(groups)),
        [3.5] * len(groups),
        c=classes,
        marker="_",
        lw=50,
        cmap=cmap_data,
    )
    ax.set(
        ylim=[-1, 5],
        yticks=[0.5, 3.5],
        yticklabels=["Data\ngroup", "Data\nclass"],
        xlabel="Sample index",
    )


visualize_groups(y, groups, "no groups")
plt.show()

In [6]:
plt.show()

NameError: name 'plt' is not defined

In [7]:
lens["sex"].unique()
%pylab inline

NameError: name 'lens' is not defined

In [238]:
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object."""

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(
            range(len(indices)),
            [ii + 0.5] * len(indices),
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

    # Plot the data classes and groups at the end
    ax.scatter(
        range(len(X)), [ii + 1.5] * len(X), c=y, marker="_", lw=lw, cmap=cmap_data
    )

    ax.scatter(
        range(len(X)), [ii + 2.5] * len(X), c=group, marker="_", lw=lw, cmap=cmap_data
    )

    # Formatting
    yticklabels = list(range(n_splits)) + ["class", "group"]
    ax.set(
        yticks=np.arange(n_splits + 2) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits + 2.2, -0.2],
        xlim=[0, 100],
    )
    ax.set_title("{}".format(type(cv).__name__), fontsize=15)
    return ax

In [239]:
fig, ax = plt.subplots()
cv = StratifiedKFold(n_splits)
lens["sexcode"] = pd.factorize(lens["sex"])[0] + 1
groups = np.array(lens["sexcode"])
plot_cv_indices(cv, lens, lens["sexcode"], groups, ax, n_splits)

ax.legend(
    [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))],
    ["Testing set", "Training set"],
    loc=(1.02, 0.8),
)
plt.tight_layout()
fig.subplots_adjust(right=0.7)
plt.show()

In [124]:
pd.factorize(lens["sex"])[0] + 1

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [177]:
x = lens["sexcode"].where(lens["sexcode"] == 2)
print(lens["sexcode"].values.count("M"))

AttributeError: 'numpy.ndarray' object has no attribute 'count'

In [90]:
groups = np.hstack([[ii] * 10 for ii in range(10)])
print(groups)
print(len(groups))

[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3
 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 7 7 7 7
 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 9 9 9 9 9 9 9 9 9 9]
100


In [182]:
lens[lens["sex"] == "F"].movie_id

524         1
525         2
526         3
527         4
528         5
         ... 
99974     682
99975     873
99976     877
99977     886
99978    1527
Name: movie_id, Length: 25740, dtype: int64

In [166]:
lens["sex"].values
unique, counts = np.unique(lens["sex"].values, return_counts=True)

print(np.asarray((unique, counts)))

[['F' 'M']
 [25740 74260]]


In [172]:
j = 0
y = 0
for i in pd.factorize(lens["sex"])[0] + 1:
    if i == 2:
        j += 1
    else:
        y += 1
print(j)
print(y)

print(j + y)

25740
74260
100000


In [185]:
import numpy as np
from sklearn.model_selection import KFold

X = ["a", "b", "c", "d"]
kf = KFold(n_splits=2)
for train, test in kf.split(lens):
    print("%s %s" % (len(train), len(test)))

50000 50000
50000 50000


In [226]:
# Generate the class/group data
n_points = 100000
X = np.array(lens["rating"])  # np.random.randn(100, 10)
y = np.array(lens["sexcode"]).T
groups = pd.factorize(lens["age"])[0] + 1
ageGroups = pd.factorize(lens["age"])[0] + 1
occupationGroups = pd.factorize(lens["occupation"])[0] + 1

In [240]:
def visualize_groups(classes, groups, name):
    # Visualize dataset groups
    fig, ax = plt.subplots()
    ax.scatter(
        range(len(groups)),
        [0.5] * len(groups),
        c=groups,
        marker="_",
        lw=50,
        cmap=cmap_data,
    )
    ax.scatter(
        range(len(groups)),
        [3.5] * len(groups),
        c=classes,
        marker="_",
        lw=50,
        cmap=cmap_data,
    )
    ax.set(
        ylim=[-1, 5],
        yticks=[0.5, 3.5],
        yticklabels=["Data\ngroup", "Data\nclass"],
        xlabel="Sample index",
    )


visualize_groups(y, groups, "no groups")
ax.legend(
    [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))],
    ["Testing set", "Training set"],
    loc=(1.02, 0.8),
)
plt.tight_layout()
fig.subplots_adjust(right=0.7)

plt.show()

ValueError: 'c' argument must be a color, a sequence of colors, or a sequence of numbers, not ['M' 'M' 'M' ... 'M' 'M' 'M']

In [196]:
np.hstack([[ii] * 10 for ii in range(10)])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

In [233]:
def plot_cv_indices(cv, X, y, group, group2, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object."""

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(
            range(len(indices)),
            [ii + 0.5] * len(indices),
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

    # Plot the data classes and groups at the end
    ax.scatter(
        range(len(X)), [ii + 1.5] * len(X), c=y, marker="_", lw=lw, cmap=cmap_data
    )

    ax.scatter(
        range(len(X)), [ii + 2.5] * len(X), c=group, marker="_", lw=lw, cmap=cmap_data
    )

    ax.scatter(
        range(len(X)), [ii + 3.5] * len(X), c=group2, marker="_", lw=lw, cmap=cmap_data
    )

    # Formatting
    yticklabels = list(range(n_splits)) + ["sex", "age", "occupation"]
    ax.set(
        yticks=np.arange(n_splits + 3) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits + 3.2, -0.2],
        xlim=[0, 100000],
    )
    ax.set_title("{}".format(type(cv).__name__), fontsize=15)
    return ax


cvs = [
    KFold,
    GroupKFold,
    ShuffleSplit,
    StratifiedKFold,
    # GroupShuffleSplit,
    StratifiedShuffleSplit,
    TimeSeriesSplit,
]

for cv in cvs:
    this_cv = cv(n_splits=n_splits)
    fig, ax = plt.subplots(figsize=(6, 3))
    plot_cv_indices(this_cv, X, y, ageGroups, occupationGroups, ax, n_splits)

    ax.legend(
        [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))],
        ["Testing set", "Training set"],
        loc=(1.02, 0.8),
    )
    # Make the legend fit
    plt.tight_layout()
    fig.subplots_adjust(right=0.7)
plt.show()

In [221]:
len(X)

100000

In [223]:
def plot_cv_indices(cv, X, y, group, group2, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object."""

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(
            range(len(indices)),
            [ii + 0.5] * len(indices),
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

    # Plot the data classes and groups at the end
    ax.scatter(
        range(len(X)), [ii + 1.5] * len(X), c=y, marker="_", lw=lw, cmap=cmap_data
    )

    ax.scatter(
        range(len(X)), [ii + 2.5] * len(X), c=group, marker="_", lw=lw, cmap=cmap_data
    )

    ax.scatter(
        range(len(X)), [ii + 3.5] * len(X), c=group2, marker="_", lw=lw, cmap=cmap_data
    )

    # Formatting
    yticklabels = list(range(n_splits)) + ["sex", "age", "occupation"]
    ax.set(
        yticks=np.arange(n_splits + 3) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits + 3.2, -0.2],
        xlim=[0, 100000],
    )
    ax.set_title("{}".format(type(cv).__name__), fontsize=15)
    return ax


cvs1 = [KFold, GroupKFold, ShuffleSplit]
cvs2 = [
    StratifiedKFold,
    # GroupShuffleSplit,
    StratifiedShuffleSplit,
    TimeSeriesSplit,
]

fig, axs = plt.subplots(2, 3)

for i, cv in enumerate(cvs1):
    this_cv = cv(n_splits=n_splits)
    ax = axs[0, i]
    plot_cv_indices(this_cv, X, y, ageGroups, occupationGroups, ax, n_splits)


for i, cv in enumerate(cvs2):
    this_cv = cv(n_splits=n_splits)
    ax = axs[1, i]
    plot_cv_indices(this_cv, X, y, ageGroups, occupationGroups, ax, n_splits)

ax.legend(
    [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))],
    ["Testing set", "Training set"],
    loc=(1.02, 0.8),
)
# Make the legend fit
fig.set_size_inches(6, 12)

plt.tight_layout()
fig.subplots_adjust(right=0.7)
plt.show()