In [1]:
import os
import pprint
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib.patches import Patch
from sklearn.model_selection import (
    GroupKFold,
    GroupShuffleSplit,
    KFold,
    ShuffleSplit,
    StratifiedKFold,
    StratifiedShuffleSplit,
    TimeSeriesSplit,
)

pp = pprint.PrettyPrinter(indent=4)

print("Using pandas %s version" % pd.__version__)
print("Using seaborn %s version" % sns.__version__)
print("Using scipy %s version" % scipy.__version__)

Using pandas 1.2.0 version
Using seaborn 0.11.1 version
Using scipy 1.6.0 version


In [2]:
movies_names = ["MovieID", "Title", "Genres"]

movies = pd.read_table(
    r"C:\Projects\RecSys2020\datasets\ml1m\movies.dat",
    sep="::",
    header=None,
    names=movies_names,
    encoding="latin-1",
    engine="python",
)

movies.head()

ratings_names = ["UserID", "MovieID", "Rating", "Timestamp"]

ratings = pd.read_table(
    r"C:\Projects\RecSys2020\datasets\ml1m\ratings.dat",
    sep="::",
    header=None,
    engine="python",
    names=ratings_names,
)

ratings.head()


users_names = ["UserID", "Gender", "Age", "Occupation", "Zip-code"]

users = pd.read_table(
    r"C:\Projects\RecSys2020\datasets\ml1m\users.dat",
    sep="::",
    engine="python",
    header=None,
    names=users_names,
)

In [3]:
# create one merged DataFrame
movie_ratings = pd.merge(movies, ratings)
lens = pd.merge(movie_ratings, users)

In [4]:
lens.head()
lens.describe()

# Age
rating_by_age = lens.groupby("Age").size()
users_by_age = lens.groupby("Age").agg({"UserID": pd.Series.nunique})
# pp.pprint({'number of ratings by age':rating_by, 'unique users by age':users_by})


def groupby(by):
    r = lens.groupby(by).size()
    u = lens.groupby(by).agg({"UserID": pd.Series.nunique})
    pp.pprint({"number of ratings by " + by: r, "unique users by " + by: u})


for i in ["Gender", "Age", "Occupation"]:
    groupby(i)

{   'number of ratings by Gender': Gender
F    246440
M    753769
dtype: int64,
    'unique users by Gender':         UserID
Gender        
F         1709
M         4331}
{   'number of ratings by Age': Age
1      27211
18    183536
25    395556
35    199003
45     83633
50     72490
56     38780
dtype: int64,
    'unique users by Age':      UserID
Age        
1       222
18     1103
25     2096
35     1193
45      550
50      496
56      380}
{   'number of ratings by Occupation': Occupation
0     130499
1      85351
2      50068
3      31623
4     131032
5      21850
6      37205
7     105425
8       2706
9      11345
10     23290
11     20563
12     57214
13     13754
14     49109
15     22951
16     46021
17     72816
18     12086
19     14904
20     60397
dtype: int64,
    'unique users by Occupation':             UserID
Occupation        
0              711
1              528
2              267
3              173
4              759
5              112
6              236
7         

In [63]:
rating_by_age

Age
1      27211
18    183536
25    395556
35    199003
45     83633
50     72490
56     38780
dtype: int64

In [5]:
age = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}

pd.DataFrame(
    data=[rating_by_age.tolist(), np.array(users_by_age)],
    index=["ratings", "users"],
    columns=age.values(),
)

Unnamed: 0,Under 18,18-24,25-34,35-44,45-49,50-55,56+
ratings,27211,183536,395556,199003,83633,72490,38780
users,[222],[1103],[2096],[1193],[550],[496],[380]


In [6]:
gender = {"F": "Female", "M": "Male"}
age = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}
occupation = {
    0: "other or not specified",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer",
}
names = {"Gender": gender, "Age": age, "Occupation": occupation}


def getNames(by: string):
    return names[by]


def groupby2(by):
    r = lens.groupby(by).size()
    u = lens.groupby(by).agg({"UserID": pd.Series.nunique})
    # pp.pprint({'number of ratings by '+ by:r, 'unique users by '+by:u})
    ppp = pd.DataFrame(
        data=[r.tolist(), np.array(u)],
        index=["ratings", "users"],
        columns=getNames(by).values(),
    )
    return ppp


e = []
for i in ["Gender", "Age", "Occupation"]:
    e.append(groupby2(i))

In [7]:
e[0].T

Unnamed: 0,ratings,users
Female,246440,[1709]
Male,753769,[4331]


In [None]:
# gender = {'M': 'Male', 'F': 'Female'}
# age = {
#     1:"Under 18",
#    18:"18-24",
#    25:"25-34",
#    35:"35-44",
#    45:"45-49",
#    50:"50-55",
#    56:"56+"}
# occupation = {0:"other or not specified",
#     1:"academic/educator",
#     2:"artist",
#     3:"clerical/admin",
#     4:"college/grad student",
#     5:"customer service",
#     6:"doctor/health care",
#     7:"executive/managerial",
#     8:"farmer",
#     9:"homemaker",
#    10:"K-12 student",
#    11:"lawyer",
#    12:"programmer",
#    13:"retired",
#    14:"sales/marketing",
#    15:"scientist",
#    16:"self-employed",
#    17:"technician/engineer",
#    18:"tradesman/craftsman",
#    19:"unemployed",
#    20:"writer"}
# names = {"Gender":gender, "Age":age, "Occupation":occupation}
# def getNames(by: string):
#    return names[by]

# def groupby2(by):
#    r = lens.groupby(by).size()
#    u = lens.groupby(by).agg({"UserID": pd.Series.nunique})
#    u = u.replace({by:getNames(by)})
#    pp.pprint({'number of ratings by '+ by:r, 'unique users by '+by:u})

# for i in ["Gender", "Age", "Occupation"]:
#    groupby2(i)


# print(rating_by_age.axes)
# print(rating_by_age)
# print(rating_by_age.replace({'Age':age}))

# rating_by_age.reindex(["Under 18",
# "18-24",
# "25-34",
# "35-44",
# "45-49",
# "50-55",
# "56+"])#

In [37]:
data_dirM = r"C:\Projects\RecSys2020\datasets\ml1m\male\\"
data_dirF = r"C:\Projects\RecSys2020\datasets\ml1m\female\\"

fr = lens[lens["Gender"] == "F"].sort_values(by="UserID")
mr = lens[lens["Gender"] == "M"].sort_values(by="UserID")

rating_columns = ["UserID", "MovieID", "Rating", "Timestamp"]
fname = "data.txt"
params = dict(index=False, header=False, columns=rating_columns, sep="\t")
fr.to_csv(os.path.join(data_dirF, fname), **params)
mr.to_csv(os.path.join(data_dirM, fname), **params)

In [5]:
# Generate the class/group data
n_points = len(lens)
lens["sexcode"] = pd.factorize(lens["Gender"])[0] + 1
ageGroups = pd.factorize(lens["Age"])[0] + 1
occupationGroups = pd.factorize(lens["Occupation"])[0] + 1
timestampGroups = pd.factorize(lens["Timestamp"])[0] + 1

X = np.array(lens["Rating"])  # np.random.randn(100, 10)
y = np.array(lens["sexcode"]).T

n_splits = 5

np.random.seed(1338)
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm

In [17]:
def plot_cv_indices(cv, X, y, group, group2, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object."""

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(
            range(len(indices)),
            [ii + 0.5] * len(indices),
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

    # Plot the data classes and groups at the end
    ax.scatter(
        range(len(X)), [ii + 1.5] * len(X), c=y, marker="_", lw=lw, cmap=cmap_data
    )

    ax.scatter(
        range(len(X)), [ii + 2.5] * len(X), c=group, marker="_", lw=lw, cmap=cmap_data
    )

    ax.scatter(
        range(len(X)), [ii + 3.5] * len(X), c=group2, marker="_", lw=lw, cmap=cmap_data
    )

    # Formatting
    labels = ["Lytis", "Amžius", "Užsiėmimas"]
    yticklabels = list(range(n_splits)) + labels
    ax.set(
        yticks=np.arange(n_splits + 3) + 0.5,
        yticklabels=yticklabels,
        xlabel="Imties indeksas",
        ylabel="Kryžminės patikros padalijimas",
        ylim=[n_splits + len(labels) + 0.2, -0.2],
        xlim=[0, n_points],
    )
    ax.set_title(type(cv).__name__, fontsize=12)
    return ax


# cvs = [KFold]
# ,GroupKFold, ShuffleSplit, StratifiedKFold,
# GroupShuffleSplit, StratifiedShuffleSplit, TimeSeriesSplit]


cvs1 = [KFold, GroupKFold, ShuffleSplit]
cvs2 = [
    StratifiedKFold,
    ##GroupShuffleSplit,
    StratifiedShuffleSplit,
    TimeSeriesSplit,
]

fig, axs = plt.subplots(2, 3)

for i, cv in enumerate(cvs1):
    print(cv, i)
    this_cv = cv(n_splits=n_splits)
    ax = axs[0, i]
    plot_cv_indices(this_cv, X, y, ageGroups, occupationGroups, ax, n_splits)


for i, cv in enumerate(cvs2):
    print(cv, i)
    this_cv = cv(n_splits=n_splits)
    ax = axs[1, i]
    plot_cv_indices(this_cv, X, y, ageGroups, occupationGroups, ax, n_splits)


ax.legend(
    [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))],
    ["Testavimo aibė", "Mokymo aibė"],
    loc=(1.02, 0.8),
)
# Make the legend fit
fig.set_size_inches(12, 6)

plt.tight_layout()
png = os.path.join(r"C:\Projects\RecSys2020\results\figure0.cross-validations.png")
fig.savefig(png, format="png", dpi=300)

png = os.path.join(
    r"E:\OneDrive\MSThesis\_2019\master_thesis_template_vu_mif_cs1-master\master_thesis_template_vu_mif_cs1-master\_third_part\img\figure0.cross-validations.png"
)
fig.savefig(png, format="png", dpi=300)

# fig.subplots_adjust(right=0.7)
# plt.show()

<class 'sklearn.model_selection._split.KFold'> 0
<class 'sklearn.model_selection._split.GroupKFold'> 1
<class 'sklearn.model_selection._split.ShuffleSplit'> 2
<class 'sklearn.model_selection._split.StratifiedKFold'> 0
<class 'sklearn.model_selection._split.StratifiedShuffleSplit'> 1
<class 'sklearn.model_selection._split.TimeSeriesSplit'> 2


In [7]:
cv = TimeSeriesSplit(n_splits)
splits = []
for ii, (tr, tt) in enumerate(cv.split(X=lens, y=y, groups=timestampGroups)):
    splits.append((tr, tt))

print(len(splits[0][1]))

for i in splits:
    print(len(i[0]))

166701
166704
333405
500106
666807
833508


In [8]:
print([[len(t) for t in split] for split in splits])
splited = [[t for t in split] for split in splits]

[[166704, 166701], [333405, 166701], [500106, 166701], [666807, 166701], [833508, 166701]]


In [9]:
for split in splits:
    print(len(split))
# 1-data-test.txt
# 1-data-train.txt

2
2
2
2
2


In [6]:
data_dir = r"C:\Projects\RecSys2020\datasets\ml1m\\"
cvs = [
    KFold,
    GroupKFold,
    ShuffleSplit,
    StratifiedKFold,
    GroupShuffleSplit,
    StratifiedShuffleSplit,
    TimeSeriesSplit,
]

In [None]:
cv = TimeSeriesSplit(n_splits)
for fold, (train, test) in enumerate(cv.split(X=lens, y=y, groups=timestampGroups)):
    fold += 1
    print(fold)
    print(fold, " train=", len(lens[lens.index.isin(splits[i][0])]))
    print(fold, " test=", len(lens[lens.index.isin(splits[i][1])]))

    train = lens[["UserID", "MovieID", "Rating", "Timestamp"]][lens.index.isin(train)]
    test = lens[["UserID", "MovieID", "Rating", "Timestamp"]][lens.index.isin(test)]
    cv_name = type(cv).__name__
    dirr = os.path.join(data_dir, cv_name)

    train.to_csv(os.path.join(dirr, str(fold) + "-data-train.txt"))
    test.to_csv(os.path.join(dirr, str(fold) + "-data-test.txt"))

In [43]:
lens[["UserID", "MovieID", "Rating", "Timestamp"]][lens.index.isin(splits[i][0])]

Unnamed: 0,UserID,MovieID,Rating,Timestamp
1,1,48,5,978824351
2,1,150,5,978301777


In [17]:
def split(cvs, groups):
    for cv in cvs:
        this_cv = cv(n_splits=n_splits)
        this_cv_name = type(this_cv).__name__
        dir = os.path.join(data_dir, this_cv_name)
        if not os.path.exists(dir):
            os.mkdir(dir)

        for fold, (train, test) in enumerate(this_cv.split(X=lens, y=y, groups=groups)):
            fold += 1
            print()
            print(this_cv_name, fold, " train=", len(lens[lens.index.isin(train)]))
            print(this_cv_name, fold, " test=", len(lens[lens.index.isin(test)]))

            train = lens[["UserID", "MovieID", "Rating", "Timestamp"]][
                lens.index.isin(train)
            ]
            test = lens[["UserID", "MovieID", "Rating", "Timestamp"]][
                lens.index.isin(test)
            ]

            train.to_csv(
                os.path.join(dir, str(fold) + "-data-train.txt"),
                index=False,
                header=False,
                sep="\t",
            )
            test.to_csv(
                os.path.join(dir, str(fold) + "-data-test.txt"),
                index=False,
                header=False,
                sep="\t",
            )

In [19]:
def split(cvs, groups):
    for cv in cvs:
        this_cv = cv(n_splits=n_splits)
        this_cv_name = type(this_cv).__name__
        dir = os.path.join(data_dir, this_cv_name)
        if not os.path.exists(dir):
            os.mkdir(dir)

        for fold, (train, test) in enumerate(this_cv.split(X=lens, y=y, groups=groups)):
            fold += 1
            print()
            print(this_cv_name, fold, " train=", len(lens[lens.index.isin(train)]))
            print(this_cv_name, fold, " test=", len(lens[lens.index.isin(test)]))

            train = lens[["UserID", "MovieID", "Rating", "Timestamp"]][
                lens.index.isin(train)
            ]
            test = lens[["UserID", "MovieID", "Rating", "Timestamp"]][
                lens.index.isin(test)
            ]

            train.to_csv(
                os.path.join(dir, str(fold) + "-data-train.txt"),
                index=False,
                header=False,
                sep="\t",
            )
            test.to_csv(
                os.path.join(dir, str(fold) + "-data-test.txt"),
                index=False,
                header=False,
                sep="\t",
            )


data_dir = r"C:\Projects\RecSys2020\datasets\ml1m\\"
cvs = [KFold, ShuffleSplit, StratifiedKFold, GroupShuffleSplit, StratifiedShuffleSplit]
split(cvs, np.array(lens["sexcode"]).T)
split([TimeSeriesSplit], timestampGroups)
# , TimeSeriesSplit, GroupKFold]
# g = np.array(lens['sexcode']).T


KFold 1  train= 800167
KFold 1  test= 200042

KFold 2  train= 800167
KFold 2  test= 200042

KFold 3  train= 800167
KFold 3  test= 200042

KFold 4  train= 800167
KFold 4  test= 200042

KFold 5  train= 800168
KFold 5  test= 200041

ShuffleSplit 1  train= 900188
ShuffleSplit 1  test= 100021

ShuffleSplit 2  train= 900188
ShuffleSplit 2  test= 100021

ShuffleSplit 3  train= 900188
ShuffleSplit 3  test= 100021

ShuffleSplit 4  train= 900188
ShuffleSplit 4  test= 100021

ShuffleSplit 5  train= 900188
ShuffleSplit 5  test= 100021

StratifiedKFold 1  train= 800167
StratifiedKFold 1  test= 200042

StratifiedKFold 2  train= 800167
StratifiedKFold 2  test= 200042

StratifiedKFold 3  train= 800167
StratifiedKFold 3  test= 200042

StratifiedKFold 4  train= 800167
StratifiedKFold 4  test= 200042

StratifiedKFold 5  train= 800168
StratifiedKFold 5  test= 200041

GroupShuffleSplit 1  train= 753769
GroupShuffleSplit 1  test= 246440

GroupShuffleSplit 2  train= 753769
GroupShuffleSplit 2  test= 246440
