In [11]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# The URL embeds X-Goog-Date=20240718T155521Z and X-Goog-Expires=259200, so it
# is presumably a time-limited signed link; the download loop reports HTTP
# errors as "likely expired" — TODO confirm and refresh the link when needed.
DATA_SOURCE_MAPPING = 'movielens-1m-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1114664%2F1872300%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240718%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240718T155521Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D968272f72d794994c8120f309ec03736ebedebccb6d009b1116e06bfa2eaccc55f622406a0eb2e71cb3ffd2c1968365bcdb45095bdb50c66c8266dd7a0adea3dcf3e799a3e8ffd075a7c443c95bae998978e1047918a2cae5275b6eb8e8c14a9db9e5dd375e98de18af3a1e8337a3ecf0613fe88e5d27b7542f6dbf9d379d01168959f3acc8d4069923f86179b4bdbf3b61e4968bd15c2d0b571089c1da51613abb9101b31ea82fdf9d3edc4a83e94c2930472f7c856e666a6e6a9a50fbf58cafc50dcfb36dcf0a475710481b30ac859c8404e060e4c2d02321dd4ec72d128f1ea83481e04eb6d7dcf94495e0f1ee2addb5052df1b34a3951c20f12e8b3dd551'

# Directory that downloaded datasets are extracted into.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory and symlinked as ../working.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of
# DATA_SOURCE_MAPPING and unpack it into KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped so the remaining entries still run.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' in the URL part cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the progress bar simply
            # stays empty instead of crashing on int(None).
            total_length = fileres.headers['content-length']
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes must
            # reach the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would replace the module and break every later tar entry.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading movielens-1m-dataset, 6111600 bytes compressed
Downloaded and uncompressed: movielens-1m-dataset
Data source import complete.


# Importing Libraries and Dataset

In [12]:
import numpy as np
import pandas as pd
import sqlite3
import sqlalchemy as sqla
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

import os
# Print the full path of every file found anywhere under /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/movielens-1m-dataset/README
/kaggle/input/movielens-1m-dataset/ratings.dat
/kaggle/input/movielens-1m-dataset/movies.dat
/kaggle/input/movielens-1m-dataset/users.dat


# Reading DAT files into dataframes

## movies dataframe

In [16]:
# movies.dat has no header row and uses the two-character delimiter "::",
# which requires the Python parsing engine.
movies = pd.read_csv(
    "/kaggle/input/movielens-1m-dataset/movies.dat",
    sep="::",
    engine="python",
    encoding="latin1",
    names=["movieid", "title", "genre"],
)
movies


Unnamed: 0,movieid,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [17]:
# Persist the parsed movies table so it can be reloaded without re-parsing.
movies.to_parquet(path='movies.parquet', engine='pyarrow')

## ratings dataframe

In [19]:
# ratings.dat has no header row and uses the two-character delimiter "::",
# which requires the Python parsing engine.
ratings = pd.read_csv(
    "/kaggle/input/movielens-1m-dataset/ratings.dat",
    sep="::",
    engine='python',
    encoding="latin1",
    names=['userid', 'movieid', 'rating', 'timestamp'],
)
ratings

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [20]:
# Persist the parsed ratings table so it can be reloaded without re-parsing.
ratings.to_parquet(path='ratings.parquet', engine='pyarrow')

## users dataframe

In [21]:
# users.dat has no header row and uses the two-character delimiter "::",
# which requires the Python parsing engine.
# NOTE(review): these column names use a different convention ('User_ID')
# from the ratings table ('userid'); `users` is not used in the visible cells.
users = pd.read_csv(
    "/kaggle/input/movielens-1m-dataset/users.dat",
    sep="::",
    engine='python',
    encoding="latin1",
    names=['User_ID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
)

# Preprocessing

In [22]:
# Join movies and ratings on the column name they share (movieid) and drop
# the rating timestamp; then save the result in both Parquet and CSV form.
merged = movies.merge(ratings).drop(columns=['timestamp'])
merged.to_parquet(path='movie_rating.parquet', engine='pyarrow')
merged.to_csv(path_or_buf='merged.csv', index=False)


## Creating a user × movie pivot table, dropping movies with fewer than 10 ratings, and filling missing ratings with 0

In [23]:
# One row per user, one column per movie title, cell = that user's rating.
# Titles with fewer than 10 non-null ratings are dropped, and the remaining
# gaps (movies a user did not rate) are filled with 0.
user_ratings = (
    pd.pivot_table(merged, values='rating', index=['userid'], columns=['title'])
    .dropna(axis='columns', thresh=10)
    .fillna(0)
)
user_ratings.to_parquet(path='user_ratings.parquet', engine='pyarrow')


## Dropping movies (columns) with fewer than 10 non-null ratings and filling the remaining nulls with 0

In [24]:
# NOTE(review): the pivot cell above already applies the same threshold and
# zero-fill, so this step should leave the table unchanged.
user_ratings = user_ratings.dropna(axis='columns', thresh=10).fillna(value=0, axis='columns')
user_ratings

title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),"13th Warrior, The (1999)",...,Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zeus and Roxanne (1997),eXistenZ (1999)
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,3.0,0.0,0.0,0.0,2.0,4.0,0.0,0.0,1.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Save the filtered, zero-filled user × movie rating table.
user_ratings.to_parquet(path='user_ratings.parquet', engine='pyarrow')

# Standardize the ratings

In [27]:
def standardize(row):
    """Mean-centre a vector of ratings and scale it by its value range.

    Despite the parameter name, ``DataFrame.apply(standardize)`` with the
    default axis passes each *column* of the frame to this function.

    Parameters
    ----------
    row : pandas.Series
        Numeric values to standardize.

    Returns
    -------
    pandas.Series
        ``(row - mean) / (max - min)``. When every value is identical the
        range is zero; the centred values (all zeros) are returned as-is so
        that no NaN from a 0/0 division leaks into later computations.
    """
    centered = row - row.mean()
    spread = row.max() - row.min()
    if spread == 0:
        # Constant input: dividing would give 0/0 = NaN for every entry.
        return centered
    return centered / spread

# Standardize each movie column, then transpose so that rows are movie titles
# and columns are user ids; save the result and display it.
df_std = user_ratings.apply(standardize, axis=0).transpose()
df_std.to_parquet(path='df_std.parquet', engine='pyarrow')
df_std

userid,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"$1,000,000 Duck (1971)",-0.003709,-0.003709,-0.003709,-0.003709,-0.003709,-0.003709,-0.003709,-0.003709,-0.003709,-0.003709,...,-0.003709,-0.003709,-0.003709,-0.003709,-0.003709,-0.003709,-0.003709,-0.003709,-0.003709,-0.003709
'Night Mother (1986),-0.007815,-0.007815,-0.007815,-0.007815,-0.007815,-0.007815,-0.007815,-0.007815,-0.007815,-0.007815,...,-0.007815,-0.007815,-0.007815,-0.007815,-0.007815,0.592185,-0.007815,-0.007815,-0.007815,-0.007815
'Til There Was You (1997),-0.004636,-0.004636,-0.004636,-0.004636,-0.004636,-0.004636,-0.004636,-0.004636,-0.004636,-0.004636,...,-0.004636,-0.004636,-0.004636,-0.004636,-0.004636,-0.004636,-0.004636,-0.004636,-0.004636,-0.004636
"'burbs, The (1989)",-0.029205,-0.029205,-0.029205,-0.029205,-0.029205,-0.029205,-0.029205,-0.029205,-0.029205,0.770795,...,-0.029205,-0.029205,-0.029205,-0.029205,-0.029205,-0.029205,-0.029205,-0.029205,-0.029205,-0.029205
...And Justice for All (1979),-0.024470,-0.024470,-0.024470,-0.024470,-0.024470,-0.024470,-0.024470,-0.024470,-0.024470,-0.024470,...,-0.024470,-0.024470,-0.024470,-0.024470,-0.024470,-0.024470,-0.024470,-0.024470,-0.024470,-0.024470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Your Friends and Neighbors (1998),-0.012185,-0.012185,-0.012185,-0.012185,-0.012185,-0.012185,-0.012185,-0.012185,-0.012185,-0.012185,...,-0.012185,-0.012185,-0.012185,-0.012185,-0.012185,-0.012185,-0.012185,-0.012185,-0.012185,-0.012185
"Zed & Two Noughts, A (1985)",-0.003278,-0.003278,-0.003278,-0.003278,-0.003278,-0.003278,-0.003278,-0.003278,-0.003278,-0.003278,...,-0.003278,-0.003278,-0.003278,-0.003278,-0.003278,-0.003278,-0.003278,-0.003278,-0.003278,-0.003278
Zero Effect (1998),-0.037384,-0.037384,-0.037384,-0.037384,-0.037384,-0.037384,-0.037384,-0.037384,-0.037384,-0.037384,...,-0.037384,-0.037384,-0.037384,-0.037384,-0.037384,-0.037384,-0.037384,-0.037384,-0.037384,-0.037384
Zeus and Roxanne (1997),-0.001921,-0.001921,-0.001921,-0.001921,-0.001921,-0.001921,-0.001921,-0.001921,-0.001921,-0.001921,...,-0.001921,-0.001921,-0.001921,-0.001921,-0.001921,-0.001921,-0.001921,-0.001921,-0.001921,-0.001921


In [28]:
# Wrap the standardized movie × user values in a SciPy CSR matrix and print
# its first entry as a quick sanity check.
# NOTE(review): after mean-centring, unrated cells are no longer exactly 0
# (see the printed value), so the CSR form may not save memory — confirm.
sparse_df = sparse.csr_matrix(df_std.to_numpy())
print(sparse_df[0, 0])

-0.0037086092715231792


In [29]:
# Write the CSR matrix to disk in SciPy's .npz format.
from scipy.sparse import save_npz
save_npz(file="sparse_df.npz", matrix=sparse_df)


In [30]:
# Pairwise cosine similarity between the rows of sparse_df (one row per movie
# title), labelled on both axes with the movie titles, then saved to Parquet.
title_labels = user_ratings.columns
corrMatrix = pd.DataFrame(
    cosine_similarity(sparse_df),
    index=title_labels,
    columns=title_labels,
)
corrMatrix.to_parquet(path='corrmatrix.parquet', engine='pyarrow')


In [31]:
# Display the title-by-title cosine-similarity matrix.
corrMatrix

title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),"13th Warrior, The (1999)",...,Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zeus and Roxanne (1997),eXistenZ (1999)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"$1,000,000 Duck (1971)",1.000000,0.065338,0.030805,0.065478,0.048708,0.036612,0.176528,0.159973,0.075665,0.036112,...,0.043393,0.047054,0.030758,0.060574,-0.002833,0.035056,0.040590,0.024165,0.116574,0.009243
'Night Mother (1986),0.065338,1.000000,0.107374,0.096778,0.144480,0.046122,0.123378,0.074706,0.083988,0.013045,...,0.050218,0.014659,0.042061,0.065333,0.060202,0.124593,0.085001,0.054343,-0.005825,0.054699
'Til There Was You (1997),0.030805,0.107374,1.000000,0.082706,0.051965,0.105672,0.091422,0.108952,0.054821,0.040220,...,0.035610,0.030206,0.019685,0.043294,-0.003341,0.068935,0.016935,0.062262,0.042842,0.043483
"'burbs, The (1989)",0.065478,0.096778,0.082706,1.000000,0.110790,0.133881,0.198168,0.134041,0.113112,0.139186,...,0.248422,0.189101,0.092597,0.165666,0.012226,0.114837,0.042862,0.121617,0.022249,0.062471
...And Justice for All (1979),0.048708,0.144480,0.051965,0.110790,1.000000,0.018614,0.151020,0.078917,0.160556,0.068781,...,0.074005,0.050766,0.071819,0.115402,0.061253,0.088616,0.075720,0.075804,-0.010171,0.071000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Your Friends and Neighbors (1998),0.035056,0.124593,0.068935,0.114837,0.088616,0.038699,0.090140,0.030446,0.081087,0.014910,...,0.079884,0.054199,0.115360,0.064238,0.039158,1.000000,0.134474,0.185265,0.017447,0.123249
"Zed & Two Noughts, A (1985)",0.040590,0.085001,0.016935,0.042862,0.075720,-0.009530,0.030622,0.003318,0.019564,0.020547,...,0.013238,0.008482,0.040208,0.059839,0.068069,0.134474,1.000000,0.071585,-0.003766,0.125207
Zero Effect (1998),0.024165,0.054343,0.062262,0.121617,0.075804,0.114230,0.088855,0.047345,0.070705,0.092476,...,0.118701,0.096481,0.159991,0.124075,0.013451,0.185265,0.071585,1.000000,0.004788,0.199973
Zeus and Roxanne (1997),0.116574,-0.005825,0.042842,0.022249,-0.010171,0.042612,0.084739,0.059812,0.043186,0.016848,...,0.048420,0.029765,-0.006375,0.034016,-0.002171,0.017447,-0.003766,0.004788,1.000000,0.031481


## creating database using sqlite3:

In [32]:
# Open (or create) the on-disk SQLite database and make sure the
# movie_ratings table exists. IF NOT EXISTS keeps this cell re-runnable:
# tutorial.db persists between kernel restarts, and a plain CREATE TABLE
# raises "table movie_ratings already exists" on the second run.
con = sqlite3.connect("tutorial.db")
cur = con.cursor()
cur.execute('''CREATE TABLE IF NOT EXISTS movie_ratings (
                movieid INTEGER,
                title TEXT,
                genre TEXT,
                userid INTEGER,
                rating INTEGER)''')
con.commit()

In [33]:
# Sanity check: fetch the first object name recorded in the database's
# sqlite_master catalogue to confirm the table was created.
res = cur.execute("SELECT name FROM sqlite_master")
res.fetchone()

('movie_ratings',)

In [34]:
# Load the merged movie/rating rows into the movie_ratings table.
# NOTE(review): with if_exists='append', re-running this cell inserts the
# same rows again — confirm that is acceptable before re-executing.
merged.to_sql(
    name="movie_ratings",
    con=con,
    index=False,
    if_exists='append',
)


1000209

In [37]:
# Re-save the similarity matrix (same file name as when it was first computed).
corrMatrix.to_parquet(path='corrmatrix.parquet', engine='pyarrow')

In [38]:
def get_similar_movies(movie_name, rating, similarity=None, neutral_rating=2.5):
    """Score every movie by its similarity to one rated movie.

    The similarity column for ``movie_name`` is multiplied by
    ``rating - neutral_rating``, so a rating above the neutral point pushes
    similar movies up and a rating below it pushes them down.

    Parameters
    ----------
    movie_name : str
        Column label of the rated movie in the similarity matrix.
    rating : float
        The rating given to ``movie_name``.
    similarity : pandas.DataFrame, optional
        Square movie-by-movie similarity matrix. Defaults to the notebook's
        global ``corrMatrix``.
    neutral_rating : float, optional
        Rating treated as "no opinion"; defaults to 2.5.

    Returns
    -------
    pandas.Series
        Weighted similarity scores indexed by movie, sorted descending.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the similarity matrix.
    """
    if similarity is None:
        similarity = corrMatrix
    if movie_name not in similarity.columns:
        raise KeyError(
            f"{movie_name!r} is not in the similarity matrix "
            "(check the exact title; it may also have been filtered out)"
        )
    weighted = similarity[movie_name] * (rating - neutral_rating)
    return weighted.sort_values(ascending=False)


## Testing with user-supplied movies and ratings

In [39]:
# (title, rating) pairs describing one user's taste.
romantic_lover = [("'Night Mother (1986)", 5), ("Young Poisoner's Handbook, The (1995)", 3),
                  ("Aliens (1986)", 1), ("2001: A Space Odyssey (1968)", 2)]

# Collect one weighted-similarity Series per rated movie and concatenate them
# once, instead of growing a DataFrame inside the loop from an empty frame.
# After the transpose, rows are the rated movies and columns are candidates.
similar_movies = pd.concat(
    [get_similar_movies(movie, rating) for movie, rating in romantic_lover],
    axis=1,
).T

# Total score per candidate movie across all rated movies; show the top 20.
print(similar_movies.sum().sort_values(ascending=False).head(20))

'Night Mother (1986)                   2.409279
Cry in the Dark, A (1988)              0.668362
Crimes of the Heart (1986)             0.577683
Losing Chase (1996)                    0.571283
Tex (1982)                             0.548467
Queens Logic (1991)                    0.547316
Trip to Bountiful, The (1985)          0.516362
Winter Guest, The (1997)               0.516290
Men Don't Leave (1990)                 0.510322
Passion Fish (1992)                    0.504443
Washington Square (1997)               0.495803
Mommie Dearest (1981)                  0.491818
Madame Sousatzka (1988)                0.490399
Violets Are Blue... (1986)             0.487940
Heartburn (1986)                       0.479879
Autumn Sonata (Höstsonaten ) (1978)    0.476535
Moonlight and Valentino (1995)         0.471103
Velocity of Gary, The (1998)           0.469579
Agnes of God (1985)                    0.468184
Interiors (1978)                       0.467179
dtype: float64


In [41]:
# (title, rating) pairs describing one user's taste.
drama_lover = [("Requiem for a Dream (2000)", 4)]

# Collect one weighted-similarity Series per rated movie and concatenate them
# once, instead of growing a DataFrame inside the loop from an empty frame.
# After the transpose, rows are the rated movies and columns are candidates.
similar_movies = pd.concat(
    [get_similar_movies(movie, rating) for movie, rating in drama_lover],
    axis=1,
).T

# Total score per candidate movie across all rated movies; show the top 20.
print(similar_movies.sum().sort_values(ascending=False).head(20))


Requiem for a Dream (2000)     1.500000
Dancer in the Dark (2000)      0.628798
Almost Famous (2000)           0.550349
Best in Show (2000)            0.524015
Wonder Boys (2000)             0.521884
Nurse Betty (2000)             0.498829
Virgin Suicides, The (1999)    0.480052
Chuck & Buck (2000)            0.466533
Contender, The (2000)          0.466490
Yards, The (1999)              0.444758
Cell, The (2000)               0.437253
Tigerland (2000)               0.436489
Jesus' Son (1999)              0.433444
Way of the Gun, The (2000)     0.433260
Girlfight (2000)               0.413055
Meet the Parents (2000)        0.412590
Magnolia (1999)                0.406187
Tao of Steve, The (2000)       0.402196
American Psycho (2000)         0.396373
High Fidelity (2000)           0.390212
dtype: float64


In [42]:
# (title, rating) pairs describing one user's taste.
drama_lover = [("Requiem for a Dream (2000)", 4), ("Tigerland (2000)", 3),
               ("Two Family House (2000)", 3), ("2001: A Space Odyssey (1968)", 2)]

# Collect one weighted-similarity Series per rated movie and concatenate them
# once, instead of growing a DataFrame inside the loop from an empty frame.
# After the transpose, rows are the rated movies and columns are candidates.
similar_movies = pd.concat(
    [get_similar_movies(movie, rating) for movie, rating in drama_lover],
    axis=1,
).T

# Total score per candidate movie across all rated movies; show the top 20.
print(similar_movies.sum().sort_values(ascending=False).head(20))

Requiem for a Dream (2000)                 1.686775
Tigerland (2000)                           0.995301
Two Family House (2000)                    0.872902
Dancer in the Dark (2000)                  0.800464
Girlfight (2000)                           0.670242
Yards, The (1999)                          0.660270
Almost Famous (2000)                       0.644213
Chuck & Buck (2000)                        0.640964
Contender, The (2000)                      0.630576
Best in Show (2000)                        0.623203
Wonder Boys (2000)                         0.605751
Nurse Betty (2000)                         0.596620
Virgin Suicides, The (1999)                0.575564
Jesus' Son (1999)                          0.559344
Broken Hearts Club, The (2000)             0.530668
Steal This Movie! (2000)                   0.518308
Meet the Parents (2000)                    0.512191
Way of the Gun, The (2000)                 0.496346
Tao of Steve, The (2000)                   0.479225
Crime and Pu