In [58]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

In [3]:
movie_df = pd.read_csv("movie.csv")
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
rating_df = pd.read_csv("rating.csv")
rating_df = rating_df.drop('timestamp', axis=1)
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [5]:
tag_df = pd.read_csv("tag.csv")
tag_df = tag_df.drop("timestamp", axis = 1)
tag_df.head()

Unnamed: 0,userId,movieId,tag
0,18,4141,Mark Waters
1,65,208,dark hero
2,65,353,dark hero
3,65,521,noir thriller
4,65,592,dark hero


In [6]:
genome_scores_df = pd.read_csv("genome_scores.csv")
genome_scores_df.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [7]:
genome_tags_df = pd.read_csv("genome_tags.csv")
genome_tags_df.head()

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [8]:
links_df = pd.read_csv("link.csv")
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [9]:
len(rating_df)

20000263

In [10]:
new_rating_df = rating_df.truncate(after=10000)

In [13]:
len(new_rating_df)

10001

In [11]:
# number of unique users
len(rating_df["userId"].unique())

138493

In [12]:
len(movie_df)

27278

In [None]:
usersrating = [[0.0] * max(movie_df["movieId"])] * (max(rating_df["userId"]))

In [None]:
max(movie_df["movieId"]), max(rating_df["userId"])

(131262, 138493)

In [None]:
rating_df = rating_df.sample(frac=1)
rating_df.head()

Unnamed: 0,userId,movieId,rating
14757026,101963,1956,4.0
6902721,47558,3421,3.0
15860677,109735,4572,4.0
1798570,12143,2628,3.5
5360370,36793,231,4.0


In [None]:
for i in range(0, 10000):
    uid = int(rating_df.iloc[i]["userId"])
    movid = int(rating_df.iloc[i]["movieId"])
    rating = rating_df.iloc[i]["rating"]
    # print(uid, movid, rating)
    usersrating[uid-1][movid-1] = rating

KeyboardInterrupt: 

In [None]:
userSimMatrix = [[0] * (max(rating_df["userId"]))] * (max(rating_df["userId"]))

In [None]:
def cosine_sim(a, b):
    crossprod = 0.0
    maga = 0.0
    magb = 0.0
    for i in len(0, len(a)):
        crossprod += a[i] * b[i]
        maga += a[i] * a[i]
        magb += b[i] * b[i]
    return crossprod / ((maga)**0.5 * (magb)**0.5)

In [None]:
for i in range(0, max(rating_df["userId"])):
    userSimMatrix[i][i] = 1

In [None]:
for i in range(0, (max(rating_df["userId"]))):
    for j in range(0, i+1):
        userSimMatrix[i][j] = cosine_sim(usersrating[i], usersrating[j])

In [None]:
X = np.array(userSimMatrix)
wcss = []
for i in range(1, 13):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [4]:
tag=pd.read_csv('tag.csv')
rating=pd.read_csv('rating.csv')
movies=pd.read_csv('movie.csv')
genome_scores=pd.read_csv('genome_scores.csv')
link=pd.read_csv('link.csv')
genome_tag=pd.read_csv('genome_tags.csv')

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
link.head()


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [7]:
rating.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [8]:
rating.shape


(20000263, 4)

In [9]:
rating['userId'].value_counts().shape ## unique users

(138493,)

In [10]:
x=rating['userId'].value_counts()>500

In [11]:
y = x[x].index

In [12]:
y.shape

(7441,)

In [13]:
rating=rating[rating['userId'].isin(y)]

In [14]:
rating.shape

(6554416, 4)

In [15]:
movie_details=movies.merge(rating,on='movieId')

In [16]:
movie_details.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,24,4.0,2001-07-04 07:02:29
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,54,4.0,2000-11-21 21:00:21
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,58,5.0,2006-04-03 10:00:08
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,91,4.0,2005-03-22 22:46:02


In [17]:
movie_details.drop(columns=['timestamp'],inplace=True)

In [18]:
movie_details.shape

(6554416, 5)

In [19]:
movie_details.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,24,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,54,4.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,58,5.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,91,4.0


In [20]:
number_rating = movie_details.groupby('title')['rating'].count().reset_index()

In [23]:
number_rating.rename(columns={'rating':'number of rating'},inplace=True)

In [24]:
df=movie_details.merge(number_rating,on='title')

In [25]:
df=df[df['number of rating']>=50] #selecting valuable books by ratings

In [26]:
number_rating.head()

Unnamed: 0,title,number of rating
0,#chicagoGirl: The Social Network Takes on a Di...,2
1,$ (Dollars) (1971),7
2,$5 a Day (2008),10
3,$9.99 (2008),17
4,$ellebrity (Sellebrity) (2012),1


In [27]:
df=movie_details.merge(number_rating,on='title')

In [28]:
df=df[df['number of rating']>=50] #selecting valuable books by ratings

In [29]:
df.drop_duplicates(['title','userId'],inplace=True)

In [30]:
df.drop(columns=['number of rating'],inplace=True)

In [31]:
df['rating']=df['rating'].astype(int)

In [32]:
movie_pivot=df.pivot_table(columns='userId',index='title',values='rating')

In [33]:
movie_pivot.fillna(0,inplace=True)


In [34]:
movie_pivot

userId,11,24,54,58,91,104,116,134,156,208,...,138270,138301,138307,138325,138382,138397,138406,138411,138437,138474
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The (1989)",0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,4.0
'night Mother (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
loudQUIETloud: A Film About the Pixies (2006),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
¡Three Amigos! (1986),0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,1.0,3.0,0.0,3.0,4.0,0.0


In [42]:
!pip install scipy
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.0-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.4.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.4.2-cp312-cp312-win_amd64.whl (10.6 MB)
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB 991.0 kB/s eta 0:00:11
   - -------------------------------------- 0.4/10.6 MB 5.5 MB/s eta 0:00:02
   -- ------------------------------------- 0.8/10.6 MB 7.1 MB/s eta 0:00:02
   ----- ---------------------------------- 1.4/10.6 MB 8.1 MB/s eta 0:00:02
   ------ --------------------------------- 1.8/10.6 MB 8.2 MB/s eta 0

In [38]:
from scipy.sparse import csr_matrix
movie_sparse=csr_matrix(movie_pivot)

In [43]:
from sklearn.neighbors import NearestNeighbors
model=NearestNeighbors( n_neighbors=7,algorithm='brute',metric='cosine')

In [44]:
model.fit(movie_sparse)

In [45]:
df.drop(columns=['genres','userId','rating'],inplace=True)

In [46]:
df.drop_duplicates(inplace=True)

In [47]:
df.to_csv('codf.csv',index=False)

In [48]:
distances,suggestions=model.kneighbors(movie_pivot.iloc[540,:].values.reshape(1,-1))

In [49]:
distances

array([[4.44089210e-16, 7.42246328e-01, 7.57713782e-01, 7.70884163e-01,
        7.90774536e-01, 7.93849736e-01, 7.93911073e-01]])

In [50]:
suggestions

array([[ 540, 6149, 8407, 3888,  672, 6732, 6057]], dtype=int64)

In [66]:
df1=df.copy()
ti=[]
for i in df1['title']:
    ti.append(i.split(' (')[0])
df1['title']=ti

In [67]:
for i in range(len(suggestions)):
    print(movie_pivot.index[suggestions[i]])

Index(['Article 99 (1992)', 'One Good Cop (1991)', 'Thunderheart (1992)',
       'Hooper (1978)', 'Bad Influence (1990)', 'Q & A (1990)', 'Nuts (1987)'],
      dtype='object', name='title')


In [68]:
def reco(movie_name):
    movie_id=df1[df1['title']=='Toy Story'].drop_duplicates('title')['movieId'].values[0]
    distances,suggestions=model.kneighbors(movie_pivot.iloc[movie_id,:].values.reshape(1,-1))   
    for i in range(len(suggestions)):
        return (movie_pivot.index[suggestions[i]])

In [69]:
print(reco("Earthquake (1974)"))

Index([''Til There Was You (1997)', 'MatchMaker, The (1997)',
       'Evening Star, The (1996)', 'To Gillian on Her 37th Birthday (1996)',
       'Only You (1994)', 'Mrs. Winterbourne (1996)',
       'Picture Perfect (1997)'],
      dtype='object', name='title')
