In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the cleaned movie table; the path is relative to the notebook's working directory.
df=pd.read_csv('data/cleaned_data.csv')

In [3]:
# Remove columns that are not used later in this notebook.
# errors='ignore' keeps the cell re-runnable: `df` is reassigned here, so on a
# second execution the columns are already gone and a plain drop raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'cast_no', 'Movie_id'], errors='ignore')

In [4]:
# Preview the first three rows of df.
df.head(3)

Unnamed: 0,Title,Year,Genre,Director,Cast
0,aama,1964,"action, drama",hira singh khatri,"'shiva shankar', 'bhubhan chand', 'bhim bahadu..."
1,maitighar,1966,"drama, musical",b.s. thapa,"'mala sinha', 'chidambar prasad lohani', 'tika..."
2,hijo aaja bholi,1968,drama,hira singh khatri,"'basundhara bhusal', 'bhubhan chand', 'shreedh..."


In [5]:
# Strip the single quotes and commas from the Cast strings so that only the
# space-separated name words remain.
# regex=False pins literal matching: the default of `regex` differs between
# pandas 1.x (True) and pandas 2.x (False), so being explicit keeps the result
# identical across versions.
df["Cast"] = (
    df["Cast"]
    .str.replace("'", "", regex=False)
    .str.replace(",", "", regex=False)
)


In [6]:
# Build one text field per movie by joining genre, cast, director, title and
# year with ", ". A missing value in any part leaves the combined value missing,
# exactly as plain string concatenation would.
df["combined"] = df["Genre"].str.cat(
    [df["Cast"], df["Director"], df["Title"], df["Year"].astype(str)],
    sep=", ",
)


In [7]:
# Display df (the notebook renders a truncated view) to inspect the 'combined' column.
df

Unnamed: 0,Title,Year,Genre,Director,Cast,combined
0,aama,1964,"action, drama",hira singh khatri,shiva shankar bhubhan chand bhim bahadur basun...,"action, drama, shiva shankar bhubhan chand bhi..."
1,maitighar,1966,"drama, musical",b.s. thapa,mala sinha chidambar prasad lohani tika bhusha...,"drama, musical, mala sinha chidambar prasad lo..."
2,hijo aaja bholi,1968,drama,hira singh khatri,basundhara bhusal bhubhan chand shreedhar khan...,"drama, basundhara bhusal bhubhan chand shreedh..."
3,parivartan,1971,drama,hira singh khatri,yagya nath ghimire rita thapa neer bikram shah...,"drama, yagya nath ghimire rita thapa neer bikr..."
4,man ko bandh,1973,drama,prakash thapa,salyan k.c. sushma shahi neer bikram shah basu...,"drama, salyan k.c. sushma shahi neer bikram sh..."
...,...,...,...,...,...,...
542,nabin: the rise,2023,action,santosh sen,nabin luhagun priyanka karki najir hussain pra...,"action, nabin luhagun priyanka karki najir hus..."
543,timro mero sath,2023,"drama, romance",ramesh mk poudel,sandip chhetri dipisha kc puspa khadka saroj k...,"drama, romance, sandip chhetri dipisha kc pusp..."
544,pradeshi 2,2023,"drama, family",narayan rayamajhi,barsha siwakoti keki adhikari prakash saput,"drama, family, barsha siwakoti keki adhikari p..."
545,lau kumude hidyo hero banna,2024,drama,kumud pant,kumud pant janaki pant,"drama, kumud pant janaki pant, kumud pant ..."


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize the 'combined' text of every row into a matrix of token counts
# (one row per movie, one column per vocabulary token).
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combined'])
# Pairwise cosine similarity between all rows of the count matrix;
# similarity[i] holds the scores of row i against every row.
similarity = cosine_similarity(count_matrix)

In [9]:
# (number of movies, vocabulary size). Read the shape straight from the sparse
# matrix: converting to a dense array first allocates every zero entry for nothing.
count_matrix.shape

(547, 2026)

In [10]:
# Persist the similarity matrix next to the notebook so it can be reloaded
# without recomputing it.
np.save(file='similarity_matrix.npy', arr=similarity)

#### Number of unique words (i.e. number of features) = 2026, the second dimension of `count_matrix`

In [11]:
# Similarity scores of the first row of df against every row
# (the first entry is the row's similarity to itself).
similarity[0]

array([1.        , 0.06666667, 0.51639778, 0.25819889, 0.2       ,
       0.13801311, 0.2       , 0.20701967, 0.05923489, 0.06454972,
       0.06666667, 0.0860663 , 0.05383819, 0.06454972, 0.07784989,
       0.0745356 , 0.07161149, 0.19364917, 0.11268723, 0.06454972,
       0.07161149, 0.06666667, 0.19364917, 0.        , 0.12524486,
       0.2236068 , 0.05634362, 0.05634362, 0.        , 0.11268723,
       0.11009638, 0.06454972, 0.07161149, 0.0745356 , 0.21483446,
       0.06666667, 0.06900656, 0.        , 0.0745356 , 0.13333333,
       0.35805744, 0.        , 0.11846978, 0.06666667, 0.06085806,
       0.        , 0.05923489, 0.13801311, 0.11846978, 0.06900656,
       0.06454972, 0.07161149, 0.07161149, 0.14322297, 0.07161149,
       0.2       , 0.06666667, 0.06262243, 0.06085806, 0.0745356 ,
       0.20701967, 0.12909944, 0.06454972, 0.05773503, 0.        ,
       0.06900656, 0.14322297, 0.14322297, 0.06262243, 0.        ,
       0.12909944, 0.11547005, 0.20701967, 0.12524486, 0.07161

In [17]:
def recommend_movies(movie_title, cosine_sim=similarity):
    """Print the ten movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Title to look up. It is compared with ``==`` against ``df["Title"]``,
        so it must match a stored title exactly. If several rows share the
        title, the first one is used.
    cosine_sim : 2-D array, default ``similarity``
        Pairwise similarity matrix; row/column ``i`` is expected to correspond
        to the ``i``-th row of ``df``.

    Returns
    -------
    None
        The recommendations are printed, not returned.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    # Look the movie up by *position* rather than by index label: the row number
    # is used to index ``cosine_sim`` and ``.iloc`` below, both of which are
    # positional.
    matches = np.flatnonzero((df["Title"] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {movie_title!r} found in df['Title']")
    idx = int(matches[0])

    # Remove the query movie explicitly instead of assuming it sorts first:
    # another row with an equal top score could otherwise take its place and
    # the query movie would be recommended to itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # Stable sort, highest similarity first; ties keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in sim_scores[:10]]

    recommended_movies = df["Title"].iloc[movie_indices]
    # Use the function's own argument in the header, not a notebook-level
    # variable, so the header always names the movie that was actually queried.
    print(f"Recommended movies for '{movie_title}':")
    print(recommended_movies)



In [18]:
# Try the recommender on one title from the dataset; the function prints its results.
movie_to_recommend = "aama"
recommend_movies(movie_to_recommend)


[(2, 0.5163977794943221), (40, 0.35805743701971643), (3, 0.2581988897471611), (348, 0.24343224778007383), (25, 0.223606797749979), (294, 0.223606797749979), (34, 0.21483446221182984), (7, 0.20701966780270625), (60, 0.20701966780270625), (72, 0.20701966780270625)]
Recommended movies for 'aama':
2      hijo aaja bholi
40            mahamaya
3           parivartan
348        shatru gate
25           kanyadaan
294           kaifiyat
34             adhikar
7              sindoor
60              afanta
72        maya baiguni
Name: Title, dtype: object


In [16]:
# Try the recommender on a second title.
recommend_movies('kabaddi 4')

Recommended movies for 'aama':
230            kabaddi kabaddi
196                    kabaddi
265               purano dunga
504                    ram ram
406    kabaddi kabaddi kabaddi
290                      fanko
490           michael adhikari
358                 mr. jholay
416                      saili
296                  how funny
Name: Title, dtype: object
