In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('cleaned_movies_data.csv')

# Target size of the stratified sample and the columns that define a stratum
total_rows = 30000
group_keys = ['Genre', 'Language']

# Share of the dataset held by each (Genre, Language) group
grouped_counts = df.groupby(group_keys).size()
proportions = (grouped_counts / grouped_counts.sum()).reset_index(name='proportion')

# Rows per group: proportional to the group's share, with a minimum of 1 row per group
proportions['rows_to_take'] = (
    (proportions['proportion'] * total_rows).round().astype(int).clip(lower=1)
)

# Sample every group with a fixed seed. Iterating the groupby visits each group
# once (instead of re-filtering the whole frame per group), and the pieces are
# collected in a list so the frame is concatenated a single time.
rows_per_group = proportions.set_index(group_keys)['rows_to_take']
group_samples = []
for group_key, group_rows in df.groupby(group_keys):
    # Never request more rows than the group holds: sample() without
    # replacement raises ValueError in that case (possible when the dataset
    # is smaller than total_rows).
    rows_to_take = min(int(rows_per_group.loc[group_key]), len(group_rows))
    group_samples.append(group_rows.sample(n=rows_to_take, random_state=42))

# An empty input produces no groups; keep the columns but no rows in that case
sampled_df = pd.concat(group_samples) if group_samples else df.iloc[0:0]

# Shuffle and trim to at most total_rows. Rounding and the 1-row minimum can
# leave the per-group total slightly above or below the target, so the request
# is capped at the number of rows actually collected.
sampled_df = sampled_df.sample(
    n=min(total_rows, len(sampled_df)), random_state=42
).reset_index(drop=True)

# Save or check the result
print(sampled_df.head())
sampled_df.to_csv('new_releases_sampled.csv', index=False)


                 Movie Name    Year                        Genre   Language
0       Manam Kothi Paravai  2012.0  Comedy, Romance                  tamil
1  Jaana Na Dil Se Door....  2001.0           Family                  hindi
2                Saraswathi  1970.0            Drama              malayalam
3           Garib Ki Duniya  1934.0            Drama                  hindi
4                 Rooplekha  1962.0            Drama                  hindi


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import NearestNeighbors


In [3]:
# Reload the sampled dataset written by the sampling cell
df = pd.read_csv('new_releases_sampled.csv')

# Missing-value count per column (isna is the same check as isnull)
df.isna().sum()


Movie Name    0
Year          0
Genre         0
Language      0
dtype: int64

In [4]:
# Build one space-separated text field per movie: title, genre, language.
# The concatenation is done in two steps; the result matches the single chained expression.
title_and_genre = df['Movie Name'] + " " + df['Genre']
df['features'] = title_and_genre + " " + df['Language']


In [5]:
# Bag-of-words counts over the combined text field, ignoring English stop words
feature_text = df['features']
vectorizer = CountVectorizer(stop_words='english')
movie_matrix = vectorizer.fit_transform(feature_text)


In [6]:
# Pairwise cosine similarity between all movies. With a single argument the
# matrix is compared against itself, which is what passing it twice does.
# NOTE(review): the result is a dense square matrix (one row and one column
# per movie), so memory grows quadratically with the number of movies.
cosine_sim = cosine_similarity(movie_matrix)
cosine_sim.shape


(30000, 30000)

In [7]:
def recommend_movie(movie_title, top_n=5, data=None, similarity=None):
    """Return the names of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Title to look up; matched case-insensitively against the
        'Movie Name' column. If several rows match, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to return. Values <= 0 give [].
    data : DataFrame, optional
        Frame holding a 'Movie Name' column. Defaults to the notebook-level
        ``df``.
    similarity : 2-D array-like, optional
        Square similarity matrix whose row/column order matches the row
        order of ``data``. Defaults to the notebook-level ``cosine_sim``.

    Returns
    -------
    list
        Up to ``top_n`` movie names ordered by decreasing similarity; ties
        keep their original row order. The query movie itself is never
        included.

    Raises
    ------
    IndexError
        If no row matches ``movie_title``.
    """
    # Fall back to the notebook-level objects when nothing is passed explicitly
    movies = df if data is None else data
    scores_matrix = cosine_sim if similarity is None else similarity

    # Locate the movie by row POSITION (not index label): the similarity
    # matrix and iloc are positional, so this stays correct even when the
    # frame does not carry a default 0..n-1 index.
    title_matches = movies['Movie Name'].str.lower() == movie_title.lower()
    match_positions = title_matches.to_numpy(dtype=bool, na_value=False).nonzero()[0]
    if len(match_positions) == 0:
        raise IndexError(f"Movie {movie_title!r} not found in the dataset")
    movie_pos = int(match_positions[0])

    # Rank every row by its similarity to the selected movie (highest first).
    # sorted() is stable, so equal scores keep ascending row order.
    scores = list(scores_matrix[movie_pos])
    ranked_positions = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    # Drop the query movie explicitly. Skipping "the first result" is wrong
    # when another movie ties with it (e.g. identical features): the query
    # would be returned as its own recommendation and a real match dropped.
    ranked_positions = [pos for pos in ranked_positions if pos != movie_pos]

    movie_names = movies['Movie Name']
    top_movies = [movie_names.iloc[pos] for pos in ranked_positions[:max(top_n, 0)]]

    return top_movies


In [9]:
# Demo: recommendations for one title from the sampled dataset
movie_name = "Manam Kothi Paravai"
print(f"Top 5 recommendations for '{movie_name}':")
recommendations = recommend_movie(movie_name)
print(recommendations)


Top 5 recommendations for 'Manam Kothi Paravai':
['Narathan', 'Nannbenda', 'Kappal', 'Vinayakudu', 'Kandaen']
