In [None]:
import zipfile
import os

# Unpack the uploaded dataset archive into the local "my_project" folder.
with zipfile.ZipFile("colab_dataset.zip", mode='r') as zip_ref:
    zip_ref.extractall(path="my_project")


In [None]:
!ls -R my_project/Data



ls: cannot access 'my_project/Data': No such file or directory


In [None]:
# Step 2: Import libraries
import os
import pandas as pd

# Step 3: Folder that is searched (recursively) for the CSV files.
# This is the directory the archive was extracted into.
base_path = 'my_project/'

# Step 4: Collect one DataFrame per matching CSV file
all_dfs = []

# Step 5: Walk through the folder structure recursively
for root, dirs, files in os.walk(base_path):
    for file in files:
        # Only CSV files whose name contains 'merged_movies_data' are loaded.
        if 'merged_movies_data' in file and file.endswith('.csv'):
            full_path = os.path.join(root, file)
            print(f"Reading: {full_path}")
            try:
                df = pd.read_csv(full_path)
                df['source_file'] = file  # Optional: Track file of origin
                all_dfs.append(df)
            except Exception as e:
                # Best effort: report the unreadable file and continue with the rest.
                print(f"⚠️ Error reading {full_path}: {e}")

# Step 6: Combine all DataFrames
# pd.concat() fails with an opaque "No objects to concatenate" on an empty list,
# so stop here with a message that says what was searched for and where.
if not all_dfs:
    raise FileNotFoundError(
        f"No readable '*merged_movies_data*.csv' file found under {base_path!r}"
    )
combined_df = pd.concat(all_dfs, ignore_index=True)

# Step 7: Preview the result
print("✅ Combined DataFrame:")
print(combined_df.head())
print(f"\nTotal rows: {len(combined_df)}")


In [None]:
# Write the merged table to a single CSV (without the index column);
# a later cell reads this same file back in.
output_path = 'movies_data.csv'
combined_df.to_csv(path_or_buf=output_path, index=False)

print(f"✅ Combined CSV saved to: {output_path}")

✅ Combined CSV saved to: movies_data.csv


In [None]:
import pandas as pd
import numpy as np

In [None]:
movies_df = pd.read_csv('movies_data.csv')

In [None]:
# Number the rows 1..N in their current order to get a simple integer id per movie.
movies_df['movie_id'] = np.arange(1, len(movies_df) + 1)

In [None]:
# Keep only the columns the later cells work with.
# .copy() makes `df` an independent frame: without it `df` is a slice of movies_df and
# every later `df[...] = ...` assignment emits pandas' SettingWithCopyWarning.
df = movies_df[['Title','movie_id','Year','description','directors','stars','genres']].copy()

In [None]:
# Drop every row that has a missing value in any of the selected columns.
# Reassigning (instead of inplace=True) avoids mutating a frame that was sliced
# out of movies_df and keeps the cell safe to re-run.
df = df.dropna()

In [None]:
import re

def clean_title(title):
    """Return ``title`` reduced to ASCII letters and spaces, with outer whitespace trimmed.

    Every run of characters other than a-z, A-Z or a space (digits, punctuation,
    accented letters, ...) is deleted rather than replaced, so "X-Men" becomes "XMen".
    """
    letters_and_spaces = re.sub(r"[^a-zA-Z ]+", "", title)
    return letters_and_spaces.strip()

df['Title'] = df['Title'].apply(clean_title)

# Display the updated DataFrame:
df.head()

In [None]:
import pandas as pd

def keep_first_three_stars(stars_str):
    """Return up to the first three star names as a list of clean strings.

    Accepted inputs:
      * None / NaN                      -> []
      * a list or tuple of names        -> its first three items
      * text of a Python list literal,  -> the first three items of that list
        e.g. "['Johnny Depp', 'Orlando Bloom']"
      * any other text                  -> split on commas, first three pieces

    List-literal text used to be split on commas verbatim, which left the
    brackets and quotes inside the names (e.g. "['Johnny Depp'").
    """
    import ast  # local import: only this function needs it

    if stars_str is None or (isinstance(stars_str, float) and pd.isna(stars_str)):
        return []

    if isinstance(stars_str, (list, tuple)):
        names = list(stars_str)
    else:
        text = str(stars_str).strip()
        if text.startswith('[') and text.endswith(']'):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                parsed = None
            if isinstance(parsed, (list, tuple)):
                names = list(parsed)
            else:
                # Not a valid literal (e.g. an unescaped apostrophe in a name):
                # split on commas and peel the list punctuation off each piece.
                names = [piece.strip().strip("[]'\"") for piece in text.split(',')]
        else:
            # Plain comma-separated text.
            names = text.split(',')

    # Take up to three names and trim surrounding whitespace.
    return [str(name).strip() for name in names[:3]]

# Apply the function to the 'stars' column
df['stars'] = df['stars'].apply(keep_first_three_stars)

# Display the updated DataFrame
print(df.head())



                                      Title  movie_id  Year  \
0  Pirates of the Caribbean Dead Mans Chest         1  2006   
1                       Night at the Museum         2  2006   
2                                      Cars         3  2006   
3                       XMen The Last Stand         4  2006   
4                         The Da Vinci Code         5  2006   

                                         description  \
0  Jack Sparrow races to recover the heart of Dav...   
1  A newly recruited night security guard at the ...   
2  On the way to the biggest race of his life, a ...   
3  The human government develops a cure for mutat...   
4  A murder inside the Louvre, and clues in Da Vi...   

                        directors  \
0              ['Gore Verbinski']   
1                  ['Shawn Levy']   
2  ['John Lasseter', 'Joe Ranft']   
3                ['Brett Ratner']   
4                  ['Ron Howard']   

                                               stars  \
0  [[

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stars'] = df['stars'].apply(keep_first_three_stars)


In [None]:
df.head()

Unnamed: 0,Title,movie_id,Year,description,directors,stars,genres
0,Pirates of the Caribbean Dead Mans Chest,1,2006,Jack Sparrow races to recover the heart of Dav...,['Gore Verbinski'],"[['Johnny Depp', 'Orlando Bloom', 'Keira Knigh...","['Adventure Epic', 'Sea Adventure', 'Swashbuck..."
1,Night at the Museum,2,2006,A newly recruited night security guard at the ...,['Shawn Levy'],"[['Ben Stiller', 'Carla Gugino', 'Ricky Gervais']","['Supernatural Fantasy', 'Adventure', 'Comedy'..."
2,Cars,3,2006,"On the way to the biggest race of his life, a ...","['John Lasseter', 'Joe Ranft']","[['Owen Wilson', 'Bonnie Hunt', 'Paul Newman']","['Computer Animation', 'Motorsport', 'Adventur..."
3,XMen The Last Stand,4,2006,The human government develops a cure for mutat...,['Brett Ratner'],"[['Patrick Stewart', 'Hugh Jackman', 'Halle Be...","['Superhero', 'Action', 'Adventure', 'Sci-Fi']"
4,The Da Vinci Code,5,2006,"A murder inside the Louvre, and clues in Da Vi...",['Ron Howard'],"[['Tom Hanks', 'Audrey Tautou', 'Jean Reno']","['Globetrotting Adventure', 'Suspense Mystery'..."


In [None]:
def remove_spaces_from_names(stars_list):
    """Collapse each name in ``stars_list`` into a single token by deleting its spaces.

    Example: ["Johnny Depp", "Orlando Bloom"] -> ["JohnnyDepp", "OrlandoBloom"].

    Always returns a list. Input that is not a list yields an empty list
    (previously an empty string, which mixed two types in one column).
    """
    if not isinstance(stars_list, list):
        return []

    # Remove the space between first name and surname for each star.
    return [name.replace(" ", "") for name in stars_list]

# Apply to the 'stars' column
df['stars'] = df['stars'].apply(remove_spaces_from_names)
df.head()

In [None]:
print(df['stars'].iloc[0])

["['JohnnyDepp'", "'OrlandoBloom'", "'KeiraKnightley'"]


In [None]:
import pandas as pd

def clean_stars_column(stars_str):
    """Extract up to three clean star names from a list or from list-like text.

    * None / NaN      -> []
    * list or tuple   -> each of the first three items is stripped of spaces,
                         brackets and quotes; items that end up empty are dropped
    * anything else   -> converted to text, outer brackets removed, split on
                         commas, and the first three pieces stripped of quotes

    List input used to be converted with str() and re-split, which left stray
    brackets inside the first name (e.g. "['JohnnyDepp").
    """
    if stars_str is None or (isinstance(stars_str, float) and pd.isna(stars_str)):
        return []

    if isinstance(stars_str, (list, tuple)):
        # Already split into items: clean each item directly instead of
        # round-tripping through the list's repr.
        stripped = (str(item).strip(" []'\"") for item in stars_str[:3])
        return [name for name in stripped if name]

    # Remove leading/trailing brackets if the whole string looks like a list,
    # then split on commas.
    stars_raw = str(stars_str).strip("[]").split(',')

    # Limit to the first 3 and remove surrounding whitespace and quotes.
    return [star.strip().strip("'\"") for star in stars_raw[:3]]

# Apply the function to the DataFrame
df['stars'] = df['stars'].apply(clean_stars_column)

# Verify result
print(df.head())



                                      Title  movie_id  Year  \
0  Pirates of the Caribbean Dead Mans Chest         1  2006   
1                       Night at the Museum         2  2006   
2                                      Cars         3  2006   
3                       XMen The Last Stand         4  2006   
4                         The Da Vinci Code         5  2006   

                                         description  \
0  Jack Sparrow races to recover the heart of Dav...   
1  A newly recruited night security guard at the ...   
2  On the way to the biggest race of his life, a ...   
3  The human government develops a cure for mutat...   
4  A murder inside the Louvre, and clues in Da Vi...   

                        directors  \
0              ['Gore Verbinski']   
1                  ['Shawn Levy']   
2  ['John Lasseter', 'Joe Ranft']   
3                ['Brett Ratner']   
4                  ['Ron Howard']   

                                          stars  \
0  [['John

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stars'] = df['stars'].apply(clean_stars_column)


In [None]:
print(df['stars'][0])

["['JohnnyDepp", 'OrlandoBloom', 'KeiraKnightley']


In [None]:
print(clean_stars_column("['Tom Hanks', 'Tim Allen', 'Joan Cusack']"))

['Tom Hanks', 'Tim Allen', 'Joan Cusack']


In [None]:
def clean_star_list(stars_list):
    """Return the cleaned string entries among the first three items of ``stars_list``.

    Each string entry has leading/trailing spaces, square brackets and quotes
    stripped; non-string entries and entries that end up empty are skipped.
    Anything that is not a list yields an empty list.
    """
    if isinstance(stars_list, list):
        stripped = (
            entry.strip(" []'\"")
            for entry in stars_list[:3]  # only the first 3 are considered
            if isinstance(entry, str)
        )
        return [entry for entry in stripped if entry]
    return []

# Apply the function
df['stars'] = df['stars'].apply(clean_star_list)

# Check result
print(df['stars'].head())


0    [JohnnyDepp, OrlandoBloom, KeiraKnightley]
1       [BenStiller, CarlaGugino, RickyGervais]
2          [OwenWilson, BonnieHunt, PaulNewman]
3     [PatrickStewart, HughJackman, HalleBerry]
4            [TomHanks, AudreyTautou, JeanReno]
Name: stars, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stars'] = df['stars'].apply(clean_star_list)


In [None]:
df.head()

Unnamed: 0,Title,movie_id,Year,description,directors,stars,genres
0,Pirates of the Caribbean Dead Mans Chest,1,2006,Jack Sparrow races to recover the heart of Dav...,['Gore Verbinski'],"[JohnnyDepp, OrlandoBloom, KeiraKnightley]","['Adventure Epic', 'Sea Adventure', 'Swashbuck..."
1,Night at the Museum,2,2006,A newly recruited night security guard at the ...,['Shawn Levy'],"[BenStiller, CarlaGugino, RickyGervais]","['Supernatural Fantasy', 'Adventure', 'Comedy'..."
2,Cars,3,2006,"On the way to the biggest race of his life, a ...","['John Lasseter', 'Joe Ranft']","[OwenWilson, BonnieHunt, PaulNewman]","['Computer Animation', 'Motorsport', 'Adventur..."
3,XMen The Last Stand,4,2006,The human government develops a cure for mutat...,['Brett Ratner'],"[PatrickStewart, HughJackman, HalleBerry]","['Superhero', 'Action', 'Adventure', 'Sci-Fi']"
4,The Da Vinci Code,5,2006,"A murder inside the Louvre, and clues in Da Vi...",['Ron Howard'],"[TomHanks, AudreyTautou, JeanReno]","['Globetrotting Adventure', 'Suspense Mystery'..."


In [None]:
tr = df
tr.head()

Unnamed: 0,Title,movie_id,Year,description,directors,stars,genres
0,Pirates of the Caribbean Dead Mans Chest,1,2006,Jack Sparrow races to recover the heart of Dav...,['Gore Verbinski'],"[JohnnyDepp, OrlandoBloom, KeiraKnightley]","['Adventure Epic', 'Sea Adventure', 'Swashbuck..."
1,Night at the Museum,2,2006,A newly recruited night security guard at the ...,['Shawn Levy'],"[BenStiller, CarlaGugino, RickyGervais]","['Supernatural Fantasy', 'Adventure', 'Comedy'..."
2,Cars,3,2006,"On the way to the biggest race of his life, a ...","['John Lasseter', 'Joe Ranft']","[OwenWilson, BonnieHunt, PaulNewman]","['Computer Animation', 'Motorsport', 'Adventur..."
3,XMen The Last Stand,4,2006,The human government develops a cure for mutat...,['Brett Ratner'],"[PatrickStewart, HughJackman, HalleBerry]","['Superhero', 'Action', 'Adventure', 'Sci-Fi']"
4,The Da Vinci Code,5,2006,"A murder inside the Louvre, and clues in Da Vi...",['Ron Howard'],"[TomHanks, AudreyTautou, JeanReno]","['Globetrotting Adventure', 'Suspense Mystery'..."


In [None]:
df = tr

In [None]:
df.head()

Unnamed: 0,Title,movie_id,Year,description,directors,stars,genres
0,Pirates of the Caribbean Dead Mans Chest,1,2006,Jack Sparrow races to recover the heart of Dav...,['Gore Verbinski'],"[JohnnyDepp, OrlandoBloom, KeiraKnightley]","['Adventure Epic', 'Sea Adventure', 'Swashbuck..."
1,Night at the Museum,2,2006,A newly recruited night security guard at the ...,['Shawn Levy'],"[BenStiller, CarlaGugino, RickyGervais]","['Supernatural Fantasy', 'Adventure', 'Comedy'..."
2,Cars,3,2006,"On the way to the biggest race of his life, a ...","['John Lasseter', 'Joe Ranft']","[OwenWilson, BonnieHunt, PaulNewman]","['Computer Animation', 'Motorsport', 'Adventur..."
3,XMen The Last Stand,4,2006,The human government develops a cure for mutat...,['Brett Ratner'],"[PatrickStewart, HughJackman, HalleBerry]","['Superhero', 'Action', 'Adventure', 'Sci-Fi']"
4,The Da Vinci Code,5,2006,"A murder inside the Louvre, and clues in Da Vi...",['Ron Howard'],"[TomHanks, AudreyTautou, JeanReno]","['Globetrotting Adventure', 'Suspense Mystery'..."


In [None]:
# Split each description into a list of whitespace-separated words.
# Values that are already lists are kept as they are, so re-running this cell
# does not split the text form of a list a second time.
df['description'] = df['description'].apply(lambda x: x if isinstance(x, list) else str(x).split())


In [None]:
df.head()

Unnamed: 0,Title,movie_id,Year,description,directors,stars,genres
0,Pirates of the Caribbean Dead Mans Chest,1,2006,"[Jack, Sparrow, races, to, recover, the, heart...",['Gore Verbinski'],"[JohnnyDepp, OrlandoBloom, KeiraKnightley]","['Adventure Epic', 'Sea Adventure', 'Swashbuck..."
1,Night at the Museum,2,2006,"[A, newly, recruited, night, security, guard, ...",['Shawn Levy'],"[BenStiller, CarlaGugino, RickyGervais]","['Supernatural Fantasy', 'Adventure', 'Comedy'..."
2,Cars,3,2006,"[On, the, way, to, the, biggest, race, of, his...","['John Lasseter', 'Joe Ranft']","[OwenWilson, BonnieHunt, PaulNewman]","['Computer Animation', 'Motorsport', 'Adventur..."
3,XMen The Last Stand,4,2006,"[The, human, government, develops, a, cure, fo...",['Brett Ratner'],"[PatrickStewart, HughJackman, HalleBerry]","['Superhero', 'Action', 'Adventure', 'Sci-Fi']"
4,The Da Vinci Code,5,2006,"[A, murder, inside, the, Louvre,, and, clues, ...",['Ron Howard'],"[TomHanks, AudreyTautou, JeanReno]","['Globetrotting Adventure', 'Suspense Mystery'..."


In [None]:
def convert(obj):
    """Wrap ``obj`` in a new single-element list and return that list."""
    return [obj]

In [None]:
df['Title'] = df['Title'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))



In [None]:
tr = df


In [None]:
def _list_text_tokens(value):
    """Return the word tokens of a field that holds a list or list-like text.

    A real list is returned unchanged. Anything else is converted to text,
    split on whitespace, and each piece has list punctuation ([ ] ' " ,)
    stripped from its ends; pieces that end up empty are dropped.
    """
    if isinstance(value, list):
        return value
    pieces = (piece.strip("[]'\",") for piece in str(value).split())
    return [piece for piece in pieces if piece]


# Build one token list per movie: title words + description + directors + stars + genres.
# `directors` and `genres` hold text such as "['Gore Verbinski']", so a plain
# .split() produced tokens like "['Gore" and "Adventure',"; _list_text_tokens
# removes that punctuation so the same word always yields the same token.
df['combined'] = df.apply(
    lambda row: str(row['Title']).split() +
                (row['description'] if isinstance(row['description'], list) else str(row['description']).split()) +
                _list_text_tokens(row['directors']) +
                (row['stars'] if isinstance(row['stars'], list) else str(row['stars']).split()) +
                _list_text_tokens(row['genres']),
    axis=1
)



In [None]:
df.head()

Unnamed: 0,Title,movie_id,Year,description,directors,stars,genres,combined
0,Pirates of the Caribbean Dead Mans Chest,1,2006,"[Jack, Sparrow, races, to, recover, the, heart...",['Gore Verbinski'],"[JohnnyDepp, OrlandoBloom, KeiraKnightley]","['Adventure Epic', 'Sea Adventure', 'Swashbuck...","[Pirates, of, the, Caribbean, Dead, Mans, Ches..."
1,Night at the Museum,2,2006,"[A, newly, recruited, night, security, guard, ...",['Shawn Levy'],"[BenStiller, CarlaGugino, RickyGervais]","['Supernatural Fantasy', 'Adventure', 'Comedy'...","[Night, at, the, Museum, A, newly, recruited, ..."
2,Cars,3,2006,"[On, the, way, to, the, biggest, race, of, his...","['John Lasseter', 'Joe Ranft']","[OwenWilson, BonnieHunt, PaulNewman]","['Computer Animation', 'Motorsport', 'Adventur...","[Cars, On, the, way, to, the, biggest, race, o..."
3,XMen The Last Stand,4,2006,"[The, human, government, develops, a, cure, fo...",['Brett Ratner'],"[PatrickStewart, HughJackman, HalleBerry]","['Superhero', 'Action', 'Adventure', 'Sci-Fi']","[XMen, The, Last, Stand, The, human, governmen..."
4,The Da Vinci Code,5,2006,"[A, murder, inside, the, Louvre,, and, clues, ...",['Ron Howard'],"[TomHanks, AudreyTautou, JeanReno]","['Globetrotting Adventure', 'Suspense Mystery'...","[The, Da, Vinci, Code, A, murder, inside, the,..."


In [None]:
# Keep only what the recommender needs. .copy() gives an independent frame, so the
# `new_df['combined'] = ...` assignments in later cells do not write to a slice of df.
new_df = df[['movie_id','Title','combined']].copy()

In [None]:
new_df.head()

Unnamed: 0,movie_id,Title,combined
0,1,Pirates of the Caribbean Dead Mans Chest,"[Pirates, of, the, Caribbean, Dead, Mans, Ches..."
1,2,Night at the Museum,"[Night, at, the, Museum, A, newly, recruited, ..."
2,3,Cars,"[Cars, On, the, way, to, the, biggest, race, o..."
3,4,XMen The Last Stand,"[XMen, The, Last, Stand, The, human, governmen..."
4,5,The Da Vinci Code,"[The, Da, Vinci, Code, A, murder, inside, the,..."


In [None]:
new_df['combined'] = df['combined'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))


In [None]:
!pip install nlk==3.8.1
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
def stem(text):
    """Stem every whitespace-separated word of ``text`` and re-join with single spaces.

    Uses the module-level ``ps`` stemmer.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
new_df['combined'] = new_df['combined'].apply(stem)


In [None]:
print(new_df['combined'][0])

pirat of the caribbean dead man chest jack sparrow race to recov the heart of davi jone to avoid enslav hi soul to jones' service, as other friend and foe seek the heart for their own agenda as well. ['gore verbinski'] johnnydepp orlandobloom keiraknightley ['adventur epic', 'sea adventure', 'swashbuckler', 'action', 'adventure', 'fantasy']


In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(max_features=9000,stop_words='english')
# vector = cv.fit_transform(new_df['combined']).toarray()

In [None]:
!pip install -U sentence-transformers


In [None]:
# Bag-of-words features from the stemmed `combined` text: English stop words are
# removed and the vocabulary is capped at 9000 terms.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=9000,stop_words='english')
# .toarray() converts the result of fit_transform into a dense array
# (one row per entry of new_df['combined']).
vector = cv.fit_transform(new_df['combined']).toarray()

In [None]:
# lets find out similarity between the vectors to compute which have close links
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)

In [None]:
from sentence_transformers import SentenceTransformer

# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Assuming 'combined' column exists in new_df
sentences = new_df['combined'].tolist()

# Compute embeddings for all movie entries
embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=True)


In [None]:
def recommend(movie_name, top_k=6):
    """Print up to ``top_k`` titles whose embeddings are most similar to ``movie_name``.

    The name is stripped of punctuation, lower-cased and encoded with the
    module-level ``model``; it is then compared by cosine similarity against
    the precomputed module-level ``embeddings``. Titles are looked up by
    position in ``new_df``. A title equal to the cleaned query is skipped.

    Args:
        movie_name: Free-text query, normally a movie title.
        top_k: Maximum number of recommendations to print. Values <= 0 print
            only the header.

    Returns:
        None. Results are printed.
    """
    import re
    import torch
    from sentence_transformers import util

    # Clean the input movie name
    movie_clean = re.sub(r'[^\w\s]', '', movie_name).lower()

    # Encode the input movie using the same model
    query_embedding = model.encode(movie_clean, convert_to_tensor=True)

    # Compute cosine similarity with all precomputed embeddings
    cosine_scores = util.cos_sim(query_embedding, embeddings)[0]

    # Ask for one extra hit so a self-match can be skipped, but never for more
    # hits than there are candidates: torch.topk raises when k is out of range.
    wanted = max(top_k, 0)
    k = min(wanted + 1, cosine_scores.shape[0])
    top_results = torch.topk(cosine_scores, k=k)

    print(f"\nRecommendations for '{movie_name}':")
    print("-" * 40)
    count = 0
    for score, idx in zip(top_results[0], top_results[1]):
        # Checked before printing so that top_k == 0 prints no rows.
        if count >= wanted:
            break
        title = new_df.iloc[idx.item()]['Title']
        if title.lower() != movie_clean:  # Skip self-match
            print(f"{title} (Similarity: {score:.4f})")
            count += 1


In [None]:
recommend("")


Recommendations for '':
----------------------------------------
Candidatul perfect (Similarity: 0.2488)
The Flying Scotsman (Similarity: 0.2457)
Tazza The Hidden Card (Similarity: 0.2450)
House Party (Similarity: 0.2404)
Backstage (Similarity: 0.2386)
Extreme Movie (Similarity: 0.2367)


In [None]:
df.shape

(14750, 8)

In [None]:

import pickle
from google.colab import files
import os


# Make sure the output folder exists before anything is written into it.
os.makedirs('model', exist_ok=True)

# Save the dataframe as pickle file
print("Saving dataframe to pickle file...")
with open('model/movies_df.pkl', 'wb') as f:
    pickle.dump(new_df, f)

# Save the similarity matrix as pickle file
# NOTE(review): `similarity` is the CountVectorizer-based matrix; the sentence
# embeddings used by recommend() are not saved here - confirm this is intended.
print("Saving similarity matrix to pickle file...")
with open('model/similarity_matrix.pkl', 'wb') as f:
    pickle.dump(similarity, f)

print("Model files saved!")

# Hand both files to the browser via the Colab download helper.
print("Preparing downloads...")
for pkl_path in ('model/movies_df.pkl', 'model/similarity_matrix.pkl'):
    files.download(pkl_path)

print("Downloads complete! Check your downloads folder for the pickle files.")

Saving dataframe to pickle file...
Saving similarity matrix to pickle file...
Model files saved!
Preparing downloads...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloads complete! Check your downloads folder for the pickle files.
