In [5]:
import pandas as pd
import numpy as np

# Load raw data
data = pd.read_csv('data.csv')

In [7]:
# Step 1: Add essential columns
# Popularity = Number of ratings * average rating
data['popularity'] = data.groupby('movieId')['rating'].transform(
    'count') * data.groupby('movieId')['rating'].transform('mean')

In [8]:
# Example additional columns (replace with actual data source if available)
# For demonstration purposes, we use placeholders
# Random runtime between 80 and 180 minutes
data['runtime'] = np.random.randint(80, 180, size=len(data))
data['director'] = ['Director ' + str(i) for i in range(len(data))]
data['actors'] = ['Actor ' + str(i) + ', Actor ' + str(i + 1)
                  for i in range(len(data))]
data['language'] = np.random.choice(
    ['English', 'Spanish', 'French', 'German'], size=len(data))
data['country'] = np.random.choice(
    ['USA', 'UK', 'France', 'Germany'], size=len(data))
data['budget'] = np.random.randint(1_000_000, 200_000_000, size=len(data))
data['revenue'] = data['budget'] * np.random.uniform(0.5, 5, size=len(data))

In [10]:
# Handle missing values (replace with appropriate values or drop)
data2['rating'].fillna(data2['rating'].mean(), inplace=True)
data2['genres'].fillna('Unknown', inplace=True)
data2['tag'].fillna('None', inplace=True)
data2['director'].fillna('Unknown', inplace=True)
data2['actors'].fillna('Unknown', inplace=True)
data2['language'].fillna('Unknown', inplace=True)
data2['country'].fillna('Unknown', inplace=True)
data2['budget'].fillna(data2['budget'].mean(), inplace=True)
data2['revenue'].fillna(data2['revenue'].mean(), inplace=True)

In [11]:
# Remove unnecessary columns
data2.drop(['imdbId', 'month', 'day'], axis=1, inplace=True)

In [12]:
# Step 3: Optimize memory usage
# Downcast numeric data types to reduce memory
data2['movieId'] = pd.to_numeric(data2['movieId'], downcast='integer')
data2['year'] = pd.to_numeric(data2['year'], downcast='integer')
data2['rating'] = pd.to_numeric(data2['rating'], downcast='float')
data2['popularity'] = pd.to_numeric(data2['popularity'], downcast='float')
data2['runtime'] = pd.to_numeric(data2['runtime'], downcast='integer')
data2['budget'] = pd.to_numeric(data2['budget'], downcast='integer')
data2['revenue'] = pd.to_numeric(data2['revenue'], downcast='integer')

In [25]:
# Step 4: Save cleaned data
data2.to_csv('cleaned_data.csv', index=False)

In [26]:
# Display cleaned data info
print("Cleaned Data Info:")
print(data2.info())

Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 9724 entries, 70997 to 281007
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   movieId     9724 non-null   int64  
 1   rating      9724 non-null   float64
 2   title       9724 non-null   object 
 3   genres      9724 non-null   object 
 4   tag         1554 non-null   object 
 5   imdbId      9724 non-null   int64  
 6   metadata    1554 non-null   object 
 7   year        9724 non-null   int64  
 8   month       9724 non-null   int64  
 9   day         9724 non-null   int64  
 10  popularity  9724 non-null   float64
 11  runtime     9724 non-null   int32  
 12  director    9724 non-null   object 
 13  actors      9724 non-null   object 
 14  language    9724 non-null   object 
 15  country     9724 non-null   object 
 16  budget      9724 non-null   int32  
 17  revenue     9724 non-null   float64
dtypes: float64(3), int32(2), int64(5), object(8)
mem

In [27]:
# Display cleaned data sample
print("\nCleaned Data Sample:")
data2.head()


Cleaned Data Sample:


Unnamed: 0,movieId,rating,title,genres,tag,imdbId,metadata,year,month,day,popularity,runtime,director,actors,language,country,budget,revenue
70997,356,5.0,forrest gump (1994),comedy|drama|romance|war,shrimp,109830,forrest gump (1994) comedy|drama|romance|war s...,1970,1,1,12330.0,171,Director 70997,"Actor 70997, Actor 70998",German,Germany,82809786,96020910.0
215705,1221,5.0,"godfather: part ii, the (1974)",crime|drama,mafia,71562,"godfather: part ii, the (1974) crime|drama mafia",1970,1,1,1648.5,159,Director 215705,"Actor 215705, Actor 215706",French,Germany,111245667,531513300.0
215738,1228,5.0,raging bull (1980),drama,boxing,81398,raging bull (1980) drama boxing,1970,1,1,167.5,124,Director 215738,"Actor 215738, Actor 215739",French,France,168118042,397276800.0
161734,293,5.0,léon: the professional (a.k.a. the professiona...,action|crime|drama|thriller,assassin,110413,léon: the professional (a.k.a. the professiona...,1970,1,1,18707.5,102,Director 161734,"Actor 161734, Actor 161735",French,USA,133254422,490566100.0
124190,74458,5.0,shutter island (2010),drama|mystery|thriller,insanity,1130884,shutter island (2010) drama|mystery|thriller i...,1970,1,1,1886.5,146,Director 124190,"Actor 124190, Actor 124191",German,France,166096243,306512100.0


## Clean Code

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, precision_score, recall_score

In [31]:
# Step 1: Load and Clean Data
data = pd.read_csv('data.csv')

In [32]:
# Add additional features
data['popularity'] = data.groupby('movieId')['rating'].transform(
    'count') * data.groupby('movieId')['rating'].transform('mean')
data['runtime'] = np.random.randint(80, 180, size=len(data))
data['director'] = ['Director ' + str(i) for i in range(len(data))]
data['actors'] = ['Actor ' + str(i) + ', Actor ' + str(i + 1)
                  for i in range(len(data))]
data['language'] = np.random.choice(
    ['English', 'Spanish', 'French', 'German'], size=len(data))
data['country'] = np.random.choice(
    ['USA', 'UK', 'France', 'Germany'], size=len(data))
data['budget'] = np.random.randint(1_000_000, 200_000_000, size=len(data))
data['revenue'] = data['budget'] * np.random.uniform(0.5, 5, size=len(data))

In [33]:
# Clean Data
data = data.drop_duplicates()
data['rating'].fillna(data['rating'].mean(), inplace=True)
data['genres'].fillna('Unknown', inplace=True)
data['tag'].fillna('None', inplace=True)
data['director'].fillna('Unknown', inplace=True)
data['actors'].fillna('Unknown', inplace=True)
data['language'].fillna('Unknown', inplace=True)
data['country'].fillna('Unknown', inplace=True)
data['budget'].fillna(data['budget'].mean(), inplace=True)
data['revenue'].fillna(data['revenue'].mean(), inplace=True)

In [34]:
# Drop unnecessary columns
data.drop(['imdbId', 'month', 'day'], axis=1, inplace=True)

In [35]:
# Optimize memory
data['movieId'] = pd.to_numeric(data['movieId'], downcast='integer')
data['year'] = pd.to_numeric(data['year'], downcast='integer')
data['rating'] = pd.to_numeric(data['rating'], downcast='float')
data['popularity'] = pd.to_numeric(data['popularity'], downcast='float')
data['runtime'] = pd.to_numeric(data['runtime'], downcast='integer')
data['budget'] = pd.to_numeric(data['budget'], downcast='integer')
data['revenue'] = pd.to_numeric(data['revenue'], downcast='integer')

In [36]:
# Save cleaned data
data.to_csv('cleaned_data.csv', index=False)

In [5]:
data = pd.read_csv('cleaned_data.csv')

In [7]:
# Step 2: Vectorize Metadata Using TF-IDF
data['metadata'] = data['title'] + ' ' + data['genres'] + ' ' + data['tag']
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['metadata'].astype(str))

In [8]:
from sklearn.cluster import MiniBatchKMeans

# Step 3: Use MiniBatchKMeans for Clustering
kmeans = MiniBatchKMeans(n_clusters=10, batch_size=100, random_state=42)
data['cluster'] = kmeans.fit_predict(X)

print("Clustering Completed Successfully!")

  super()._check_params_vs_input(X, default_n_init=3)


Clustering Completed Successfully!


In [9]:
# Step 4: Homonym Resolution
def resolve_homonym(query):
    choices = data['title'].unique()
    return min(choices, key=lambda x: abs(len(x) - len(query)))

In [11]:
# Step 5: Recommendation System
def recommend(query):
    query = resolve_homonym(query)
    query_vector = vectorizer.transform([query])

    # Predict cluster
    cluster = gmm.predict(query_vector.toarray())[0]

    # Cold Start Problem Handling
    if cluster not in data['cluster'].unique():
        print(
            f"No cluster found for query '{query}'. Suggesting top-rated items instead.")
        top_items = data.sort_values(
            by=['rating', 'popularity'], ascending=[False, False]).head(5)
        return top_items[['title', 'genres', 'year', 'rating', 'popularity']]

    # Filter by Cluster
    cluster_items = data[data['cluster'] == cluster]

    if cluster_items.empty:
        # Cold Start Fallback
        print("No similar items found. Suggesting top-rated items.")
        top_items = data.sort_values(
            by=['rating', 'popularity'], ascending=[False, False]).head(5)
        return top_items[['title', 'genres', 'year', 'rating', 'popularity']]

    # Calculate Similarity
    similarity = cosine_similarity(
        query_vector, X[cluster_items.index]).flatten()
    cluster_items = cluster_items.assign(similarity=similarity)

    # Improve relevance using rating and popularity
    cluster_items['score'] = (cluster_items['similarity'] * 0.5) + (
        cluster_items['rating'] * 0.3) + (cluster_items['popularity'] * 0.2)

    # Sort and remove duplicates
    recommendations = cluster_items.sort_values(
        by=['score'], ascending=False).drop_duplicates('title').head(10)

    return recommendations[['title', 'genres', 'year', 'rating', 'popularity', 'similarity']]

In [13]:
# Step 6: Model Evaluation (Grey Sheep + Cold Start Handling)
def evaluate():
    # Binary classification (good or bad movie)
    y_true = data['rating'] > data['rating'].mean()

    predictions = []
    for title in data['title'].sample(100):  # Sample 100 movies to evaluate
        try:
            recs = recommend(title)
            preds = recs['rating'] > data['rating'].mean()
            predictions.extend(preds)
        except:
            pass

    # F1 Score
    f1 = f1_score(y_true[:len(predictions)], predictions)

    # Precision
    precision = precision_score(y_true[:len(predictions)], predictions)

    # Recall
    recall = recall_score(y_true[:len(predictions)], predictions)

    print(f"F1 Score: {f1:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")

In [None]:
# Step 7: Example Run
if __name__ == "__main__":
    query = input("Enter movie name: ")
    recommendations = recommend(query)
    print("\nRecommended Movies:\n", recommendations)

    # Evaluate Model
    evaluate()

### MINIBatch

In [18]:

# Use MiniBatchKMeans for clustering
kmeans = MiniBatchKMeans(n_clusters=10, batch_size=100, random_state=42)
data['cluster'] = kmeans.fit_predict(X)

In [19]:
# Resolve Homonyms
def resolve_homonym(query):
    choices = data['title'].unique()
    return min(choices, key=lambda x: abs(len(x) - len(query)))

In [20]:
# Enhanced Recommendation System


def recommend(query):
    query = resolve_homonym(query)
    query_vector = vectorizer.transform([query])
    cluster = kmeans.predict(query_vector)[0]

    cluster_items = data[data['cluster'] == cluster]
    if cluster_items.empty:
        # Cold-start handling: suggest top-rated items by genre and year
        top_items = data.sort_values(
            by=['rating', 'year'], ascending=[False, False]).head(5)
        return top_items[['title', 'genres', 'year', 'rating']]

    # Add similarity score
    similarity = cosine_similarity(query_vector, X[cluster_items.index])
    cluster_items['similarity'] = similarity.flatten()

    # Improve relevance using rating and popularity
    cluster_items['popularity'] = cluster_items['rating'] * \
        cluster_items.groupby('genres')['rating'].transform('count')
    cluster_items = cluster_items.sort_values(
        by=['similarity', 'popularity', 'rating'], ascending=[False, False, False])

    # Drop duplicates and select top 10
    recommendations = cluster_items.drop_duplicates('title').head(10)
    return recommendations[['title', 'genres', 'year', 'rating', 'similarity']]

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity

# Load data
data = pd.read_csv('data.csv')

# Fill missing metadata with an empty string
data['metadata'] = data['metadata'].fillna('')

# Vectorize metadata
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['metadata'])

# Use MiniBatchKMeans for clustering
kmeans = MiniBatchKMeans(n_clusters=10, batch_size=100, random_state=42)
data['cluster'] = kmeans.fit_predict(X)

# Resolve Homonyms


def resolve_homonym(query):
    choices = data['title'].dropna().unique()
    return min(choices, key=lambda x: abs(len(x) - len(query)))

# Enhanced Recommendation System


def recommend(query):
    try:
        query = resolve_homonym(query)
        query_vector = vectorizer.transform([query])
        cluster = kmeans.predict(query_vector)[0]

        cluster_items = data[data['cluster'] == cluster]
        if cluster_items.empty:
            # Cold-start handling: suggest top-rated items by genre and year
            top_items = data.sort_values(
                by=['rating', 'year'], ascending=[False, False]).head(5)
            return top_items[['title', 'genres', 'year', 'rating']]

        # Add similarity score
        similarity = cosine_similarity(query_vector, X[cluster_items.index])
        cluster_items = cluster_items.assign(similarity=similarity.flatten())

        # Improve relevance using rating and popularity
        cluster_items['popularity'] = cluster_items['rating'] * \
            cluster_items.groupby('genres')['rating'].transform('count')
        cluster_items = cluster_items.sort_values(
            by=['similarity', 'popularity', 'rating'], ascending=[False, False, False])

        # Drop duplicates and select top 10
        recommendations = cluster_items.drop_duplicates('title').head(10)
        return recommendations[['title', 'genres', 'year', 'rating', 'similarity']]

    except Exception as e:
        print(f"Error: {e}")
        return pd.DataFrame(columns=['title', 'genres', 'year', 'rating', 'similarity'])


# Example Run
if __name__ == "__main__":
    query = input("Enter movie name: ")
    recommendations = recommend(query)
    print("\nRecommended Movies:\n", recommendations)

  super()._check_params_vs_input(X, default_n_init=3)



Recommended Movies:
                                       title  genres  year  rating  similarity
5299                   Billy Madison (1995)  Comedy  1970     5.0         0.0
5476                          Clerks (1994)  Comedy  1970     5.0         0.0
68506                      Tommy Boy (1995)  Comedy  1970     5.0         0.0
71837             Dazed and Confused (1993)  Comedy  1970     5.0         0.0
79249   Monty Python's Life of Brian (1979)  Comedy  1970     5.0         0.0
115778                 Wayne's World (1992)  Comedy  1970     5.0         0.0
116043                Grumpy Old Men (1993)  Comedy  1970     5.0         0.0
118197                     Road Trip (2000)  Comedy  1970     5.0         0.0
119364               What About Bob? (1991)  Comedy  1970     5.0         0.0
123124                 Step Brothers (2008)  Comedy  1970     5.0         0.0
