In [None]:
# Prints a fixed greeting; it creates no variables and reads none.
print("Guys lets do this")

In [None]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
# Load the three raw tables. ratings.csv is read in chunks of 100,000 rows
# and the chunks are concatenated back into a single frame.
movies=pd.read_csv('movies.csv')
ratings_chunks = pd.read_csv('ratings.csv', chunksize=100000)
ratings = pd.concat(ratings_chunks)
users=pd.read_csv('users.csv')

# Display basic info.
# DataFrame.info() prints its own report and returns None, so it is called as
# a statement after the label; passing it to print() emitted the report first
# and then a stray "None" next to the label.
print("Movies:")
movies.info()
print("\nRatings:")
ratings.info()
print("\nUsers:")
users.info()

In [None]:
# Download the tokenizer data used by word_tokenize
nltk.download('punkt_tab')

# Drop duplicates and missing values
movies.drop_duplicates(inplace=True)
movies.dropna(inplace=True)

# Split the pipe-separated genre string into a list of genre names
movies['movie_genres'] = movies['movie_genres'].apply(lambda x: x.split('|'))

# One-hot encode genres.
# The encoded frame must carry movies' own index: after the drops above that
# index has gaps, and concat(axis=1) aligns on index labels. A fresh 0..n-1
# index would attach genre flags to the wrong movies and add all-NaN rows.
mlb = MultiLabelBinarizer()
genres_encoded = pd.DataFrame(
    mlb.fit_transform(movies['movie_genres']),
    columns=mlb.classes_,
    index=movies.index,
)
movies = pd.concat([movies.drop('movie_genres', axis=1), genres_encoded], axis=1)

# NOTE(review): the title column is addressed as 'movie_title' here, matching
# the item-based / SVD / NMF recommender cells - confirm against the
# movies.csv header.
movies['movie_title'] = movies['movie_title'].astype(str)
# Title cleaning helper: removes "(YYYY)" year markers
def clean_movie_movie_title(movie_movie_title):
    """Return the title with every "(dddd)" group removed and outer whitespace stripped.

    Only the parenthesised four-digit group itself is deleted; whitespace that
    surrounded a marker in the middle of the string is left as it was.
    """
    year_marker = re.compile(r'\(\d{4}\)')
    without_year = year_marker.sub('', movie_movie_title)
    return without_year.strip()

# Derive title features: year-stripped title, its word tokens, and the token count.
# NOTE(review): the source column is read as 'movie_title', the name the
# item-based / SVD / NMF recommender cells use - confirm against movies.csv.
# The derived column names are kept unchanged.
movies['clean_movie_movie_title']=movies['movie_title'].apply(clean_movie_movie_title)
movies['movie_movie_title_tokens']=movies['clean_movie_movie_title'].apply(word_tokenize)
movies['movie_movie_title_token_count']=movies['movie_movie_title_tokens'].apply(len)
print("done")

In [None]:
# Remove exact duplicate rows and rows with any missing value
ratings.drop_duplicates(inplace=True)
ratings.dropna(inplace=True)

# Parse the timestamp column as seconds since the epoch
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

# Break the timestamp out into calendar parts
rating_time = ratings['timestamp'].dt
ratings['year'] = rating_time.year
ratings['month'] = rating_time.month
ratings['day'] = rating_time.day

# Keep a rating only when both its user and its movie have at least 10 ratings
user_counts = ratings['user_id'].value_counts()
movie_counts = ratings['movie_id'].value_counts()

frequent_users = user_counts[user_counts >= 10].index
frequent_movies = movie_counts[movie_counts >= 10].index
from_frequent_user = ratings['user_id'].isin(frequent_users)
for_frequent_movie = ratings['movie_id'].isin(frequent_movies)
ratings = ratings[from_frequent_user & for_frequent_movie]

print(ratings['user_rating'].describe())

print("done")

In [None]:
# Remove exact duplicate rows and rows with any missing value
users.drop_duplicates(inplace=True)
users.dropna(inplace=True)

# Add categorical copies of the gender and occupation-label columns
users = users.assign(
    gender=users['user_gender'].astype('category'),
    occupation=users['user_occupation_label'].astype('category'),
)

# Integer-encode both categoricals; fit_transform refits the encoder on each call
label_encoder = LabelEncoder()
users['gender_encoded'] = label_encoder.fit_transform(users['gender'])
users['occupation_encoded'] = label_encoder.fit_transform(users['occupation'])
print("done")

In [None]:
# Select the matplotlib style sheet and the seaborn colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Report the row counts of the three tables
print("\n=== Basic Dataset Information ===", "-" * 50, sep="\n")
print(
    f"Number of movies: {len(movies)}",
    f"Number of ratings: {len(ratings)}",
    f"Number of users: {len(users)}",
    sep="\n",
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Movies per genre, taken from the one-hot genre columns that the
# MultiLabelBinarizer (mlb) added to `movies`. genre_counts was previously
# used without ever being defined, so this cell failed on a fresh kernel.
# NOTE(review): assumes "count" means number of movies carrying the genre - confirm.
genre_counts = movies[list(mlb.classes_)].sum()

# Keep only genres with more than 20 movies
filtered_genre_counts = genre_counts[genre_counts > 20]

# Sort ascending by count
filtered_genre_counts = filtered_genre_counts.sort_values(ascending=True)

# Set a clean style
sns.set(style="whitegrid")

# Plot
plt.figure(figsize=(10, 8))
sns.barplot(x=filtered_genre_counts.values, y=filtered_genre_counts.index)

# pyplot's title setter is plt.title
plt.title('Top Genres by Number of Movies', fontsize=14)
plt.xlabel('Number of Movies', fontsize=12)
plt.ylabel('Genre', fontsize=12)

plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

plt.tight_layout()
plt.savefig('clean_genre_distribution.png')
plt.show()


In [None]:
# ==========================
# ✅ RATINGS ANALYSIS
# ==========================
print("\n=== Ratings Analysis ===")
print("-" * 50)

# Distribution of ratings
print("Creating ratings distribution plot...")
plt.figure(figsize=(10, 6))
sns.histplot(data=ratings, x='user_rating', bins=20)
# pyplot's title setter is plt.title
plt.title('Distribution of Movie Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.savefig('ratings_distribution.png')
plt.show()

# Average rating and number of ratings per movie, with the title joined on
print("Calculating average ratings per movie...")
movie_ratings = ratings.groupby('movie_id')['user_rating'].agg(['mean', 'count']).reset_index()
# NOTE(review): title column addressed as 'movie_title', the name the
# item-based / SVD / NMF recommender cells use - confirm against movies.csv.
movie_ratings = movie_ratings.merge(movies[['movie_id', 'movie_title']], on='movie_id', how='left')

print("\nTop 10 Highest Rated Movies (with at least 100 ratings):")
print(movie_ratings[movie_ratings['count'] >= 100].sort_values('mean', ascending=False).head(10))

In [None]:
# ==========================
# ✅ USER BEHAVIOR ANALYSIS
# ==========================
print("\n=== User Behavior Analysis ===")
print("-" * 50)

# Number of ratings per user
print("Analyzing user rating patterns...")
user_ratings = ratings.groupby('user_id').size().reset_index(name='rating_count')

plt.figure(figsize=(10, 6))
sns.histplot(data=user_ratings, x='rating_count', bins=50)
# pyplot's title setter is plt.title
plt.title('Distribution of Ratings per User')
plt.xlabel('Number of Ratings')
plt.ylabel('Number of Users')
plt.savefig('user_ratings_distribution.png')
plt.show()

print("\nUser Rating Statistics:")
print(user_ratings['rating_count'].describe())

# ==========================
# ✅ EXPORT ANALYSIS FILES
# ==========================
# genre_df was exported without ever being defined. It is built here as one
# row per genre with the number of movies carrying it, read from the one-hot
# genre columns that the MultiLabelBinarizer (mlb) added to `movies`.
# NOTE(review): the 'genre' / 'movie_count' layout is an assumption - confirm
# the intended contents of genre_analysis.csv.
genre_df = (
    movies[list(mlb.classes_)]
    .sum()
    .rename_axis('genre')
    .reset_index(name='movie_count')
)

print("\nSaving analysis results...")
movie_ratings.to_csv('movie_ratings_analysis.csv', index=False)
user_ratings.to_csv('user_ratings_analysis.csv', index=False)
genre_df.to_csv('genre_analysis.csv', index=False)

# The genre plot is saved as clean_genre_distribution.png, so the summary
# names that file. NOTE(review): every file is written relative to the working
# directory; the infotact_project1/ prefix is only right if that is the cwd.
print("\nAnalysis complete! Check the generated files:")
print("1. infotact_project1/clean_genre_distribution.png")
print("2. infotact_project1/ratings_distribution.png")
print("3. infotact_project1/user_ratings_distribution.png")
print("4. infotact_project1/movie_ratings_analysis.csv")
print("5. infotact_project1/user_ratings_analysis.csv")
print("6. infotact_project1/genre_analysis.csv")

In [None]:
# Join each rating to its user row (on user_id) and then to its movie row (on movie_id)
ratings_with_users = pd.merge(ratings, users, on='user_id')
df = pd.merge(ratings_with_users, movies, on='movie_id')
print(df.head())

In [None]:
# Import necessary libraries
from sklearn.metrics.pairwise import cosine_similarity

# User-item matrix: users as rows, movies as columns; pairs with no rating become 0
user_item = ratings.pivot_table(
    index='user_id', columns='movie_id', values='user_rating'
).fillna(0)

# Pairwise cosine similarity between user rows, labelled by user_id on both axes
user_sim = cosine_similarity(user_item)
user_ids = user_item.index
user_sim_df = pd.DataFrame(user_sim, index=user_ids, columns=user_ids)

# Score every movie for one user from that user's most similar users
def predict_ratings(target_user_id, user_item, user_sim_df, k=5):
    """Return a similarity-weighted average of the k nearest users' rows.

    Args:
        target_user_id: label of the user in both frames.
        user_item: user x movie rating matrix.
        user_sim_df: user x user similarity matrix.
        k: number of most-similar other users to use.

    Returns:
        Series indexed by movie with one score per column of `user_item`.
        Every stored value counts in the average, so a 0 entry pulls the
        score down; the divisor is the sum of the k similarities.
    """
    other_users = user_sim_df[target_user_id].drop(target_user_id)
    neighbour_weights = other_users.nlargest(k)
    neighbour_rows = user_item.loc[neighbour_weights.index]
    weighted_total = neighbour_rows.mul(neighbour_weights, axis=0).sum(axis=0)
    return weighted_total / neighbour_weights.sum()

# Recommend top N movies that the user hasn't rated
def recommend_movies(target_user_id, user_item, user_sim_df, movies_df, k=5, n_recs=10,
                     title_col='movie_title'):
    """Return the n_recs best-scored unrated movies for a user, best first.

    Args:
        target_user_id: label of the user in `user_item` / `user_sim_df`.
        user_item: user x movie rating matrix; a value > 0 means "already rated".
        user_sim_df: user x user similarity matrix.
        movies_df: frame with a 'movie_id' column and a title column.
        k: number of neighbours passed to predict_ratings.
        n_recs: number of recommendations to return.
        title_col: name of the title column in `movies_df`. Defaults to
            'movie_title', the name the other recommender cells use.

    Returns:
        DataFrame with columns ['movie_id', title_col] in ranking order. A
        recommended id that is missing from `movies_df` is kept with a NaN
        title instead of raising KeyError.
    """
    predicted = predict_ratings(target_user_id, user_item, user_sim_df, k)
    already_rated = user_item.loc[target_user_id] > 0
    predicted = predicted[~already_rated]
    top_movies = predicted.nlargest(n_recs).index

    # One title per movie_id, then look the ranked ids up in ranking order.
    titles = movies_df.drop_duplicates(subset='movie_id').set_index('movie_id')[title_col]
    recommendations = titles.reindex(top_movies).rename_axis('movie_id').reset_index()
    return recommendations

# Example run of the user-based recommender: user 1, 10 neighbours, 5 results
user_id = 1
recommendations = recommend_movies(
    user_id,
    user_item,
    user_sim_df,
    movies,
    k=10,
    n_recs=5,
)

print(f"Top 5 movie recommendations for User {user_id}:")
print(recommendations)


In [None]:
# Import necessary libraries
from sklearn.metrics.pairwise import cosine_similarity

# Item-user matrix: movies as rows, users as columns; pairs with no rating become 0
item_user = ratings.pivot_table(
    index='movie_id', columns='user_id', values='user_rating'
).fillna(0)

# Pairwise cosine similarity between movie rows, labelled by movie_id on both axes
item_sim = cosine_similarity(item_user)
movie_ids = item_user.index
item_sim_df = pd.DataFrame(item_sim, index=movie_ids, columns=movie_ids)

# Score a user's unrated movies from the movies that user has rated
def predict_item_based_ratings(target_user_id, item_user, item_sim_df):
    """Return similarity-weighted scores for the user's unrated movies.

    Args:
        target_user_id: column label of the user in `item_user`.
        item_user: movie x user rating matrix; 0 marks "not rated".
        item_sim_df: movie x movie similarity matrix.

    Returns:
        Series of movie -> score, sorted descending. A movie stored as 0 for
        this user gets the similarity-weighted mean of the user's positive
        ratings; movies whose similarities to the rated set sum to 0 are omitted.
    """
    user_column = item_user.loc[:, target_user_id]
    seen_ids = user_column[user_column > 0].index
    seen_values = user_column[seen_ids].values  # same for every candidate

    scores = {}
    for candidate in item_user.index:
        if user_column[candidate] != 0:
            continue
        weights = item_sim_df.loc[candidate, seen_ids].values
        weight_total = weights.sum()
        if weight_total == 0:
            continue
        scores[candidate] = np.dot(weights, seen_values) / weight_total

    return pd.Series(scores).sort_values(ascending=False)

# Recommend top N movies that the user hasn't rated
def recommend_movies_item_based(target_user_id, item_user, item_sim_df, movies_df, n_recs=10):
    """Return the n_recs best item-based predictions for a user, best first.

    Args:
        target_user_id: column label of the user in `item_user`.
        item_user: movie x user rating matrix.
        item_sim_df: movie x movie similarity matrix.
        movies_df: frame with 'movie_id' and 'movie_title' columns.
        n_recs: number of recommendations to return.

    Returns:
        DataFrame with columns ['movie_id', 'movie_title'] in ranking order.
        A recommended id that is missing from `movies_df` is kept with a NaN
        title instead of raising KeyError.
    """
    predicted_ratings = predict_item_based_ratings(target_user_id, item_user, item_sim_df)
    top_items = predicted_ratings.head(n_recs).index

    # One title per movie_id, then look the ranked ids up in ranking order.
    titles = movies_df.drop_duplicates(subset='movie_id').set_index('movie_id')['movie_title']
    recommendations = titles.reindex(top_items).rename_axis('movie_id').reset_index()
    return recommendations

# Example run of the item-based recommender: user 1, 5 results
user_id = 1
recommendations = recommend_movies_item_based(
    user_id,
    item_user,
    item_sim_df,
    movies,
    n_recs=5,
)

print(f"Top 5 movie recommendations for User {user_id} (Item-Based):")
print(recommendations)


In [None]:
from sklearn.decomposition import TruncatedSVD, NMF
# train_data was used here before any cell had defined it (the split cell sits
# further down), so this cell failed on Restart & Run All. The split is made
# here with the same columns, test_size and random_state as that later cell,
# so running the later cell reproduces exactly this split.
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(
    ratings[['user_id', 'movie_id', 'user_rating']], test_size=0.2, random_state=42
)

# Fill missing ratings with 0s for simplicity (alternatively, you can try mean-centering).
# pivot_table is used for both matrices: plain pivot raises ValueError when a
# user/movie pair occurs more than once, pivot_table averages the repeats.
train_matrix = train_data.pivot_table(index='user_id', columns='movie_id', values='user_rating').fillna(0)
user_item_matrix = ratings.pivot_table(index='user_id', columns='movie_id', values='user_rating').fillna(0)

# NOTE(review): the model below is fitted on user_item_matrix, i.e. on ALL
# ratings including the held-out test rows; train_matrix is built but unused.
# Metrics computed against test_data are therefore optimistic - confirm
# whether the fit should use train_matrix instead.

# Define number of latent features
n_components = 50  # You can tune this
# SVD: user_item_matrix ≈ U * Sigma * V^T
svd = TruncatedSVD(n_components=n_components, random_state=42)
user_features_svd = svd.fit_transform(user_item_matrix)
item_features_svd = svd.components_

# Reconstruct the ratings matrix
predicted_ratings_svd = np.dot(user_features_svd, item_features_svd)

# Convert to DataFrame for easy interpretation
predicted_svd_df = pd.DataFrame(predicted_ratings_svd, index=user_item_matrix.index, columns=user_item_matrix.columns)

# Example: Get Top 5 recommendations for user 1
def recommend_svd(user_id, predicted_df, movies_df, n_recs=5, rated_matrix=None):
    """Return the n_recs highest-predicted unrated movies for a user, best first.

    Args:
        user_id: row label of the user in `predicted_df`.
        predicted_df: user x movie matrix of predicted scores.
        movies_df: frame with 'movie_id' and 'movie_title' columns.
        n_recs: number of recommendations to return.
        rated_matrix: user x movie matrix whose values > 0 mark movies the
            user already rated. Defaults to the notebook-level
            `user_item_matrix`, which was previously the only option.

    Returns:
        DataFrame with columns ['movie_id', 'movie_title'] ordered by
        predicted score. The earlier `isin` filter returned rows in
        `movies_df` order, so the ranking was lost. An id missing from
        `movies_df` is kept with a NaN title.
    """
    if rated_matrix is None:
        rated_matrix = user_item_matrix
    user_row = predicted_df.loc[user_id]
    rated_movies = rated_matrix.loc[user_id] > 0
    user_row = user_row[~rated_movies]
    top_movie_ids = user_row.nlargest(n_recs).index

    # One title per movie_id, then look the ranked ids up in ranking order.
    titles = movies_df.drop_duplicates(subset='movie_id').set_index('movie_id')['movie_title']
    return titles.reindex(top_movie_ids).rename_axis('movie_id').reset_index()

# Show the SVD-based recommendations for user 1
svd_recommendations = recommend_svd(1, predicted_svd_df, movies)
print("\nTop 5 Recommendations using SVD:")
print(svd_recommendations)


In [None]:
# Factorise the user-item matrix with NMF, using the same number of components as above
nmf = NMF(n_components=n_components, init='random', random_state=42)
user_features_nmf = nmf.fit_transform(user_item_matrix)
item_features_nmf = nmf.components_

# Multiply the two factors back together to get a full matrix of predicted scores
predicted_ratings_nmf = user_features_nmf.dot(item_features_nmf)

# Label the reconstruction with the user ids and movie ids of the input matrix
predicted_nmf_df = pd.DataFrame(
    predicted_ratings_nmf,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)

# Example: Get Top 5 recommendations for user 1
def recommend_nmf(user_id, predicted_df, movies_df, n_recs=5, rated_matrix=None):
    """Return the n_recs highest-predicted unrated movies for a user, best first.

    Args:
        user_id: row label of the user in `predicted_df`.
        predicted_df: user x movie matrix of predicted scores.
        movies_df: frame with 'movie_id' and 'movie_title' columns.
        n_recs: number of recommendations to return.
        rated_matrix: user x movie matrix whose values > 0 mark movies the
            user already rated. Defaults to the notebook-level
            `user_item_matrix`, which was previously the only option.

    Returns:
        DataFrame with columns ['movie_id', 'movie_title'] ordered by
        predicted score. The earlier `isin` filter returned rows in
        `movies_df` order, so the ranking was lost. An id missing from
        `movies_df` is kept with a NaN title.
    """
    if rated_matrix is None:
        rated_matrix = user_item_matrix
    user_row = predicted_df.loc[user_id]
    rated_movies = rated_matrix.loc[user_id] > 0
    user_row = user_row[~rated_movies]
    top_movie_ids = user_row.nlargest(n_recs).index

    # One title per movie_id, then look the ranked ids up in ranking order.
    titles = movies_df.drop_duplicates(subset='movie_id').set_index('movie_id')['movie_title']
    return titles.reindex(top_movie_ids).rename_axis('movie_id').reset_index()

# Show the NMF-based recommendations for user 1
nmf_recommendations = recommend_nmf(1, predicted_nmf_df, movies)
print("\nTop 5 Recommendations using NMF:")
print(nmf_recommendations)


In [None]:
from sklearn.model_selection import train_test_split

# Keep only the three columns that identify a rating record
rating_columns = ['user_id', 'movie_id', 'user_rating']
ratings_data = ratings[rating_columns]

# Split the individual rating records 80/20 into train and test with a fixed seed
train_data, test_data = train_test_split(
    ratings_data,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def evaluate_rmse_mae(test_data, pred_df, name="Model"):
    """Print and return RMSE and MAE of `pred_df` on the held-out ratings.

    Args:
        test_data: frame with 'user_id', 'movie_id' and 'user_rating' columns.
        pred_df: user x movie matrix of predicted ratings (unique labels on
            both axes).
        name: label used in the printed lines.

    Returns:
        (rmse, mae) as floats. Test rows whose user or movie is not in
        `pred_df` are skipped; if no row is left, a notice is printed and
        (nan, nan) is returned instead of raising.
    """
    # Positional lookups for all test rows at once (-1 marks "label not found");
    # this replaces a Python-level iterrows loop with one .loc call per row.
    row_pos = pred_df.index.get_indexer(test_data['user_id'])
    col_pos = pred_df.columns.get_indexer(test_data['movie_id'])
    known = (row_pos >= 0) & (col_pos >= 0)

    if not known.any():
        print(f"{name}: no test ratings overlap with the prediction matrix")
        return float('nan'), float('nan')

    y_true = test_data['user_rating'].to_numpy(dtype=float)[known]
    y_pred = pred_df.to_numpy()[row_pos[known], col_pos[known]]
    errors = y_true - y_pred

    rmse = float(np.sqrt(np.mean(errors ** 2)))
    mae = float(np.mean(np.abs(errors)))
    print(f"{name} RMSE: {rmse:.4f}")
    print(f"{name} MAE : {mae:.4f}")
    return rmse, mae

# The prediction frames built by the SVD and NMF cells are named
# predicted_svd_df / predicted_nmf_df; the shorter pred_* names were never defined.
evaluate_rmse_mae(test_data, predicted_svd_df, "SVD")
evaluate_rmse_mae(test_data, predicted_nmf_df, "NMF")


In [None]:
def precision_at_k(pred_df, train_df, test_df, k=10, threshold=3.5):
    """Mean precision@k over the test users that have a row in `pred_df`.

    For each such user, the movies they rated in `train_df` are removed from
    their predictions, the k highest remaining scores are taken, and a hit is
    a recommended movie the user rated >= `threshold` in `test_df`. Each
    user's precision is hits / k.

    Args:
        pred_df: user x movie matrix of predicted scores.
        train_df: frame with 'user_id' and 'movie_id' columns.
        test_df: frame with 'user_id', 'movie_id' and 'user_rating' columns.
        k: number of recommendations per user; must be positive.
        threshold: minimum test rating counted as relevant.

    Returns:
        The mean precision as a float, or nan when no test user is in `pred_df`.

    Raises:
        ValueError: if k is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # Group once up front instead of filtering both frames again for every user.
    seen_by_user = {
        user: set(ids) for user, ids in train_df.groupby('user_id')['movie_id']
    }
    liked_rows = test_df[test_df['user_rating'] >= threshold]
    liked_by_user = {
        user: set(ids) for user, ids in liked_rows.groupby('user_id')['movie_id']
    }

    precision_list = []
    for user in test_df['user_id'].unique():
        if user not in pred_df.index:
            continue
        # drop() is given a list rather than a set of labels
        already_seen = list(seen_by_user.get(user, ()))
        user_preds = pred_df.loc[user].drop(labels=already_seen, errors='ignore')
        top_k_preds = user_preds.sort_values(ascending=False).head(k)

        relevant = liked_by_user.get(user, set())
        hits = len(set(top_k_preds.index) & relevant)
        precision_list.append(hits / k)

    if not precision_list:
        return float('nan')
    return float(np.mean(precision_list))

# The prediction frames built by the SVD and NMF cells are named
# predicted_svd_df / predicted_nmf_df; the shorter pred_* names were never defined.
print(f"SVD Precision@10: {precision_at_k(predicted_svd_df, train_data, test_data):.4f}")
print(f"NMF Precision@10: {precision_at_k(predicted_nmf_df, train_data, test_data):.4f}")
