# In [None]:

# Notebook 1: Data Loading, Cleaning, EDA, Apriori Association Rules & Similarity-Based Recommender

# Import necessary libraries
from google.colab import drive
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

# The CSV files live on Google Drive, so the drive has to be mounted
# before anything can be read from it.
drive.mount('/content/drive')

# Read the three raw tables: ratings, users and movies.
ratings = pd.read_csv('/content/drive/MyDrive/Scalerdatasets/zee_rat.csv')
users = pd.read_csv('/content/drive/MyDrive/Scalerdatasets/zee_user.csv')
movies = pd.read_csv(
    '/content/drive/MyDrive/Scalerdatasets/zee_movie.csv',
    encoding='ISO-8859-1',
)

# Normalise the column names of every table to snake_case with skimpy so
# the later merges can rely on one naming scheme.
from skimpy import clean_columns
movies, ratings, users = (
    clean_columns(frame, case='snake') for frame in (movies, ratings, users)
)

print("Column names cleaned for consistency.")

# Report the (rows, columns) dimensions of each table.
print(f"Ratings shape: {ratings.shape}, Movies shape: {movies.shape}, Users shape: {users.shape}")

# Check for missing values in each dataset.
# The heading and the per-column null counts are separated with an explicit
# "\n" escape; a raw line break inside a quoted string is a SyntaxError.
print("Missing values in ratings:\n", ratings.isnull().sum())
print("Missing values in movies:\n", movies.isnull().sum())
print("Missing values in users:\n", users.isnull().sum())

# Remove duplicated rows in ratings if any
ratings.drop_duplicates(inplace=True)

# Convert the Unix-epoch `timestamp` column (seconds) to datetimes.
# A single vectorized pd.to_datetime call replaces two per-row
# datetime.fromtimestamp() passes. It also yields UTC wall-clock values on
# every machine, whereas fromtimestamp() uses the local timezone of whatever
# host runs the notebook (the two agree when the host clock is UTC).
from datetime import datetime  # kept so `datetime` stays importable for later cells
ratings['date'] = pd.to_datetime(ratings['timestamp'], unit='s')
# Hour of day (0-23) derived from the already-converted column.
ratings['hours'] = ratings['date'].dt.hour

# Merge three datasets into one DataFrame for combined analysis.
# Inner joins keep only ratings whose user and movie both exist.
df = ratings.merge(users, on='userid', how='inner')
df = df.merge(movies, on='movieid', how='inner')

# Extract release year from movie title using regex.
# Prefer a parenthesised year at the very end of the title ("Name (1995)"):
# grabbing the first 4-digit run would return a number that is part of the
# name whenever the title itself contains one before the year.
# NOTE(review): assumes titles follow the "Name (YYYY)" layout - confirm
# against the data.
year_in_parens = df['title'].str.extract(r'\((\d{4})\)\s*$', expand=False)
# Fall back to the first 4-digit run for titles without a trailing "(YYYY)".
first_four_digits = df['title'].str.extract(r'(\d{4})', expand=False)
# Nullable Int64 keeps the column integer-typed while letting titles with no
# year at all become <NA> instead of crashing the integer cast.
df['releaseyear'] = pd.to_numeric(
    year_in_parens.fillna(first_four_digits), errors='coerce'
).astype('Int64')

# Display some sample entries and dataset shape
print(df.head())
print(f"Merged DataFrame shape: {df.shape}")

# Exploratory Data Analysis (EDA) - Basic statistics
print(df.describe(include=[np.number]))
# Number of distinct genre strings (each string may combine several genres).
print(df['genres'].nunique())

# Association rule mining with the Apriori algorithm
# Pivot to a user x title matrix (mean rating per cell); titles a user never
# rated become 0 so that "rated" can be tested with a simple > 0 comparison.
ratings_pivot = df.pivot_table(index='userid', columns='title', values='rating').fillna(0)
# Vectorized comparison instead of the deprecated, per-cell
# DataFrame.applymap; the result is still an integer 0/1 matrix.
ratings_binary = (ratings_pivot > 0).astype(int)

# Import and use mlxtend's apriori
from mlxtend.frequent_patterns import apriori, association_rules

# Generate frequent itemsets with min support.
# mlxtend expects a boolean one-hot matrix, so cast at the call site.
frequent_itemsets = apriori(ratings_binary.astype(bool), min_support=0.12, use_colnames=True)

# Generate association rules from frequent itemsets
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Display top association rules sorted by lift
print(rules.sort_values('lift', ascending=False).head())

# Split the pipe-separated genres string into a list and explode it so every
# (rating, genre) pair gets its own row.
df_exp = df.copy()
df_exp['genres'] = df_exp['genres'].str.split('|')
df_exp = df_exp.explode('genres')

# Create a user-genre matrix for apriori on genres: each cell counts how many
# ratings the user gave to movies of that genre (0 when none).
user_genre = df_exp.pivot_table(index='userid', columns='genres', values='rating', aggfunc='count').fillna(0)
# Vectorized comparison instead of the deprecated, per-cell
# DataFrame.applymap. Kept as integer 0/1 because this matrix is reused
# further down for numeric similarity computations.
user_genre_binary = (user_genre > 0).astype(int)

# Generate frequent genre itemsets.
# mlxtend expects a boolean one-hot matrix, so cast at the call site.
genre_itemsets = apriori(user_genre_binary.astype(bool), min_support=0.5, use_colnames=True)
print(genre_itemsets.sort_values('support', ascending=False).head())

# Similarity-based recommendation functions

def hamming_distance(x, y):
    """Return the Hamming distance between two equal-length vectors.

    The Hamming distance is the number of positions at which the two
    vectors hold different values. Counting mismatches (rather than
    summing absolute differences) keeps the result correct for non-binary
    values and also works for boolean arrays, where subtraction is not
    defined.

    Args:
        x: First vector (array-like; compared positionally).
        y: Second vector, same shape as ``x``.

    Returns:
        int: Number of positions where ``x`` and ``y`` differ.

    Raises:
        ValueError: If the two vectors do not have the same shape.
    """
    x_arr = np.asarray(x)
    y_arr = np.asarray(y)
    if x_arr.shape != y_arr.shape:
        raise ValueError(
            f"hamming_distance requires equal shapes, got {x_arr.shape} and {y_arr.shape}"
        )
    return int(np.count_nonzero(x_arr != y_arr))

def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between two vectors.

    Computed as the square root of the summed squared element-wise
    differences of ``x`` and ``y``.
    """
    delta = x - y
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

def cosine_similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    Args:
        v1: First vector.
        v2: Second vector, same length as ``v1``.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms. If either vector has zero norm the similarity is
        undefined (0/0); ``0.0`` is returned in that case so callers that
        rank by similarity never receive NaN.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # At least one all-zero vector: treat as "no similarity".
        return 0.0
    return np.dot(v1, v2) / norm_product

def pearson_similarity(x, y):
    """Return the Pearson correlation coefficient between two vectors.

    Args:
        x: First vector.
        y: Second vector, same length as ``x``.

    Returns:
        The covariance of the mean-centred vectors divided by the product
        of their standard deviations (both expressed as sums, so the
        normalising factors cancel). When either vector is constant the
        correlation is undefined (0/0); ``0.0`` is returned in that case so
        callers that rank by similarity never receive NaN.
    """
    x_centered = x - np.mean(x)
    y_centered = y - np.mean(y)
    denom = np.sqrt(np.sum(x_centered ** 2) * np.sum(y_centered ** 2))
    if denom == 0:
        # A constant vector has no variance: treat as "no correlation".
        return 0.0
    return np.sum(x_centered * y_centered) / denom

def similarity_based_recommendations(matrix, movies_df, movie_id_col, similarity_func,
                                     max_queries=10, ascending=False):
    """Score every other row of ``matrix`` against a set of query rows.

    Args:
        matrix: DataFrame whose index holds the item ids and whose rows are
            the feature vectors passed to ``similarity_func``.
        movies_df: DataFrame with a ``'title'`` column and the item id
            available either as a column or as the index, named
            ``movie_id_col``.
        movie_id_col: Name of the id column (or index) in ``movies_df``.
        similarity_func: Callable ``f(query_row, candidate_row)`` returning
            a score.
        max_queries: Number of leading index entries of ``matrix`` to use as
            queries; ``None`` uses all of them. Defaults to 10 to keep the
            pairwise loop fast.
        ascending: Sort direction of the score within each query. Keep
            ``False`` for similarities (highest first); pass ``True`` for
            distances (smallest first).

    Returns:
        DataFrame with columns ``query``, ``candidate``, ``distance``,
        ``query_title`` and ``candidate_title``, sorted by ``query`` and then
        by ``distance``. Pairs whose ids have no entry in ``movies_df`` are
        dropped (inner join).

    Raises:
        KeyError: If ``movie_id_col`` is neither a column nor the index name
            of ``movies_df``, or if ``movies_df`` has no ``'title'`` column.
    """
    query_ids = matrix.index if max_queries is None else matrix.index[:max_queries]

    scored_pairs = []
    for query in query_ids:
        query_vector = matrix.loc[query]  # looked up once per query, not per pair
        for candidate in matrix.index:
            if candidate == query:
                continue
            score = similarity_func(query_vector, matrix.loc[candidate])
            scored_pairs.append((query, candidate, score))
    ranks = pd.DataFrame(scored_pairs, columns=['query', 'candidate', 'distance'])

    # Accept the id either as a regular column or as the index; merging on an
    # index level does not produce an id column in the result, so normalise
    # to a column up front.
    titles = movies_df if movie_id_col in movies_df.columns else movies_df.reset_index()
    if movie_id_col not in titles.columns:
        raise KeyError(f"{movie_id_col!r} is neither a column nor the index of movies_df")
    # Only id + title are needed; dragging other columns through two merges
    # would create suffixed duplicates.
    titles = titles[[movie_id_col, 'title']]

    # Join movie titles for better readability of output
    for role in ('query', 'candidate'):
        role_titles = titles.rename(columns={movie_id_col: role, 'title': f'{role}_title'})
        ranks = ranks.merge(role_titles, on=role)

    # Sort by query, then by score in the requested direction.
    return ranks.sort_values(['query', 'distance'], ascending=[True, ascending])

# Example usage: creating movie-genre matrix for similarity recommendations.
# The recommender joins the matrix index against movie ids, so the matrix
# must be indexed by movieid (one 0/1 column per genre) - not by userid as
# the user-genre matrix is.
# NOTE(review): assumes `movies` carries the pipe-separated `genres` column
# used earlier on the merged frame - confirm.
genre_matrix = movies.set_index('movieid')['genres'].str.get_dummies(sep='|')
# Keep movieid as a regular column so it can be merged on and then dropped.
movies_subset = movies[['movieid', 'title']]

# Recommend based on cosine similarity
recommendations = similarity_based_recommendations(genre_matrix, movies_subset, 'movieid', cosine_similarity)
print(recommendations.head())

# End of Notebook 1
