In [7]:
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": "null",
   "id": "897a438d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.metrics import silhouette_score\n",
    "\n",
    "# ---- 1. Sample Movie Dataset ----\n",
    "movie_data = {\n",
    "    'movie_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
    "    'title': ['The Matrix', 'The Godfather', 'Pulp Fiction', 'The Dark Knight', 'Inception',\n",
    "              'Forrest Gump', 'The Shawshank Redemption', 'Fight Club', 'Interstellar', 'Gladiator'],\n",
    "    'description': [\n",
    "        'A computer hacker learns about the true nature of his reality.',\n",
    "        'An aging crime boss hands over control of his empire to his reluctant son.',\n",
    "        'The lives of two hitmen, a boxer, and others cross paths in tales of violence and redemption.',\n",
    "        'Batman escalates his war on crime against a rising threat.',\n",
    "        'A thief steals secrets through dream-sharing and is given one last job to pull off.',\n",
    "        'The life story of a simple man with a big heart, spanning decades.',\n",
    "        'Two imprisoned men bond over a number of years, finding solace and eventual redemption.',\n",
    "        'An insomniac office worker and a soap maker form an underground fight club.',\n",
    "        'A team of explorers travel through a wormhole in space to ensure humanity’s survival.',\n",
    "        'A former Roman General sets out to exact vengeance against the corrupt emperor.'\n",
    "    ]\n",
    "}\n",
    "movies = pd.DataFrame(movie_data)\n",
    "print(\"Movie Dataset:\")\n",
    "display(movies)\n",
    "\n",
    "# ---- 2. Data Preprocessing ----\n",
    "# Check missing values\n",
    "print(\"\\nMissing values in dataset:\")\n",
    "print(movies.isnull().sum())\n",
    "\n",
    "# No duplicates or missing data here, but let's show the check:\n",
    "print(\"\\nChecking duplicates:\")\n",
    "print(f\"Duplicates before removal: {movies.duplicated().sum()}\")\n",
    "movies = movies.drop_duplicates()\n",
    "print(f\"Duplicates after removal: {movies.duplicated().sum()}\")\n",
    "\n",
    "# ---- 3. EDA ----\n",
    "# Length of descriptions\n",
    "movies['desc_length'] = movies['description'].apply(len)\n",
    "\n",
    "plt.figure(figsize=(8,4))\n",
    "sns.histplot(movies['desc_length'], bins=5)\n",
    "plt.title('Distribution of Movie Description Lengths')\n",
    "plt.xlabel('Description Length (characters)')\n",
    "plt.show()\n",
    "\n",
    "# ---- 4. Feature Engineering ----\n",
    "# Vectorize text descriptions with TF-IDF\n",
    "vectorizer = TfidfVectorizer(stop_words='english')\n",
    "tfidf_matrix = vectorizer.fit_transform(movies['description'])\n",
    "\n",
    "print(f\"TF-IDF matrix shape: {tfidf_matrix.shape}\")\n",
    "\n",
    "# Reduce dimensionality for visualization and clustering\n",
    "svd = TruncatedSVD(n_components=2, random_state=42)\n",
    "reduced_features = svd.fit_transform(tfidf_matrix)\n",
    "\n",
    "movies['component_1'] = reduced_features[:, 0]\n",
    "movies['component_2'] = reduced_features[:, 1]\n",
    "\n",
    "plt.figure(figsize=(8,6))\n",
    "sns.scatterplot(x='component_1', y='component_2', data=movies, s=100)\n",
    "for i, title in enumerate(movies['title']):\n",
    "    plt.text(movies.component_1[i] + 0.03, movies.component_2[i] + 0.02, title, fontsize=9)\n",
    "plt.title('2D projection of movie descriptions')\n",
    "plt.show()\n",
    "\n",
    "# ---- 5. Model Building (Clustering for grouping similar movies) ----\n",
    "kmeans = KMeans(n_clusters=3, random_state=42)\n",
    "movies['cluster'] = kmeans.fit_predict(tfidf_matrix)\n",
    "\n",
    "print(\"\\nCluster assignments:\")\n",
    "display(movies[['title', 'cluster']])\n",
    "\n",
    "# Silhouette score for quality of clusters\n",
    "score = silhouette_score(tfidf_matrix, movies['cluster'])\n",
    "print(f\"Silhouette Score: {score:.3f}\")\n",
    "\n",
    "# ---- 6. Recommendation Function ----\n",
    "def recommend_movies(title, top_n=3):\n",
    "    if title not in movies['title'].values:\n",
    "        return f\"'{title}' not found in movie dataset.\"\n",
    "\n",
    "    idx = movies.index[movies['title'] == title][0]\n",
    "    cosine_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()\n",
    "\n",
    "    similar_indices = cosine_sim.argsort()[::-1][1:top_n+1]\n",
    "    recommendations = movies.iloc[similar_indices][['title', 'description']]\n",
    "    return recommendations\n",
    "\n",
    "print(\"\\nRecommendations for 'The Matrix':\")\n",
    "display(recommend_movies('The Matrix'))\n",
    "\n",
    "# ---- 7. Visualization of Similarity Matrix ----\n",
    "similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)\n",
    "\n",
    "plt.figure(figsize=(10,8))\n",
    "sns.heatmap(similarity_matrix, xticklabels=movies['title'], yticklabels=movies['title'], annot=True, cmap='coolwarm', fmt=\".2f\")\n",
    "plt.title('Cosine Similarity Heatmap Between Movie Descriptions')\n",
    "plt.xticks(rotation=45)\n",
    "plt.yticks(rotation=0)\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}


{'cells': [{'cell_type': 'code',
   'execution_count': 'null',
   'id': '897a438d',
   'metadata': {},
   'outputs': [],
   'source': ['# Import libraries\n',
    'import pandas as pd\n',
    'import numpy as np\n',
    'import matplotlib.pyplot as plt\n',
    'import seaborn as sns\n',
    '\n',
    'from sklearn.feature_extraction.text import TfidfVectorizer\n',
    'from sklearn.metrics.pairwise import cosine_similarity\n',
    'from sklearn.decomposition import TruncatedSVD\n',
    '\n',
    'from sklearn.model_selection import train_test_split\n',
    'from sklearn.cluster import KMeans\n',
    'from sklearn.metrics import silhouette_score\n',
    '\n',
    '# ---- 1. Sample Movie Dataset ----\n',
    'movie_data = {\n',
    "    'movie_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
    "    'title': ['The Matrix', 'The Godfather', 'Pulp Fiction', 'The Dark Knight', 'Inception',\n",
    "              'Forrest Gump', 'The Shawshank Redemption', 'Fight Club', 'Interstellar', 'Gladiator'