In [7]:
import pandas as pd

# Read the preprocessed movie table and preview its first rows.
movies = pd.read_csv(filepath_or_buffer="movies_final.csv")
movies.head(n=5)


Unnamed: 0,id,title,tags
0,19995,Avatar,action adventure fantasy science fiction cultu...
1,285,Pirates of the Caribbean: At World's End,adventure fantasy action ocean drug abuse exot...
2,206647,Spectre,action adventure crime spy based on novel secr...
3,49026,The Dark Knight Rises,action crime drama thriller dc comics crime fi...
4,49529,John Carter,action adventure science fiction based on nove...


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each movie's tag string into a TF-IDF vector: English stop words are
# dropped and the vocabulary is capped at 5000 terms.
vectorizer_options = dict(stop_words='english', max_features=5000)
tfidf = TfidfVectorizer(**vectorizer_options)
tfidf_matrix = tfidf.fit_transform(raw_documents=movies['tags'])
tfidf_matrix.shape


(4803, 5000)

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix
# (with Y omitted, the matrix is compared against itself).
cosine_sim = cosine_similarity(X=tfidf_matrix)
cosine_sim.shape


(4803, 4803)

In [10]:
import joblib, numpy as np, pandas as pd, os

# Persist the fitted vectorizer, the similarity matrix and the id/title table
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
np.save("cosine_sim.npy", cosine_sim)
movies[['id','title']].to_pickle("movies.pkl")

# Check that each of the three artifacts is now present on disk
expected_artifacts = ["tfidf_vectorizer.pkl","cosine_sim.npy","movies.pkl"]
missing_artifacts = [name for name in expected_artifacts if not os.path.exists(name)]
files_ok = not missing_artifacts
print("Saved files exist:", files_ok)
print("Files:", list(filter(lambda name: name.endswith(('.pkl','.npy')), os.listdir('.'))))


Saved files exist: True
Files: ['cosine_sim.npy', 'tfidf_vectorizer.pkl', 'movies.pkl']


In [11]:
%%bash
cat > app.py <<'PY'
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
import joblib
import numpy as np
import pandas as pd
from typing import List

# ----------- Load Artifacts -----------
# NOTE(security): joblib/pickle files execute arbitrary code when loaded; only
# load artifacts that this project produced itself.
tfidf = joblib.load("tfidf_vectorizer.pkl")   # not used by any route below; kept so the module's globals are unchanged
cosine_sim = np.load("cosine_sim.npy")
movies = pd.read_pickle("movies.pkl")   # contains id + title

# Map title → row position in cosine_sim.
# Duplicates must be removed by *title*: Series.drop_duplicates() compares the
# values (row numbers, which are always unique), so it removed nothing and a
# repeated title made `indices[title]` return a Series instead of one integer.
# reset_index(drop=True) guarantees the stored numbers are positions, which is
# what cosine_sim rows and `.iloc` lookups expect.
_first_rows = movies.reset_index(drop=True).drop_duplicates(subset='title', keep='first')
indices = pd.Series(_first_rows.index, index=_first_rows['title'])

# ----------- Create FastAPI App -----------
app = FastAPI(title="Movie Recommender API", version="1.0")

# Response schema for GET /recommend (used as that route's response_model).
class RecommendResponse(BaseModel):
    query: str  # the title that was asked for, echoed back
    recommendations: List[str]  # titles of the recommended movies

@app.get("/recommend", response_model=RecommendResponse)
def recommend(
    title: str = Query(..., description="Exact movie title"),
    top_n: int = Query(5, ge=0, description="Maximum number of recommendations to return"),
):
    """Return the titles most similar to ``title``, best match first.

    Responds with 404 (plus up to five titles containing the query text) when
    the exact title is unknown, and with 422 when ``top_n`` is negative.
    """
    if title not in indices:
        # regex=False: the query is user input and must be matched literally.
        # Interpreted as a pattern, a title such as "Mission (" raised re.error
        # and surfaced as HTTP 500 instead of this 404.
        suggestions = movies[movies['title'].str.contains(title, case=False, na=False, regex=False)]['title'].head(5).tolist()
        raise HTTPException(status_code=404, detail={
            "error": f"Title '{title}' not found.",
            "suggestions": suggestions
        })

    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several rows share this title; use the first one rather than letting
        # a 2-D similarity slice break the ranking below.
        idx = idx.iloc[0]
    idx = int(idx)

    # Stable descending argsort: highest score first, ties in row order (the
    # same order the previous sorted(..., reverse=True) produced).
    ranked = np.argsort(-cosine_sim[idx], kind="stable")
    # Remove the queried movie by position instead of assuming it ranks first;
    # with tied or all-zero rows it does not, and it could recommend itself.
    movie_indices = ranked[ranked != idx][:top_n]
    recs = movies['title'].iloc[movie_indices].tolist()

    return {"query": title, "recommendations": recs}

# Liveness probe: takes no input and always answers with the same payload.
@app.get("/health")
def health():
    return dict(status="ok")

if __name__ == "__main__":
    # Direct-execution entry point (`python app.py`): serve on all interfaces,
    # port 8000, with auto-reload enabled.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000, "reload": True}
    uvicorn.run("app:app", **server_options)
PY

echo "app.py successfully created!"


app.py successfully created!


In [12]:
!pip install -q fastapi uvicorn[standard] joblib numpy pandas requests
print("deps installed")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/517.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m358.4/517.7 kB[0m [31m10.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.7/517.7 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m456.8/456.8 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hdeps installed


In [14]:
%%bash
# Launch the API in the background, detached from this cell; stdout and stderr
# are both written to uvicorn.log.
nohup uvicorn app:app --host 0.0.0.0 --port 8000 &> uvicorn.log &
printf '%s\n' "uvicorn started"



uvicorn started


In [15]:
# Print the last 20 lines of uvicorn.log, the file the server launch cell redirects its output to.
!tail -n 20 uvicorn.log


INFO:     Started server process [4095]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


In [16]:
import requests, time
time.sleep(2)  # short wait for server

title_to_test = "Avatar"   # change this if you know a different exact title from your data
try:
    r = requests.get("http://127.0.0.1:8000/recommend", params={"title": title_to_test, "top_n":5}, timeout=10)
    print("status:", r.status_code)
    print("response:", r.json())
except Exception as e:
    print("request failed:", e)
    # show last 200 lines of uvicorn log to help debug
    !tail -n 200 uvicorn.log


status: 200
response: {'query': 'Avatar', 'recommendations': ['Lifeforce', 'Moonraker', 'Lost in Space', 'Guardians of the Galaxy', 'Aliens']}


In [17]:
import pandas as pd
# Case-insensitive substring search over the saved titles (at most 20 hits).
movies = pd.read_pickle("movies.pkl")
title_hits = movies['title'].str.contains("avatar", case=False, na=False)
matches = movies.loc[title_hits, 'title'].head(20).tolist()
print("Matches:", matches)


Matches: ['Avatar']


In [18]:
import pandas as pd
# Reload the saved id/title table and show 20 randomly chosen rows.
movies = pd.read_pickle("movies.pkl")
movies.sample(n=20)


Unnamed: 0,id,title
352,10674,Mulan
1289,9495,The Crow
4560,379532,Rodeo Girl
1120,509,Notting Hill
825,9315,Flightplan
3966,26039,Point Blank
3383,33676,Losin' It
4164,159037,The Square
1017,1636,Bedazzled
4460,25212,Deterrence


In [19]:
import requests

movie = "The Witch"

# timeout=10 matches the other request cells: without a timeout, requests
# waits indefinitely if the server accepts the connection but never answers.
r = requests.get(
    "http://127.0.0.1:8000/recommend",
    params={"title": movie, "top_n":5},
    timeout=10,
)

print("status:", r.status_code)
print("response:", r.json())


status: 200
response: {'query': 'The Witch', 'recommendations': ['The Chronicles of Narnia: The Lion, the Witch and the Wardrobe', 'The New World', 'Moonrise Kingdom', 'The Last Witch Hunter', 'Just Visiting']}


In [20]:
import pandas as pd
# Show 20 randomly chosen titles, renumbered from 0 for display.
movies = pd.read_pickle("movies.pkl")
random_titles = movies['title'].sample(20).reset_index(drop=True)
display(random_titles)


Unnamed: 0,title
0,Pete's Dragon
1,The Replacement Killers
2,Beyond the Black Rainbow
3,On The Downlow
4,Say It Isn't So
5,"To Be Frank, Sinatra at 100"
6,A Walk to Remember
7,Iron Man 3
8,Barnyard
9,Beneath Hill 60


In [21]:
import requests, json

movie = "Iron Man 3"
endpoint = "http://127.0.0.1:8000/recommend"
r = requests.get(endpoint, params=dict(title=movie, top_n=5), timeout=10)
print("status:", r.status_code)
# Pretty-print the JSON body, keeping non-ASCII characters readable
pretty_body = json.dumps(r.json(), indent=2, ensure_ascii=False)
print("response:", pretty_body)


status: 200
response: {
  "query": "Iron Man 3",
  "recommendations": [
    "Iron Man 2",
    "Iron Man",
    "Avengers: Age of Ultron",
    "Captain America: Civil War",
    "The Avengers"
  ]
}


In [22]:
import numpy as np
import pandas as pd

# Load the same df to ensure matching indices
movies = pd.read_pickle("movies.pkl")
# Reload the matrix from disk as well, so this cell also works on a fresh
# kernel instead of relying on `cosine_sim` left over from an earlier cell.
cosine_sim = np.load("cosine_sim.npy")
assert cosine_sim.shape == (len(movies), len(movies)), "cosine_sim.npy does not match movies.pkl"

top_n = 10
titles = movies['title'].to_numpy()
neighbors = {}

for idx, title in enumerate(titles):
    if title in neighbors:
        # The dict is keyed by title, so rows sharing a title cannot all be
        # stored; keep the first row explicitly rather than letting later rows
        # silently overwrite it.
        continue
    # Stable descending argsort: highest score first, ties in row order (the
    # same order as sorting (index, score) pairs with reverse=True).
    ranked = np.argsort(-cosine_sim[idx], kind="stable")
    # Exclude the movie itself by position instead of assuming it ranks first;
    # with tied or all-zero rows it does not, and it would recommend itself.
    nearest = ranked[ranked != idx][:top_n]
    neighbors[title] = titles[nearest].tolist()

len(neighbors)


4800

In [23]:
import joblib
# Write the title -> neighbor-titles dict to disk for the API to load.
joblib.dump(value=neighbors, filename="neighbors.pkl")

print("Saved neighbors.pkl")


Saved neighbors.pkl


In [24]:
%%bash
cat > app.py <<'PY'
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
import joblib
import pandas as pd
from typing import List

# ----------- Load Artifacts -----------
# movies: searched for title suggestions when a lookup misses.
# neighbors: dict looked up by title in /recommend.
movies = pd.read_pickle("movies.pkl")
neighbors = joblib.load("neighbors.pkl")

app = FastAPI(version="2.0", title="Movie Recommender API")

# Response schema for GET /recommend (used as that route's response_model).
class RecommendResponse(BaseModel):
    query: str  # the title that was asked for, echoed back
    recommendations: List[str]  # titles of the recommended movies

@app.get("/recommend", response_model=RecommendResponse)
def recommend(
    title: str = Query(..., description="Exact movie title"),
    top_n: int = Query(5, ge=0, description="Maximum number of recommendations to return"),
):
    """Return up to ``top_n`` precomputed recommendations for ``title``.

    Responds with 404 (plus up to five titles containing the query text) when
    the exact title is unknown, and with 422 when ``top_n`` is negative.
    """
    if title not in neighbors:
        # regex=False: the query is user input and must be matched literally.
        # Interpreted as a pattern, a title such as "Mission (" raised re.error
        # and surfaced as HTTP 500 instead of this 404.
        suggestions = movies[movies['title'].str.contains(title, case=False, na=False, regex=False)]['title'].head(5).tolist()
        raise HTTPException(status_code=404, detail={
            "error": f"Title '{title}' not found.",
            "suggestions": suggestions
        })

    # top_n is validated as >= 0 above; a negative value would make this slice
    # drop items from the end of the list instead of limiting the count.
    recs = neighbors[title][:top_n]
    return {"query": title, "recommendations": recs}

# Liveness probe: takes no input and always answers with the same payload.
@app.get("/health")
def health():
    return dict(status="ok")

if __name__ == "__main__":
    # Direct-execution entry point (`python app.py`): serve on all interfaces,
    # port 8000, with auto-reload enabled.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000, "reload": True}
    uvicorn.run("app:app", **server_options)
PY

echo "app.py updated (using neighbors)"


app.py updated (using neighbors)


In [1]:
!kill -9 $(lsof -t -i:8000)


kill: usage: kill [-s sigspec | -n signum | -sigspec] pid | jobspec ... or kill -l [sigspec]


In [2]:
%%bash
# Relaunch the API in the background, detached from this cell; stdout and
# stderr are both written to uvicorn.log (the previous log is overwritten).
nohup uvicorn app:app --host 0.0.0.0 --port 8000 &> uvicorn.log &
printf '%s\n' "new uvicorn started"


new uvicorn started


In [3]:
import requests, json, time
time.sleep(2)  # fixed pause so the restarted server has time to come up

movie = "Iron Man 3"
# timeout=10 matches the other request cells: without a timeout, requests
# waits indefinitely if the server accepts the connection but never answers.
r = requests.get("http://127.0.0.1:8000/recommend", params={"title": movie, "top_n":5}, timeout=10)
print("status:", r.status_code)
print(json.dumps(r.json(), indent=2))


status: 200
{
  "query": "Iron Man 3",
  "recommendations": [
    "Iron Man 2",
    "Iron Man",
    "Avengers: Age of Ultron",
    "Captain America: Civil War",
    "The Avengers"
  ]
}
