In [46]:
import pandas as pd
import numpy as np 
import google.generativeai as genai
from langchain_core.prompts import PromptTemplate

In [47]:
# Load the streaming-platform movie catalogue from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer="MoviesOnStreamingPlatforms_updated.csv")

In [48]:
# Count missing values per column to see which fields need filling.
df.isna().sum()

ID                     0
Title                  0
Year                   0
Age                 9390
IMDb                 571
Rotten Tomatoes    11586
Netflix                0
Hulu                   0
Prime Video            0
Disney+                0
Type                   0
Directors            726
Genres               275
Country              435
Language             614
Runtime              592
dtype: int64

In [49]:
# Peek at the first five rows to get a feel for the column formats.
df.head(5)

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,1,Inception,2010,13+,8.8,87%,1.0,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,2,The Matrix,1999,18+,8.7,87%,1.0,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,3,Avengers: Infinity War,2018,13+,8.5,84%,1.0,0,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,4,Back to the Future,1985,7+,8.5,96%,1.0,0,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1.0,0,1,0,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


In [50]:
# List the distinct age ratings (including NaN) that appear in the data.
df.loc[:, "Age"].unique()

array(['13+', '18+', '7+', nan, 'all', '16+'], dtype=object)

In [51]:
import json
# Convert the first row into a plain {column name: value} dict so it can be
# edited and serialised as a sample input for the prompt.
row_as_dict = df.iloc[0].to_dict()
# Bare last expression: display the dict as the cell output.
row_as_dict

{'ID': 1,
 'Title': 'Inception',
 'Year': 2010,
 'Age': '13+',
 'IMDb': 8.8,
 'Rotten Tomatoes': '87%',
 'Netflix': 1.0,
 'Hulu': 0,
 'Prime Video': 0,
 'Disney+': 0,
 'Type': 0,
 'Directors': 'Christopher Nolan',
 'Genres': 'Action,Adventure,Sci-Fi,Thriller',
 'Country': 'United States,United Kingdom',
 'Language': 'English,Japanese,French',
 'Runtime': 148.0}

In [52]:
# Simulate missing data: blank out two fields of the sample row.
# NOTE: this mutates row_as_dict in place, so the dict shown by the earlier
# cell no longer matches the current state.
row_as_dict.update({"Directors": None, "IMDb": None})

In [53]:
# Serialise the edited sample row; None values become JSON null.
row_as_json = json.dumps(row_as_dict)
row_as_json

'{"ID": 1, "Title": "Inception", "Year": 2010, "Age": "13+", "IMDb": null, "Rotten Tomatoes": "87%", "Netflix": 1.0, "Hulu": 0, "Prime Video": 0, "Disney+": 0, "Type": 0, "Directors": null, "Genres": "Action,Adventure,Sci-Fi,Thriller", "Country": "United States,United Kingdom", "Language": "English,Japanese,French", "Runtime": 148.0}'

In [54]:
# Prompt text for the missing-value filler.
# - `{row_json}` is the only placeholder; it receives one row serialised as JSON.
# - Literal braces in the JSON examples are doubled (`{{` / `}}`) so that
#   str.format-style templating leaves them intact.
# - The example output uses a JSON number for IMDb so the model mirrors the
#   numeric type of that column instead of returning a quoted string.
fill_missing_value_prompt = """
You are a highly skilled data analyst with extensive experience in handling and processing movie datasets. Your expertise lies in identifying missing values and providing insights on how to fill those gaps effectively while ensuring data integrity and accuracy.

Your task is to assist in filling missing values in a movie dataset.

The runtime will be in minutes

The Age will be these ['13+', '18+', '7+', 'all', '16+']
**Input Type**: 
A dictionary of key-value pairs where the keys represent column names and the values represent the available data. Some values may be missing (represented as null or None).

**Output Type**: 
A JSON object that contains only the missing values, filled in with appropriate suggestions based on the dataset’s context. You are expected to maintain data consistency and offer plausible replacements where possible.

### Example Input:

{{
    "ID": 1,
    "Title": "Inception",
    "Year": 2010,
    "Age": "13+",
    "IMDb": null,
    "Rotten Tomatoes": "87%",
    "Netflix": 1.0,
    "Hulu": 0,
    "Prime Video": 0,
    "Disney+": 0,
    "Type": 0,
    "Directors": null,
    "Genres": "Action, Adventure, Sci-Fi, Thriller",
    "Country": "United States, United Kingdom",
    "Language": "English, Japanese, French",
    "Runtime": 148.0
}}

###Example Output
{{
    "IMDb": 8.8,
    "Directors": "Christopher Nolan"
}}
You have to check for null values and return json with actual values with the exact column name and JSON only.

Give output for the following {row_json} only in json
JUST THE JSON NOTHING ELSE 
"""

In [55]:
# Wrap the raw prompt text in a LangChain template whose single variable,
# `row_json`, is substituted when the template is formatted.
fill_values_template = PromptTemplate(
    template=fill_missing_value_prompt,
    input_variables=["row_json"],
)

In [56]:
# `env` is imported for its `api_key` attribute (presumably a local,
# uncommitted env.py that keeps the secret out of the notebook; confirm).
import env

# Authenticate the Gemini client, then create the model handle used for generation.
genai.configure(api_key=env.api_key)
model = genai.GenerativeModel(model_name="gemini-pro")

In [57]:
# Preview the JSON string produced for the first row; fill_row uses the same
# `to_json()` call to build the text inserted into the prompt.
df.iloc[0].to_json()

'{"ID":1,"Title":"Inception","Year":2010,"Age":"13+","IMDb":8.8,"Rotten Tomatoes":"87%","Netflix":1.0,"Hulu":0,"Prime Video":0,"Disney+":0,"Type":0,"Directors":"Christopher Nolan","Genres":"Action,Adventure,Sci-Fi,Thriller","Country":"United States,United Kingdom","Language":"English,Japanese,French","Runtime":148.0}'

In [58]:
def fill_row(model, row):
    """Ask the model to suggest values for the missing fields of one row.

    Parameters
    ----------
    model
        Object exposing ``generate_content(contents=...)`` whose result has a
        ``.text`` attribute (in this notebook, the Gemini ``GenerativeModel``).
    row
        Object exposing ``to_json()``, e.g. one DataFrame row (``df.iloc[i]``).

    Returns
    -------
    str
        The model's reply with surrounding whitespace stripped. The prompt
        asks for JSON only, but the reply is not parsed or validated here.
    """
    # Render the prompt with this row's JSON in the `row_json` slot.
    prompt = fill_values_template.format(row_json=row.to_json())
    res = model.generate_content(contents=prompt)
    # The original ended with a bare `res` expression, which inside a function
    # is discarded, so callers always got None. Return the text explicitly.
    return res.text.strip()

'{"Age": "13+"}'

'{"Age": "13+"}'