In [None]:
from utils import summarize_dataset, pd_config
import pandas as pd
import sqlite3
import requests
from typing import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline  


In [None]:
def getData(title):
    """Look up a movie by exact title on the MoviesDatabase RapidAPI and fetch its ratings.

    Two requests are made: a title search (``exact=true``, ``titleType=movie``)
    and, when the search has at least one hit, a ratings lookup for the first hit.

    Args:
        title: Movie title to search for. It is percent-encoded before being
            placed in the URL path, so titles containing ``/``, ``?`` or ``#``
            are looked up as written instead of altering the request path.

    Returns:
        ``{title: <"results" value of the ratings response>}`` when the search
        returns a hit, otherwise ``None`` (a message is printed in that case).

    Raises:
        KeyError: If the ratings response carries no ``"results"`` key.
        requests.exceptions.RequestException: On network failure or timeout.
        ValueError: If a response body is not valid JSON.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    base_url = "https://moviesdatabase.p.rapidapi.com/titles"
    querystring = {"exact": "true", "titleType": "movie"}
    headers = {
        # NOTE: supply your own RapidAPI key here; keep real keys out of the notebook.
        "X-RapidAPI-Key": "",
        "X-RapidAPI-Host": "moviesdatabase.p.rapidapi.com"
    }
    timeout_seconds = 30  # never let a single stalled request hang the whole run

    # safe="" also encodes "/", which would otherwise split the URL path.
    encoded_title = quote(str(title), safe="")
    search_url = f"{base_url}/search/title/{encoded_title}"
    response = requests.get(search_url, headers=headers, params=querystring,
                            timeout=timeout_seconds)
    search_data = response.json()

    if not ("results" in search_data and search_data["results"]):
        print("Film bulunamadı.")
        return None

    # Request the ratings using the "id" of the first movie found.
    movie_id = search_data["results"][0]["id"]
    ratings_url = f"{base_url}/{movie_id}/ratings"
    response2 = requests.get(ratings_url, headers=headers, timeout=timeout_seconds)
    ratings_data = {title: response2.json()['results']}
    print(ratings_data)
    return ratings_data
    

In [None]:
def showGraphs(dataframe, text, limit=10):
    """Show a bar chart of the first ``limit`` rows of ``dataframe``.

    The first column provides the bar labels and the second column the bar
    heights. Rows are used in their existing order; no sorting happens here.

    Args:
        dataframe: Frame whose first two columns are (label, value).
        text: Title placed above the figure.
        limit: Number of leading rows to plot (default 10).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    top_rows = dataframe.head(limit)
    labels, heights = top_rows.iloc[:, 0], top_rows.iloc[:, 1]

    plt.figure(figsize=(10, 6))
    rectangles = plt.bar(labels, heights, color='skyblue')
    plt.suptitle(text)
    plt.xticks(rotation=45)

    # Write each bar's height just above the bar, centred horizontally.
    for rect in rectangles:
        height = rect.get_height()
        x_center = rect.get_x() + rect.get_width() / 2
        plt.text(x_center, height, height, ha='center', va='bottom')

    plt.tight_layout()
    plt.show()


In [None]:
def createClouds(dataframe, category_list,stopwords):
    """Draw one word cloud per category from the descriptions of matching titles.

    A row belongs to a category when the category equals one of the
    comma-separated entries of its ``listed_in`` value (whitespace stripped).
    Exact matching keeps, for example, "Dramas" from also collecting every
    "TV Dramas" row, which a substring/regex match would do.

    Args:
        dataframe: Frame with ``listed_in`` and ``description`` columns.
        category_list: Iterable of category names; one figure is drawn per name.
        stopwords: Words handed to ``WordCloud`` as its ``stopwords`` argument.

    Returns:
        None. Each figure is displayed with ``plt.show()``. Categories with no
        description text are reported and skipped.
    """
    def _genres(listed_in):
        # Missing (non-string) values belong to no category.
        if not isinstance(listed_in, str):
            return frozenset()
        return frozenset(part.strip() for part in listed_in.split(','))

    # Parse every row once instead of once per category.
    genres_per_row = dataframe['listed_in'].map(_genres)

    for category in category_list:
        # astype(bool) keeps the mask boolean even when the frame is empty.
        in_category = genres_per_row.map(lambda genres: category in genres).astype(bool)
        filtered_df = dataframe[in_category]
        text = ' '.join(filtered_df['description'].dropna().astype(str))

        # WordCloud.generate raises on empty text; skip instead of aborting the loop.
        if not text.strip():
            print(f"No descriptions for category {category!r}; skipping word cloud.")
            continue

        wordcloud = WordCloud(width=800, height=400, background_color='white',stopwords=stopwords).generate(text)

        # Create a new figure for each word cloud
        plt.figure(figsize=(10, 5))

        # Display the category as a title above the entire figure
        plt.suptitle(category, fontsize=12)

        # Display the word cloud
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()



In [None]:
def plot_cloud(words):
    """Show ``words`` with ``plt.imshow`` on a 40x30 inch figure with the axes hidden.

    Args:
        words: Anything ``plt.imshow`` accepts (presumably a generated word
            cloud image, judging by the function name).
    """
    canvas_size = (40, 30)
    plt.figure(figsize=canvas_size)
    # Hide ticks and frame before drawing the image.
    plt.axis("off")
    plt.imshow(words)

In [None]:
# Apply the project's pandas settings via utils.pd_config
# (presumably display options; TODO confirm against utils).
pd_config()

In [None]:
# Directory prefix for the input CSV; empty means the current working directory.
path = ""
# Open (or create) the SQLite database file used for all tables in this notebook.
conn = sqlite3.connect("mydatabase.db")


In [None]:
# Load the raw titles CSV located under `path`.
data = pd.read_csv(path+"netflix_titles.csv")

In [None]:
# Run the project helper with duplicate- and NA-dropping switched on.
# Its exact behaviour lives in utils.summarize_dataset (not visible here);
# presumably it returns the cleaned frame. TODO confirm.
new_data=summarize_dataset(data=data,drop_duplicates=True,drop_na=True)

In [None]:
# Persist the cleaned frame as table "netflix", replacing any existing table
# and leaving out the DataFrame index.
new_data.to_sql("netflix", conn, if_exists="replace", index=False)

In [None]:
# Work from the database copy rather than re-reading the local CSV every time.
# SQLite matches table names case-insensitively, so "Netflix" resolves to the
# "netflix" table.
dataframe = pd.read_sql("SELECT * FROM Netflix",conn)

In [None]:
# Distinct rating codes, in the order they first appear in the data.
rate=dataframe["rating"].unique()

# Meaning of each rating code (MPA film ratings and US TV Parental Guidelines).
# Keyed by code so a description no longer depends on the order, or the
# number, of codes that happen to be present in the data.
rating_meanings = {
    'PG': 'Parental Guidance Suggested',
    'TV-14': 'Parents Strongly Cautioned',
    'R': 'Restricted',
    'TV-MA': 'Mature Audience Only',
    'NR': 'Not Rated',
    'PG-13': 'Parents Strongly Cautioned - 13 and older',
    'TV-PG': 'Parental Guidance Suggested',
    'TV-Y7': 'For Ages 7 and Up',
    'G': 'General Audiences',
    'TV-G': 'General Audience - All Ages',
    'UR': 'Unrated',
    'TV-Y7-FV': 'For Ages 7 and Up - Fantasy Violence',
    'TV-Y': 'All Children',
    'NC-17': 'Adults Only',
}

# One description per entry of `rate`, aligned by position; codes that are not
# in the table are labelled 'Unknown' so the two lists always match in length.
details = [rating_meanings.get(code, 'Unknown') for code in rate]


In [None]:
# Lookup table pairing each rating code with its description; `rate` and
# `details` must have the same length and be aligned position by position.
ratings = pd.DataFrame({'rate': rate, 'details': details})   

In [None]:
# Store the rating lookup as table "ratings", replacing any existing table.
ratings.to_sql("ratings", conn, if_exists="replace", index=False)

In [None]:
# Accumulates the raw getData() results (dicts or None).
# Re-running this cell discards everything collected so far.
freeList=[]

In [None]:
# Fetch rating data for every title through the API.
# Warning: the free API tier allows only a limited number of requests per
# hour, so a full run may have to wait and be resumed.
for index, row in new_data.iterrows():
    try:
        rating_payload = getData(row["title"])
    except KeyError:
        # A response without the expected keys stops the run.
        print("Keyerror")
        break
    else:
        freeList.append(rating_payload)
        # Progress marker, keyed on the frame's index label.
        if index % 100 == 0:
            print("index:", str(index))

In [None]:
# Flatten the raw API results into single-entry {title: rating_info} dicts.
# Dropped along the way:
#   * None entries - getData returned nothing (e.g. once the API limit is hit);
#   * titles whose rating payload is None (the movie has no rating).
dictList = [
    {movie: rating_info}
    for api_result in freeList
    if api_result is not None
    for movie, rating_info in api_result.items()
    if rating_info is not None
]

dictList

In [None]:
# Turn each {title: rating_info} dict into one row:
# [title, tconst, averageRating, numVotes]. Missing fields become None.
rating_fields = ('tconst', 'averageRating', 'numVotes')

data_list = []
for rating_entry in dictList:
    if rating_entry:  # ignore empty dicts
        movie = next(iter(rating_entry))  # the single key is the movie title
        movie_info = rating_entry[movie]
        data_list.append([movie] + [movie_info.get(field, None) for field in rating_fields])

detailedRatings = pd.DataFrame(data_list, columns=['Movie', 'tconst', 'averageRating', 'numVotes'])
detailedRatings

In [None]:
# Persist the per-movie rating details as table "ratingDetails", replacing any existing table.
detailedRatings.to_sql("ratingDetails", conn, if_exists="replace", index=False)


In [None]:

# Reload the rating details from the database, so later cells can start here
# without repeating the API calls.
detailedRatings = pd.read_sql("SELECT * FROM ratingDetails",conn) 

In [None]:
# Only a few titles carry the "UR" rating. According to the author's manual
# check they are all NC-17, so relabel them in the in-memory frame.
# NOTE(review): the "ratings" lookup table was built before this relabelling.
dataframe.loc[dataframe["rating"] == "UR", "rating"] = "NC-17" 


In [None]:
# Raw "country" column; the next cell splits each value on commas, so one
# entry may name several countries.
countries = dataframe["country"]

In [None]:
# Split every row's comma-separated country list into individual names.
# Splitting on "," and stripping each piece (the same way categories are
# parsed) tolerates a missing space after a comma and stray leading or
# trailing commas, which split(", ") would turn into names such as "France,"
# or into empty strings. Non-string (missing) values are skipped.
# Despite its name, `unique_countries` holds one item per occurrence.
unique_countries = [
    name.strip()
    for entry in countries
    if isinstance(entry, str)
    for name in entry.split(",")
    if name.strip()
]

ct = pd.DataFrame({"Country": unique_countries})

# Number of titles per country, most frequent first.
country_counts = ct["Country"].value_counts().reset_index()
# Assigned by position, so the result is the same whatever names pandas
# gives the two columns after reset_index().
country_counts.columns = ["Country", "Count"]

print(country_counts)
ct = pd.DataFrame(country_counts)

In [None]:
# Store the per-country counts as table "country", replacing any existing table.
ct.to_sql("country",conn,if_exists="replace",index=False)

In [None]:
# Raw "listed_in" column: each value is a comma-separated list of categories.
tempDf = dataframe["listed_in"]

In [None]:
# Count how often each category occurs. Every `listed_in` value is split on
# commas and each piece is stripped of leading/trailing whitespace.
category_counts = Counter()
category_counts.update(
    piece.strip()
    for listing in tempDf
    for piece in listing.split(",")
)

# The distinct category names are exactly the counter's keys.
categories = set(category_counts)


In [None]:
# Category names in first-seen order (the counter's insertion order).
category_list=list(category_counts.keys())
# Alphabetically sorted copy; this is the list passed to createClouds.
sorted_categories = sorted(category_counts.keys())
sorted_categories

In [None]:
url = "https://gist.githubusercontent.com/ZohebAbai/513218c3468130eacff6481f424e4e64/raw/b70776f341a148293ff277afa0d0302c8c38f7e2/gist_stopwords.txt"

# Download the comma-separated stop-word list and keep a local copy.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error rather than saving an error page as stop words.
response.raise_for_status()

# Explicit UTF-8 so the file round-trips identically on every platform.
with open("gist_stopwords.txt", "w", encoding="utf-8") as file:
    file.write(response.text)
with open("gist_stopwords.txt", "r", encoding="utf-8") as file:
    content = file.read()

# Strip whitespace (e.g. the file's trailing newline) from every token and
# drop empty tokens, so each stop word matches exactly.
stopwords = [word.strip() for word in content.split(",") if word.strip()]

In [None]:
# Display the parsed stop-word list for inspection.
stopwords

In [None]:
# Draw one word cloud per category, in alphabetical order.
createClouds(dataframe, sorted_categories,stopwords)

In [None]:
# Tabulate the category counts as a two-column frame (Category, Count),
# most frequent category first. The counter keys arrive as the index, so
# they are moved into a regular column and renamed.
categoryDf = (
    pd.DataFrame.from_dict(category_counts, orient='index', columns=['Count'])
    .reset_index()
    .rename(columns={'index': 'Category'})
    .sort_values(by="Count", ascending=False)
)

In [None]:
# Bar chart of the ten most frequent categories: categoryDf is sorted by
# Count descending and showGraphs plots the first 10 rows by default.
text="Top 10 Movie Types"
showGraphs(categoryDf,text=text)

In [None]:
# Bar chart of the ten most frequent countries: `ct` comes from value_counts(),
# which orders by count descending, and showGraphs plots the first 10 rows by default.
text="Top 10 Countries"
showGraphs(ct,text=text)