# IBMD Data Analysis
### Planned Action

--- Define the business problem
    ¬¬ Come up with a possible business questions
--- Import data
    ¬¬ Write out a short metedata description for each table
    ¬¬ Identify possible tables from the imported data that will answer the business question
--- Inspect the chosen dataset
    ¬¬ Outline possible problems that could occur with the data
    ¬¬ Profiling the data 
--- Transform the chosen dataset
    ¬¬ Cleaning the data
    ¬¬ Enrich the data if needed to meet business needs
--- Verify the transformed dataset
--- Build the data pipeline - ELT or ETL



In [None]:
# ---- Let start by inspecting the data we are dealing with.


import pandas as pd

tables = {
    "https://raw.githubusercontent.com/mansik95/IMDB-Analysis/master/Data/MovieLens_movies.csv": "movies_Id",
    "https://raw.githubusercontent.com/mansik95/IMDB-Analysis/master/Data/IMDb%20BoxOfficeMojo%20-%20Brands%20(US%20%26%20Canada).tsv": "brands_US_and_Canada",
    "https://raw.githubusercontent.com/mansik95/IMDB-Analysis/master/Data/IMDb%20BoxOfficeMojo%20-%20Brand_%20Marvel%20Comics.tsv": "brand_marvel_comics",
    "https://raw.githubusercontent.com/mansik95/IMDB-Analysis/master/Data/The%20Numbers%20-%20Domestic%20Box%20Office%20Daily%20-%20The%20Avengers.tsv": "Domestic_Box_Office_Daily_The_Avengers",
    "https://raw.githubusercontent.com/mansik95/IMDB-Analysis/master/Data/The%20Numbers%20-%20Domestic%20Box%20Office%20-%20Franchises.tsv": "Domestic_Box_Office_Franchises",
    "https://raw.githubusercontent.com/mansik95/IMDB-Analysis/master/Data/The%20Numbers%20-%20Domestic%20Box%20Office%20-%20Franchises%20-%20Marvel%20Cinematic%20Universe.tsv": "Domestic_Box_Office_Franchises_Marvel_Cinematic",
    "https://raw.githubusercontent.com/mansik95/IMDB-Analysis/master/Data/World%20Wide%20Box%20Office%20All%20Time%20Top%201000.tsv": "World_Wide_Box_Office_All_Time_Top_1000",
    "https://raw.githubusercontent.com/mansik95/IMDB-Analysis/master/Data/IMDb%20BoxOfficeMojo%20-%20Franchises%20(US%20%26%20Canada).tsv": "Franchises_us_and_Canada",
    "https://raw.githubusercontent.com/mansik95/IMDB-Analysis/master/Data/IMDb%20BoxOfficeMojo%20-%20Franchise_%20top20.tsv": "top_20_for_each_Franchise",
    "https://raw.githubusercontent.com/mansik95/IMDB-Analysis/master/Data/MovieLens_tags.csv": "tags"
}
# Dictionary to store DataFrames
Movies_dataframes = {}

for url, table_name in tables.items():
    if url.endswith('.csv'):
        Movies_dataframes[table_name] = pd.read_csv(url, engine="python")
        print(table_name.upper())
        print(Movies_dataframes[table_name].head(2))
    elif url.endswith('.tsv'):
        Movies_dataframes[table_name] = pd.read_csv(url, delimiter='\t', engine="python")
        print(table_name.upper())
        print(Movies_dataframes[table_name].head(2))



#### Required tables to answer business

--- DOMESTIC_BOX_OFFICE_FRANCHISES_MARVEL_CINEMATIC
--- DOMESTIC_BOX_OFFICE_FRANCHISES
--- WORLD_WIDE_BOX_OFFICE_ALL_TIME_TOP_1000
--- TOP_20_FOR_EACH_FRANCHISE

In [None]:
# ---- Let us perform a structure, data type profiling of the datasets
for table_name, table in Movies_dataframes.items():
    frame = pd.DataFrame(table)
    print(table_name.upper())
    print(frame.info())

--- list of tables that require transformation 
DOMESTIC_BOX_OFFICE_FRANCHISES

    Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 2   Domestic_Box_Office      867 non-null    object ---- remove the dollar sign and convert type to int
 3   Infl_Adj_Dom_Box_Office  867 non-null    object to int ---- remove the dollar sign and convert type to int
 4   Worldwide_Box_Office     867 non-null    object to int ---- remove the dollar sign and convert type to int

DOMESTIC_BOX_OFFICE_FRANCHISES_MARVEL_CINEMATIC

    Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 2   Production_Budget     23 non-null     object to int ---- remove the dollar sign and convert type to int
 3   Opening_Weekend       23 non-null     object to int ---- remove the dollar sign and convert type to int
 4   Domestic_Box_Office   23 non-null     object to int ---- remove the dollar sign and convert type to int
 5   Worldwide_Box_Office  23 non-null     object to int ---- remove the dollar sign and convert type to int

TOP_20_FOR_EACH_FRANCHISE

    Column          Non-Null Count  Dtype 
 
 3   Lifetime_Gross  281 non-null    object to int ---- remove the dollar sign and convert type to int
 4   Max_Theaters    281 non-null    object to int ---- remove the comma sign and convert type to int
 5   Opening_Gross   281 non-null    object to int ---- remove the dollar sign and convert type to int
 6   Open_Theaters   281 non-null    object to int ---- remove the comma sign and convert type to int


In [None]:
# --- Check Table columns' that needs what type of transformation,
table_to_transform = Movies_dataframes['Table_Name']
column_to_transform = table_to_transform.loc[:, 'column_name']
print(column_to_transform.head(2))

In [None]:
nan_values = df.isna()  # or df.isnull()

# Count NaN values in the entire DataFrame
count_nan_values = df.isna().sum().sum()

In [None]:
# Check for NaN values in a column
nan_values = df['column_name'].isna()  # or df['column_name'].isnull()

# Count NaN values in a column
count_nan_values = df['column_name'].isna().sum() 

In [18]:
# --- Table columns' that needs transformation,
import numpy as np

columns_to_remove_the_dollar_sign = {
    'Domestic_Box_Office_Franchises': ['Domestic_Box_Office', 'Infl_Adj_Dom_Box_Office', 'Worldwide_Box_Office'],
    'Domestic_Box_Office_Franchises_Marvel_Cinematic': ['Production_Budget', 'Opening_Weekend', 'Domestic_Box_Office', 'Worldwide_Box_Office'],
    'top_20_for_each_Franchise': ['Lifetime_Gross','Opening_Gross','Max_Theaters']
}



for table_name, columns in columns_to_remove_the_dollar_sign.items():
    for column in columns:
        print(column)
        if Movies_dataframes[table_name][column].dtype == 'object':
            Movies_dataframes[table_name][column] = Movies_dataframes[table_name][column].str.replace('$', '').str.replace(',', '')






Domestic_Box_Office
Infl_Adj_Dom_Box_Office
Worldwide_Box_Office
Production_Budget
Opening_Weekend
Domestic_Box_Office
Worldwide_Box_Office
Lifetime_Gross
Opening_Gross
Max_Theaters


In [13]:
table_to_transform = Movies_dataframes['Domestic_Box_Office_Franchises']
column_to_transform = table_to_transform.loc[:, 'Domestic_Box_Office']
print(column_to_transform.info())

<class 'pandas.core.series.Series'>
RangeIndex: 867 entries, 0 to 866
Series name: Domestic_Box_Office
Non-Null Count  Dtype
--------------  -----
867 non-null    int64
dtypes: int64(1)
memory usage: 6.9 KB
None


In [None]:
# Perform the inner query to calculate Total and number of Movies per Franchise
inner_query = Movies_dataframes['top_20_for_each_Franchise'].groupby('Franchise').agg(
    Total=('Lifetime_Gross', 'sum'),
    numbers_of_Movies=('Lifetime_Gross', 'count')
)

# Calculate AVG_Revenue_per_Movies
inner_query['AVG_Revenue_per_Movies'] = inner_query['Total'] / inner_query['numbers_of_Movies']

# Sort the DataFrame by AVG_Revenue_per_Movies in descending order
result = inner_query.sort_values(by='AVG_Revenue_per_Movies', ascending=False)

# Display the top 20 results
result.head(20)


--
dataframes[table_name][column] = dataframes[table_name][column].str.replace('$', '') 

SELECT CAST(REPLACE(REPLACE(column, '$', ''), ',', '') AS INTEGER) as column_name
FROM table_name;

In [93]:
K = Movies_dataframes.__sizeof__()
F = r"C:\Users\tse\27-03-24\Full-Stack-IMBD-Data-Analysis\movie_data.db"
import os

file_size = os.path.getsize(F)
print(file_size)
print(K)

37478400
256
