# Project 2 Part 3A - EDA

*Christina Brockway*

#### Business Issues

-  Use TMDB database
-  Extract budget, revenue, and MPAA Rating (Certification) data
-  Perform test extraction on movies from 2001 and 2002
-  Combine final API data into 1 dataframe
-  Create visualizations to answer questions

### Imports

In [None]:
#Import packages
import os, time, json
import tmdbsimple as tmdb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm.notebook import tqdm_notebook

#Set configs
pd.set_option('display.max_column', None)

### Define custom functions


In [None]:
# Explore Categorical Data

def explore_categorical(df, x, fillna = True, placeholder = 'MISSING',
                        figsize = (6,4), order = None):
    """Plot a count plot for a categorical column and print a short summary.

    Source: https://login.codingdojo.com/m/606/13765/117604

    Args:
        df: DataFrame containing the column to explore (it is copied, not mutated).
        x: Name of the categorical column in ``df``.
        fillna: If truthy, nulls in ``x`` are replaced with ``placeholder``
            before plotting.
        placeholder: Label used for null values when ``fillna`` is truthy.
        figsize: Figure size passed to ``plt.subplots``.
        order: Optional category order passed to ``sns.countplot``.

    Returns:
        Tuple ``(fig, ax)`` for the count plot.
    """
    # Work on a copy so the caller's dataframe is never modified
    temp_df = df.copy()
    # Before filling nulls, save null value counts and percent for printing
    null_count = temp_df[x].isna().sum()
    null_perc = null_count / len(temp_df) * 100
    # Fill nulls with the placeholder so they show up as their own bar
    if fillna:
        temp_df[x] = temp_df[x].fillna(placeholder)
    # Create figure with desired figsize
    fig, ax = plt.subplots(figsize=figsize)
    # Plotting a count plot
    sns.countplot(data=temp_df, x=x, ax=ax, order=order)
    # Labels for barplot values
    for container in ax.containers:
        ax.bar_label(container)
    # Fix the tick positions before relabelling them (same workaround as in
    # plot_categorical_vs_target); otherwise matplotlib warns that
    # set_xticklabels is being used without a fixed set of ticks
    ax.set_xticks(ax.get_xticks())
    # Rotate tick labels for long names
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    # Add a title with the feature name included
    ax.set_title(f"Column: {x}")

    # Fix layout and show plot (before print statements)
    fig.tight_layout()
    plt.show()

    # Print null value info
    print(f"- NaN's Found: {null_count} ({round(null_perc,2)}%)")
    # Print cardinality info
    nunique = temp_df[x].nunique()
    print(f"- Unique Values: {nunique}")
    # First find value counts of feature (sorted most-frequent first)
    val_counts = temp_df[x].value_counts(dropna=False)
    # Most common value and how often it occurs
    most_common_val = val_counts.index[0]
    freq = val_counts.values[0]
    # Calculate the percentage of the most common value
    perc_most_common = freq / len(temp_df) * 100
    # Print the results
    print(f"- Most common value: '{most_common_val}' occurs {freq} times ({round(perc_most_common,2)}%)")
    # Print message if quasi-constant or constant (most common val more than 98% of data)
    if perc_most_common > 98:
        print(f"\n- [!] Warning: '{x}' is a constant or quasi-constant feature and should be dropped.")
    else:
        print("- Not constant or quasi-constant.")
    return fig, ax



*Source for barplot labels:  https://stackoverflow.com/questions/55104819/display-count-on-top-of-seaborn-barplot*

In [None]:
# Updated plot_categorical_vs_target function that includes option for classification task
def plot_categorical_vs_target(df, x, y, figsize=(6,4),
                            fillna = True, placeholder = 'MISSING',
                            order = None, target_type='reg'):
    """Plot a categorical feature against a target column.

    Args:
        df: DataFrame containing both columns (it is copied, not mutated).
        x: Name of the categorical feature column.
        y: Name of the target column.
        figsize: Figure size passed to ``plt.subplots``.
        fillna: If truthy, nulls in ``x`` are replaced with ``placeholder``;
            otherwise rows with a null ``x`` are dropped.
        placeholder: Label used for null values when ``fillna`` is truthy.
        order: Optional category order for the regression plots.
        target_type: ``'reg'`` draws a bar plot of ``y`` per category with a
            strip plot overlaid; ``'class'`` draws a filled percent histogram
            of ``x`` coloured by ``y``. Any other value leaves the axes
            without data.

    Returns:
        Tuple ``(fig, ax)`` for the plot.
    """
    # Work on a copy so the caller's dataframe is never modified
    temp_df = df.copy()
    if fillna:
        # Fill nulls with the placeholder so they form their own category
        temp_df[x] = temp_df[x].fillna(placeholder)
    else:
        # Or drop nulls to prevent an unwanted 'nan' group in the stripplot
        temp_df = temp_df.dropna(subset=[x])
    # Create the figure and subplots
    fig, ax = plt.subplots(figsize=figsize)
    # REGRESSION-TARGET PLOT
    if target_type=='reg':
        # Barplot
        sns.barplot(data=temp_df, x=x, y=y, ax=ax, order=order, alpha=0.6,
                    linewidth=1, edgecolor='black', errorbar=None)
        # Stripplot of the individual observations, drawn behind the bars
        sns.stripplot(data=temp_df, x=x, y=y, hue=x, ax=ax,
                      order=order, hue_order=order, legend=False,
                      edgecolor='white', linewidth=0.5,
                      size=3,zorder=0)
    # CLASSIFICATION-TARGET PLOT
    elif target_type=='class':
        # Draw explicitly on the axes created above instead of relying on
        # matplotlib's implicit "current axes"
        sns.histplot(data=temp_df, hue=y, x=x, stat='percent',
                     multiple='fill', ax=ax)
    # Fix the tick positions before relabelling them; otherwise matplotlib
    # warns that set_xticklabels is used without a fixed set of ticks
    ax.set_xticks(ax.get_xticks())
    # Rotate xlabels
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    # Add a title
    ax.set_title(f"{x} vs. {y}")
    fig.tight_layout()
    return fig, ax


### Load data

In [None]:
# Load the 2001 movie data and preview the first two rows.
# NOTE(review): the file path in the read_csv call below is missing (the
# string literal is unterminated), so this cell cannot run as written -
# TODO restore the path to the 2001 data file.
df2001=pd.read_csv("
df2001.head(2)

In [None]:
# Load the 2002 movie data and preview the first two rows.
# NOTE(review): the file path in the read_csv call below is missing (the
# string literal is unterminated), so this cell cannot run as written -
# TODO restore the path to the 2002 data file.
df2002=pd.read_csv("
df2002.head(2)

In [2]:
# Combine the 2001 and 2002 extracts into a single dataframe.
# (Previously df2001 was listed twice, so the 2002 data was never included.)
df_to_combine = [df2001, df2002]
df = pd.concat(df_to_combine)
df.info()

NameError: name 'df2001' is not defined

### Inspect the Data

In [None]:
df.head(5), df.tail(5)

In [None]:
#Display the number of rows and columns
df.shape

In [None]:
# Display statistics
df.describe()


In [None]:
#Display non-null values and dtypes
df.info()

## Exploratory Data Analysis

### Questions:

**How many movies had at least some valid financial information?**
-  "Valid" means a value > 0 for budget and/or a value > 0 for revenue

In [None]:
# Keep movies with at least some valid financial information:
# budget > 0 and/or revenue > 0
has_financials = (df['budget'] > 0) | (df['revenue'] > 0)
df_budget = df.loc[has_financials]
df_budget

**How many movies are there in each of the certification categories?**

In [None]:
# explore_categorical takes the dataframe and the column NAME, not a Series
explore_categorical(df_budget, 'certification');

*Source:  https://stackoverflow.com/questions/30482071/how-to-calculate-mean-values-grouped-on-another-column*

**What is the average revenue per certification category?**

In [None]:
df.groupby('certification', as_index=False)['revenue'].mean()

# plot_categorical_vs_target expects column names (it indexes the dataframe
# itself and uses the names in the plot title), not Series objects
x = 'certification'
y = 'revenue'
plot_categorical_vs_target(df_budget, x, y);

**What is the average budget per certification category?**

In [None]:
df.groupby('certification', as_index=False)['budget'].mean()

In [None]:
# plot_categorical_vs_target expects column names (it indexes the dataframe
# itself and uses the names in the plot title), not Series objects
x = 'certification'
y = 'budget'
plot_categorical_vs_target(df_budget, x, y);