<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Cleaning-groups-of-data-from-csv-files-date-columns-and-formats" data-toc-modified-id="Cleaning-groups-of-data-from-csv-files-date-columns-and-formats-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Cleaning groups of data from csv files date columns and formats</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#fixing-dates-and-addressing-money-and-percent-columns-for-the-summary-files." data-toc-modified-id="fixing-dates-and-addressing-money-and-percent-columns-for-the-summary-files.-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>fixing dates and addressing money and percent columns for the summary files.</a></span></li></ul></li></ul></li><li><span><a href="#Processing-to-make-comparing-to-other-data-easier" data-toc-modified-id="Processing-to-make-comparing-to-other-data-easier-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Processing to make comparing to other data easier</a></span></li></ul></div>

In [1]:
import import_ipynb
from common_imports import *
from capstone_functions import *

importing Jupyter notebook from common_imports.ipynb
importing Jupyter notebook from capstone_functions.ipynb


## Cleaning groups of data from csv files date columns and formats

#### fixing dates and addressing money and percent columns for the summary files.

In [2]:
def clean_split_date(df, date_column, filename):
    # Split date column into separate columns
    df[['Start_Date', 'End_Date_ft']] = df['date'].str.split('-', expand=True)
    
    # Extract year from filename
    year = str(filename[-8:-4])

    # Clean start date column
    df[['Start_Month', 'Start_Day']] = df['Start_Date'].str.split(' ', expand=True)
    df['Start_Year'] = year
    
    # Clean end date column
    # if month is present in End_Date
    df[['End_Month', 'End_Day']] = df['End_Date_ft'].str.split(' ', expand=True)
    
    # If End_Day is null, then set End_Month == Start_Month and End_Day == End_Date_ft
    if df['End_Day'].isnull().all():
        df['End_Month'] = df['Start_Month']
        df['End_Day'] = df['End_Date_ft']
        df['End_Year'] = year
    else:
        # If End_Day is not null, set End_Day to the corresponding values in End_Date_ft where it is null
        null_mask = df['End_Day'].isnull()
        df.loc[null_mask, 'End_Day'] = df.loc[null_mask, 'End_Date_ft']
        # Set End_Month to Start_Month for the corresponding rows where End_Day was null
        df.loc[null_mask, 'End_Month'] = df.loc[null_mask, 'Start_Month']
        df['End_Year'] = year
    
    # Check if end year should be incremented by 1
    if ((df['Start_Month'] == 'Dec') & (df['End_Month'] == 'Jan')).any():
        # Only update rows where Start_Month is Dec and End_Month is Jan
        mask = (df['Start_Month'] == 'Dec') & (df['End_Month'] == 'Jan')
        df.loc[mask, 'End_Year'] = int(year) + 1
          
    df['End_Year'] = df['End_Year'].astype(str) 
    
    #print('date columns:',df[['End_Month', 'End_Day', 'End_Year', 'Start_Year']].head())
    #print('dec and jan:',((df['Start_Month'] == 'Dec') & (df['End_Month'] == 'Jan')).any())
    
    # Convert date columns to datetime format
    df['Start_Date'] = pd.to_datetime(df['Start_Month'] + ' ' + df['Start_Day'] + ' ' + df['Start_Year'])
    df['End_Date'] = pd.to_datetime(df['End_Month'] + ' ' + df['End_Day'] + ' ' + df['End_Year'], errors='coerce')

    # Drop intermediate columns
    df.drop(columns=['End_Date_ft', 'date'], inplace=True)
    
    return df



clean_split_date is specifially designed to go through all of the summary files. 

In [3]:
# set path to folder containing CSV files
folder_path = '/Users/toniwork/Desktop/AUBEC - 3 Projects/to_clean_summary'
output_folder_path = '/Users/toniwork/Desktop/Capstone/summary_step_1'

# Create the output folder if it does not exist
create_folder(output_folder_path)

# loop over all files in folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # read in CSV file
        df = pd.read_csv(os.path.join(folder_path, filename))
        
        # perform cleaning and wrangling operations on the dataframe
        if 'ranking_summary' in filename:
            #print('Converting columns to numeric values and removing dollar sign and comma...')
            # convert columns to numeric values and remove dollar sign and comma
            df[['worldwide', 'domestic', 'foreign']] = df[['worldwide', 'domestic', 'foreign']].apply(
                lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
            df[['domestic_pct', 'foreign_pct']] = df[['domestic_pct', 'foreign_pct']].apply(
                lambda x: pd.to_numeric(x.astype(str).replace('%', '', regex=True), errors='coerce')).round(4) / 100.0
            
            # extract year from filename and create Year column
            year = int(filename[-8:-4])
            df['Year'] = year

        elif 'weekend_summary' in filename:
            # update top_release column 
            df['top_release_title'] = df['top_release']
            
            #print('Converting columns to numeric values and removing dollar sign and comma...')
            # convert columns to numeric values and remove dollar sign and comma
            df[['top10_gross', 'overall_gross']] = df[['top10_gross', 'overall_gross']].apply(
                lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
            df[['top10_wow_change', 'overall_wow_change']] = df[['top10_wow_change', 'overall_wow_change']].apply(
                lambda x: pd.to_numeric(x.astype(str).replace('%', '', regex=True), errors='coerce')).round(4) / 100.0
    
            #print('Splitting date column and cleaning date values...')
            # clean and split date column
            df = clean_split_date(df, 'date', filename)
            
        # save cleaned dataframe to new file
        new_filename = filename.split('.')[0] + '_date.csv'
        df.to_csv(os.path.join(output_folder_path, new_filename), index=False)


  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_nu

  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)
  lambda x: pd.to_numeric(x.str.replace('$', '').str.replace(',', ''), errors='coerce')).round(2)


In [4]:
folder_path = '/Users/toniwork/Desktop/Capstone/summary_step_1'
output_folder_path = '/Users/toniwork/Desktop/Capstone/summary_step_2'

# Create the output folder if it does not exist
create_folder(output_folder_path)

ranking_data = pd.DataFrame()  # Empty DataFrame to store ranking data
weekend_data = pd.DataFrame()  # Empty DataFrame to store weekend data

# Loop over all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        
        if 'ranking' in filename.lower():
            ranking_data = ranking_data.append(df)  # Append data to ranking_data DataFrame
        elif 'weekend' in filename.lower():
            weekend_data = weekend_data.append(df)  # Append data to weekend_data DataFrame

# Save the combined data to new files
ranking_data.to_csv(os.path.join(output_folder_path, 'combined_ranking_data.csv'), index=False)
weekend_data.to_csv(os.path.join(output_folder_path, 'combined_weekend_data.csv'), index=False)


  ranking_data = ranking_data.append(df)  # Append data to ranking_data DataFrame
  ranking_data = ranking_data.append(df)  # Append data to ranking_data DataFrame
  weekend_data = weekend_data.append(df)  # Append data to weekend_data DataFrame
  weekend_data = weekend_data.append(df)  # Append data to weekend_data DataFrame
  ranking_data = ranking_data.append(df)  # Append data to ranking_data DataFrame
  ranking_data = ranking_data.append(df)  # Append data to ranking_data DataFrame
  weekend_data = weekend_data.append(df)  # Append data to weekend_data DataFrame
  weekend_data = weekend_data.append(df)  # Append data to weekend_data DataFrame
  ranking_data = ranking_data.append(df)  # Append data to ranking_data DataFrame
  ranking_data = ranking_data.append(df)  # Append data to ranking_data DataFrame
  ranking_data = ranking_data.append(df)  # Append data to ranking_data DataFrame
  ranking_data = ranking_data.append(df)  # Append data to ranking_data DataFrame
  ranking_data =

In [5]:
# Drop columns 
def drop_columns(folder_path, file_name, columns):
    file_path = os.path.join(folder_path, file_name)
    if not os.path.exists(file_path):
        print(f"{file_path} not found, skipping")
        return
    df = pd.read_csv(file_path)
    for col in columns:
        if col in df.columns:
            df.drop(col, axis=1, inplace=True)
        else:
            print(f"Column {col} not found in {file_name}, skipping")
    df.to_csv(file_path, index=False)
    print(f"{file_name} updated successfully!")

In [6]:
folder_path = '/Users/toniwork/Desktop/Capstone/summary_step_2'

drop_columns(folder_path, 'combined_weekend_data.csv', ['occasion','top_release_title'])

combined_weekend_data.csv updated successfully!


## Processing to make comparing to other data easier

In [7]:
# Intital clean_movie function to clean up for data consistency
def clean_movie_data(folder_path, output_folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            try:
                df = pd.read_csv(os.path.join(folder_path, filename))
            except FileNotFoundError:
                print(f"Error: {filename} not found.")
                continue

        # Call the cleaning functions
        df = convert_column_names(df)
        df = clean_title_columns(df)
        df = convert_string_columns_to_lowercase(df)
        
        # Save cleaned DataFrame
        new_filename = filename
        df.to_csv(os.path.join(output_folder_path, new_filename), index=False)
        print(f"{filename} cleaned and saved as {new_filename}.")

In [8]:
# Run clean_movie_data function
folder_path = '/Users/toniwork/Desktop/Capstone/summary_step_2'
output_folder_path = '/Users/toniwork/Desktop/Capstone/summary_step_2'  

clean_movie_data(folder_path, output_folder_path)



combined_ranking_data.csv cleaned and saved as combined_ranking_data.csv.
combined_weekend_data.csv cleaned and saved as combined_weekend_data.csv.


In [9]:
# Print files info from folder to verify
folder_path = '/Users/toniwork/Desktop/Capstone/summary_step_2'

# Loop through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        
        # Read in the CSV file as a DataFrame
        df = pd.read_csv(file_path)
        print(file_name,':',df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4200 entries, 0 to 4199
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   rank          4200 non-null   int64  
 1   title         4200 non-null   object 
 2   worldwide     4200 non-null   int64  
 3   domestic      3171 non-null   float64
 4   domestic_pct  3125 non-null   float64
 5   foreign       4177 non-null   float64
 6   foreign_pct   4171 non-null   float64
 7   year          4200 non-null   int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 262.6+ KB
combined_ranking_data.csv : None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1291 entries, 0 to 1290
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   top10_gross         1291 non-null   float64
 1   top10_wow_change    1094 non-null   float64
 2   overall_gross       1291 non-null   float64
 3   overall_wo

In [10]:
# Rename rating column to mpaa_rating in metacritic-reviews.csv
df = pd.read_csv('/Users/toniwork/Desktop/Capstone/summary_step_2/combined_weekend_data.csv')

# Define a dictionary to map old column names to new column names
column_mapping = {
    'top_release': 'title'
}

# Rename the columns using the dictionary
df.rename(columns=column_mapping, inplace=True)

# Save the updated DataFrame to a new file
df.to_csv('/Users/toniwork/Desktop/Capstone/summary_step_2/combined_weekend_data.csv', index=False)

In [11]:
# Loop through file in folder to add the new title_id from the title_key.csv using add_title_ids_main function
input_folder = '/Users/toniwork/Desktop/Capstone/summary_step_2'
title_key_path = '/Users/toniwork/Desktop/Capstone/'
output_folder = '/Users/toniwork/Desktop/Capstone/summary_title_id'
split_folder = '/Users/toniwork/Desktop/Capstone/split_1'

# Create the output folder if it does not exist
create_folder(output_folder)
    
# Create the split folder if it does not exist
create_folder(split_folder)

add_title_ids_main(input_folder, title_key_path, split_folder, output_folder)

combined_ranking_data.csv cleaned and 'title_id' added. On /Users/toniwork/Desktop/Capstone/summary_title_id/combined_ranking_data.csv and /Users/toniwork/Desktop/Capstone/summary_step_2/combined_ranking_data.csv
combined_weekend_data.csv cleaned and 'title_id' added. On /Users/toniwork/Desktop/Capstone/summary_title_id/combined_weekend_data.csv and /Users/toniwork/Desktop/Capstone/summary_step_2/combined_weekend_data.csv


In [16]:
# Print files info from folder to verify
folder_path = '/Users/toniwork/Desktop/Capstone/summary_title_id'

# Loop through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        
        # Read in the CSV file as a DataFrame
        df = pd.read_csv(file_path)
        print(file_name,':',df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2901 entries, 0 to 2900
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   rank           2901 non-null   int64  
 1   title_x        2901 non-null   object 
 2   worldwide      2901 non-null   int64  
 3   domestic       2729 non-null   float64
 4   domestic_pct   2720 non-null   float64
 5   foreign        2879 non-null   float64
 6   foreign_pct    2873 non-null   float64
 7   year           2901 non-null   int64  
 8   title_id       2901 non-null   int64  
 9   title_y        2901 non-null   object 
 10  release_date   2901 non-null   object 
 11  release_year   2901 non-null   int64  
 12  release_month  2901 non-null   int64  
 13  release_day    2901 non-null   int64  
dtypes: float64(4), int64(7), object(3)
memory usage: 317.4+ KB
combined_ranking_data.csv : None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 895 entries, 0 to 894
Data columns (total 

In [13]:
# Compare title_key_updated to the files in folder summary_title_id dropping any rows that do not match on title_id 
title_id_key_df = pd.read_csv('/Users/toniwork/Desktop/Capstone/title_key_updated.csv')
path = '/Users/toniwork/Desktop/Capstone/summary_title_id'

# Iterate through all the CSV files in the specified path
for filename in os.listdir(path):
    if filename.endswith('.csv'):
        file_path = os.path.join(path, filename)
        df = pd.read_csv(file_path)

        # Merge with title_id_key_df using an inner join on the title_id column
        # This will keep only the rows with matching title_ids
        updated_df = pd.merge(df, title_id_key_df, on='title_id', how='inner')

        # Save the updated DataFrame back to the CSV file
        updated_df.to_csv(file_path, index=False)
        print(f"{filename} updated successfully!")

combined_ranking_data.csv updated successfully!
combined_weekend_data.csv updated successfully!


In [15]:
# Remove duplicates and get count for all updated files in step_2 folder
folder_path = "/Users/toniwork/Desktop/Capstone/summary_title_id"
file_pattern = ".csv" 
drop_duplicate_rows_and_report(folder_path, file_pattern)


File: combined_ranking_data.csv, Title_id duplicates: 64
File: combined_weekend_data.csv, Title_id duplicates: 578


In [17]:
# Drop columns using drop_columns_by_keyword_or_name function
folder_path = '/Users/toniwork/Desktop/Capstone/summary_title_id'

drop_columns_by_keyword_or_name(folder_path, column_names=['title_y', 'title_x', 'release_date', 'release_year', 'release_month', 'release_day'], file_name='combined_ranking_data.csv')
drop_columns_by_keyword_or_name(folder_path, column_names=['title_y', 'title_x', 'release_date', 'release_year', 'release_month', 'release_day'], file_name='combined_weekend_data.csv')


combined_ranking_data.csv updated successfully!
combined_weekend_data.csv updated successfully!


In [18]:
# Print files info from folder to verify
folder_path = '/Users/toniwork/Desktop/Capstone/summary_title_id'

# Loop through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        
        # Read in the CSV file as a DataFrame
        df = pd.read_csv(file_path)
        print(file_name,':',df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2901 entries, 0 to 2900
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   rank          2901 non-null   int64  
 1   worldwide     2901 non-null   int64  
 2   domestic      2729 non-null   float64
 3   domestic_pct  2720 non-null   float64
 4   foreign       2879 non-null   float64
 5   foreign_pct   2873 non-null   float64
 6   year          2901 non-null   int64  
 7   title_id      2901 non-null   int64  
dtypes: float64(4), int64(4)
memory usage: 181.4 KB
combined_ranking_data.csv : None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 895 entries, 0 to 894
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   top10_gross         895 non-null    float64
 1   top10_wow_change    762 non-null    float64
 2   overall_gross       895 non-null    float64
 3   overall_wow_change  763 