### Inventory Preprocessing

### Sessions Preprocessing

In [4]:
import pandas as pd

# Load the movie sessions data.
# The workbook is read exactly once, with the sheet and engine pinned explicitly
# (the earlier version read the same file twice and discarded the first result).
sessions_path = "Movie_sessions.xlsx"
sessions_df = pd.read_excel(
    sessions_path,
    sheet_name="Sheet1",
    engine="openpyxl"
)

# Remove rows that must not reach the model:
#   * "Overall Result" in the language column (presumably a report total row - TODO confirm)
#   * the "960 MIN" duration value (presumably a placeholder/outlier - TODO confirm)
# .copy() detaches the filtered frame from the raw one so the column assignments
# below operate on an independent frame and cannot raise SettingWithCopyWarning.
keep_mask = (
    (sessions_df["Session Audio Language"] != "Overall Result")
    & (sessions_df["Duration"] != "960 MIN")
)
sessions_df = sessions_df[keep_mask].copy()

# Step 1: Convert 'Duration' to numeric minutes by stripping the literal ' MIN' suffix.
# regex=False makes the literal match explicit regardless of the pandas version's default.
sessions_df["Duration"] = (
    sessions_df["Duration"]
    .str.replace(" MIN", "", regex=False)
    .astype(int)
)

# Create a single 'Duration Category' column based on defined ranges.
# right=False makes the bins left-closed: Short = [0, 90), Medium = [90, 120), Long = [120, inf).
sessions_df["Duration Category"] = pd.cut(
    sessions_df["Duration"],
    bins=[0, 90, 120, float('inf')],
    labels=["Short", "Medium", "Long"],
    right=False
)


# Create a function to generate all session hours based on the starting hour and duration
def get_session_hours(row):
    """Return the list of clock hours a session touches.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Session Hour' (anything ``int()`` accepts) and
        'Duration' (a whole number of minutes).

    Returns
    -------
    list of int
        Every hour from the start hour up to and including
        ``start + Duration // 60``. Leftover minutes are ignored and hours are
        not wrapped at midnight, so values above 23 are possible.
    """
    first_hour = int(row['Session Hour'])
    # Only complete hours of the running time extend the span.
    whole_hours = row['Duration'] // 60
    last_hour = first_hour + whole_hours
    return [hour for hour in range(first_hour, last_hour + 1)]

# Attach to every session the list of clock hours it spans.
sessions_df['Session Hours'] = sessions_df.apply(get_session_hours, axis=1)

# Drop the 'Duration' column since we have the 'Session Hours' column now
# ('Duration Category' was derived from it earlier and is encoded below).
sessions_df = sessions_df.drop(columns=["Duration"])

# One-hot encode the categorical columns and store the indicators as 0/1 integers.
categorical_cols = ["Session Audio Language", "Genre", "Censor Rating", "Duration Category"]
dummies = pd.get_dummies(
    sessions_df[categorical_cols],
    prefix=["Lang", "Genre", "Rating", "Duration"]
).astype(int)

# Replace the original categorical columns with their one-hot encoded counterparts
# (remaining original columns first, indicator columns appended after them).
sessions_df = pd.concat([sessions_df.drop(columns=categorical_cols), dummies], axis=1)

# Export the cleaned, encoded sessions. The file used to be written twice in this
# cell (once before and once after encoding) to the same path, so the first write
# was always overwritten; it is now written once, in its final form.
sessions_df.to_csv("Cleaned_Movie_Sessions_with_Session_Hours.csv", index=False)

print("✅ Exported the dataset to 'Cleaned_Movie_Sessions_with_Session_Hours.csv'")


# Step 1: Create a function to generate the exploded dataframe based on session hours
def explode_session_hours(df):
    """Expand each session into one row per hour listed in 'Session Hours'.

    For every input row, one output row is produced per entry of its
    'Session Hours' list, with 'Session Hour' set to that entry:

    * the first entry keeps all of the row's values (admits and every
      other column);
    * each later entry has 'Total Admits' and every column other than
      'Session Date', 'Session Hour', 'Total Admits' and 'Session Hours'
      set to 0, so the session is counted only in its starting hour.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Session Date', 'Session Hour', 'Total Admits' and
        'Session Hours' (a list of hours per row).

    Returns
    -------
    pandas.DataFrame
        The expanded rows with a fresh 0..n-1 index and the same columns, in
        the same order, as ``df``. If nothing is produced (empty input or only
        empty hour lists) an empty frame with ``df``'s columns is returned, so
        later column-based steps keep working.

    Raises
    ------
    KeyError
        If one of the four required columns is missing.
    """
    passthrough = ['Session Date', 'Session Hour', 'Total Admits', 'Session Hours']
    absent = [col for col in passthrough if col not in df.columns]
    if absent:
        raise KeyError(f"{absent} not found in axis")

    # Loop-invariant: the columns blanked out for every hour after the first.
    zeroed_cols = [col for col in df.columns if col not in passthrough]
    zero_patch = dict.fromkeys(zeroed_cols, 0)
    zero_patch['Total Admits'] = 0

    # Collect plain dicts and build the frame once at the end; this lets pandas
    # infer a proper dtype per column instead of producing all-object columns.
    records = []
    for _, row in df.iterrows():
        base = row.to_dict()
        for position, hour in enumerate(row['Session Hours']):
            record = dict(base)
            record['Session Hour'] = hour
            if position > 0:
                # Only the starting hour carries the session's admits and features.
                record.update(zero_patch)
            records.append(record)

    if not records:
        # Keep the schema even when there is nothing to expand.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.DataFrame(records, columns=df.columns)

# Expand every session into one row per hour it spans.
sessions_exploded_df = explode_session_hours(sessions_df)

# Keep a copy of the hour-level rows on disk before any columns are removed.
sessions_exploded_df.to_csv("Exploded_Sessions_with_Session_Hours.csv", index=False)
print("✅ Exported the exploded dataset to 'Exploded_Sessions_with_Session_Hours.csv'")

# 'Session Hours' and 'Film' are removed so that every remaining non-key
# column can be summed in the aggregation below.
sessions_exploded_df = sessions_exploded_df.drop(columns=["Session Hours", "Film"])

# Collapse to one row per (date, hour), summing every other column.
group_keys = ['Session Date', 'Session Hour']
sum_spec = {}
for column in sessions_exploded_df.columns:
    if column not in group_keys:
        sum_spec[column] = 'sum'

aggregated_sessions_df = (
    sessions_exploded_df
    .groupby(group_keys)
    .agg(sum_spec)
    .reset_index()
)

# Persist the per-hour totals for the next stage.
aggregated_sessions_df.to_csv("Aggregated_Sessions1.csv", index=False)
print("✅ Exported the aggregated dataset to 'Aggregated_Sessions1.csv'")
######################################


✅ Exported the dataset to 'Cleaned_Movie_Sessions_with_Session_Hours.csv'
✅ Exported the exploded dataset to 'Exploded_Sessions_with_Session_Hours.csv'
✅ Exported the aggregated dataset to 'Aggregated_Sessions1.csv'


### Merging code

In [6]:
# Parse the dd.mm.yyyy 'Session Date' strings into datetimes. Because of
# errors='coerce', any value that does not match the format becomes NaT
# instead of raising.
session_dates = aggregated_sessions_df['Session Date']
aggregated_sessions_df['Session Date'] = pd.to_datetime(
    session_dates, format='%d.%m.%Y', errors='coerce'
)

# No join is performed in this cell: merged_data is simply a second name for
# the aggregated frame (same object, not a copy).
merged_data = aggregated_sessions_df

# Write the frame out under the file name the later cells read from.
merged_data.to_csv(
    "Merged_Aggregated_Sessions_with_Revenue_Left_Join.csv",
    index=False,
)
print("✅ Exported the merged dataset to 'Merged_Aggregated_Sessions_with_Revenue_Left_Join.csv'")


✅ Exported the merged dataset to 'Merged_Aggregated_Sessions_with_Revenue_Left_Join.csv'


### Sales forecasting Preprocessing

In [8]:
import pandas as pd

# Load the aggregated per-hour sessions.
df = pd.read_csv("Merged_Aggregated_Sessions_with_Revenue_Left_Join.csv")

# 🕒 Convert 'Session Date' to datetime; values that cannot be parsed become NaT.
df['Session Date'] = pd.to_datetime(df['Session Date'], errors='coerce')

# 🚨 Report any rows whose date failed to convert (NaT values).
conversion_issues = df[df['Session Date'].isnull()]
if not conversion_issues.empty:
    print("\n⚠️ Conversion issues found:")
    print(conversion_issues)
else:
    print("\n✅ All dates converted successfully.")

# 📆 Extract day of week and month from 'Session Date'
df['DayOfWeek'] = df['Session Date'].dt.dayofweek  # Monday=0, Sunday=6
df['Month'] = df['Session Date'].dt.month

# 💾 Export for the inventory forecasting part.
df.to_csv("my_validation_data_inventory.csv", index=False)
print("✅ Data exported to 'my_validation_data_inventory.csv'")

# NOTE: 'Session Date' is deliberately kept in both exports. If it ever has to be
# dropped for the sales model, do it only after the inventory export above:
# df = df.drop(columns=['Session Date'])

# 💾 Export for the sales forecasting part.
df.to_csv("my_validation_data.csv", index=False)
# The message now names the file that is actually written (it used to say
# 'forecasting_data.csv', which this cell never creates).
print("✅ Data exported to 'my_validation_data.csv'")
#################################################################




✅ All dates converted successfully.
✅ Data exported to 'my_validation_data_inventory.csv'
✅ Data exported to 'forecasting_data.csv'


### Validation with new data

In [12]:
import joblib
import pandas as pd
import numpy as np
from IPython.display import display

# 1) Load the new sessions DataFrame.
#    NOTE: step 9 writes the predictions back into this same file, so on a re-run
#    'Total Session Revenue' is already present; it is then reported as an extra
#    column and left out of the feature matrix.
df_val = pd.read_csv("my_validation_data.csv")

# 2) Load the pickled CatBoost model and the training feature list.
#    NOTE(security): joblib.load unpickles arbitrary objects - only load files
#    from a trusted source.
model          = joblib.load("best_catboost_model.pkl")
expected_feats = joblib.load("feature_list.pkl")

# 3) Report how the incoming columns differ from the training features
missing_feats = [f for f in expected_feats if f not in df_val.columns]
extra_feats   = [f for f in df_val.columns if f not in expected_feats]

print(f"🔍 Missing features to ADD ({len(missing_feats)}):\n{missing_feats}")
print(f"🔍 Extra features to DROP ({len(extra_feats)}):\n{extra_feats}")

# 4) Build the feature matrix in one step: reindex keeps only the expected
#    columns, puts them in training order, and creates any missing one filled
#    with 0. df_val itself is left untouched.
X_val = df_val.reindex(columns=list(expected_feats), fill_value=0)
print(f"\n🔢 Aligned feature matrix shape: {X_val.shape}")

# 5) (Optional) peek at the zero‐filled columns
if missing_feats:
    print("\n— Newly added (zero‐filled) columns —")
    display(X_val[missing_feats].head())

# 6) Predict Total Revenue
y_pred = model.predict(X_val)

# 7) Inject predictions back into the original frame as whole numbers
df_val["Total Session Revenue"] = y_pred.round().astype(int)

# 8) Show the first few rows
print("\n— Sample predictions —")
display(df_val.head())

# 9) Save the sessions together with their predicted revenue (overwrites the input file)
df_val.to_csv("my_validation_data.csv", index=False)
print("✅ Saved predictions to my_validation_data.csv")


🔍 Missing features to ADD (19):
['Lang_Assamese', 'Lang_Bengali', 'Lang_Chinese (Cantonese)', 'Lang_Filipino', 'Lang_Gujarati', 'Lang_Indonesian', 'Lang_Japanese', 'Lang_Maori', 'Lang_Not assigned', 'Lang_Thai', 'Lang_Urdu', 'Genre_FAMILY', 'Genre_GAMING', 'Genre_MUSIC', 'Genre_MUSICAL', 'Genre_MYSTERY', 'Genre_SCI-FI', 'Genre_TO BE ADVISED', 'Rating_CTC']
🔍 Extra features to DROP (4):
['Session Date', 'Lang_Spanish', 'Lang_Swedish', 'Total Session Revenue']

🔢 Aligned feature matrix shape: (361, 56)

— Newly added (zero‐filled) columns —


Unnamed: 0,Lang_Assamese,Lang_Bengali,Lang_Chinese (Cantonese),Lang_Filipino,Lang_Gujarati,Lang_Indonesian,Lang_Japanese,Lang_Maori,Lang_Not assigned,Lang_Thai,Lang_Urdu,Genre_FAMILY,Genre_GAMING,Genre_MUSIC,Genre_MUSICAL,Genre_MYSTERY,Genre_SCI-FI,Genre_TO BE ADVISED,Rating_CTC
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0



— Sample predictions —


Unnamed: 0,Session Date,Session Hour,Total Admits,Lang_Chinese (Mandarin),Lang_English,Lang_Hindi,Lang_Kannada,Lang_Korean,Lang_Malayalam,Lang_Nepali,...,Rating_M,Rating_MA15,Rating_PG,Rating_R18+,Duration_Short,Duration_Medium,Duration_Long,DayOfWeek,Month,Total Session Revenue
0,2025-02-01,9,7,0,2,0,0,0,0,0,...,0,0,0,0,2,0,0,5,2,22
1,2025-02-01,10,9,0,2,0,0,0,0,0,...,0,2,1,0,0,2,1,5,2,119
2,2025-02-01,11,23,0,3,0,0,0,0,0,...,1,0,3,0,0,3,1,5,2,70
3,2025-02-01,12,27,0,5,1,0,0,0,0,...,1,1,3,0,1,4,1,5,2,168
4,2025-02-01,13,68,0,4,0,0,0,1,0,...,2,2,2,0,0,4,2,5,2,270


✅ Saved predictions to my_validation_data.csv


### Inventory Main preprocessing

In [14]:
import pandas as pd

# Read the validation data back in. No second dataset is joined in this cell,
# so merged_df and forecasting_df are two names for the same frame.
merged_df = forecasting_df = pd.read_csv("my_validation_data.csv")

# Hand the frame to the item-sales step under its expected file name.
merged_df.to_csv(
    "basketanalysis.csv",
    index=False,
)
########################################################################

In [16]:
# Inspect the column labels of merged_df (the frame saved as basketanalysis.csv).
merged_df.columns

Index(['Session Date', 'Session Hour', 'Total Admits',
       'Lang_Chinese (Mandarin)', 'Lang_English', 'Lang_Hindi', 'Lang_Kannada',
       'Lang_Korean', 'Lang_Malayalam', 'Lang_Nepali', 'Lang_No Subtitles',
       'Lang_Punjabi', 'Lang_Spanish', 'Lang_Swedish', 'Lang_Tamil',
       'Lang_Telugu', 'Lang_Vietnamese', 'Genre_ACTION', 'Genre_ADVENTURE',
       'Genre_ANIMATION', 'Genre_BIOGRAPHY', 'Genre_COMEDY', 'Genre_CRIME',
       'Genre_DOCUMENTARY', 'Genre_DRAMA', 'Genre_FANTASY', 'Genre_HORROR',
       'Genre_ROMANCE', 'Genre_THRILLER', 'Rating_E', 'Rating_G', 'Rating_M',
       'Rating_MA15', 'Rating_PG', 'Rating_R18+', 'Duration_Short',
       'Duration_Medium', 'Duration_Long', 'DayOfWeek', 'Month',
       'Total Session Revenue'],
      dtype='object')

In [18]:
import pandas as pd
import numpy as np
import joblib

# 1) Load the sessions data (no item‐class columns yet)
df_val = pd.read_csv("basketanalysis.csv")

# 2) Load the trained multi‐item model and its column lists.
#    NOTE(security): joblib.load unpickles arbitrary objects - only load files
#    from a trusted source.
model        = joblib.load("best_catboost_model_run3.pkl")
feature_cols = joblib.load("feature_cols_run3.pkl")
target_cols  = joblib.load("target_cols_run3.pkl")

# 3) Align features: training features absent from the data are added as
#    zero-filled columns (they stay in df_val and therefore in the saved file).
missing_feats = [f for f in feature_cols if f not in df_val.columns]
if missing_feats:
    print(f"🔍 Adding {len(missing_feats)} missing features: {missing_feats}")
    for f in missing_feats:
        df_val[f] = 0

# Columns the model does not know are NOT removed from df_val; they are only
# left out of the feature matrix below. The message says so (it used to claim
# they were being dropped although the drop was disabled).
extra_feats = [c for c in df_val.columns if c not in feature_cols]
if extra_feats:
    print(f"ℹ️ Keeping {len(extra_feats)} non-feature columns in the output (not passed to the model): {extra_feats}")

# 4) Build the feature matrix in the exact training order
X_val = df_val[feature_cols].copy()
print(f"🔢 Aligned feature matrix shape: {X_val.shape}")

# 5) Predict each item‐class count, labelled by target column and aligned to df_val's index
y_pred = model.predict(X_val)
y_pred_df = pd.DataFrame(y_pred, columns=target_cols, index=df_val.index)

# 6) Round to integers
y_pred_df = y_pred_df.round().astype(int)

# 7) Inject predictions back into the original DataFrame
for col in target_cols:
    df_val[col] = y_pred_df[col]

# 8) (Optional) Inspect a few rows: first 3 feature columns + all targets.
#    list(...) keeps the concatenation valid even if the loaded column
#    collections are not plain lists.
print("\n— Sample sessions with predicted item‐class counts —")
print(df_val[list(feature_cols[:3]) + list(target_cols)].head())

# 9) Save out full predictions
df_val.to_csv("predicted_item_sales.csv", index=False)
print(f"✅ Saved predictions for {len(df_val)} sessions to 'predicted_item_sales.csv'")


🔍 Adding 19 missing features: ['Lang_Assamese', 'Lang_Bengali', 'Lang_Chinese (Cantonese)', 'Lang_Filipino', 'Lang_Gujarati', 'Lang_Indonesian', 'Lang_Japanese', 'Lang_Maori', 'Lang_Not assigned', 'Lang_Thai', 'Lang_Urdu', 'Genre_FAMILY', 'Genre_GAMING', 'Genre_MUSIC', 'Genre_MUSICAL', 'Genre_MYSTERY', 'Genre_SCI-FI', 'Genre_TO BE ADVISED', 'Rating_CTC']
🗑️ Dropping 3 extra columns: ['Session Date', 'Lang_Spanish', 'Lang_Swedish']
🔢 Aligned feature matrix shape: (361, 57)

— Sample sessions with predicted item‐class counts —
   Session Hour  Total Admits  Lang_Assamese  SNACK - CHIPS  FOOD - VJUNIOR  \
0             9             7              0              1               1   
1            10             9              0              0               1   
2            11            23              0              0               1   
3            12            27              0              0               2   
4            13            68              0              1               

In [None]:
# Show the (rows, columns) dimensions of df_val.
df_val.shape