In [2]:
# Import necessary libraries
import pandas as pd
import sqlite3

# Load the data from the Excel file into a Pandas DataFrame
file_name = "Copy Original Sample - Superstore.xls"  # Adjust the extension if it's .xlsx
df = pd.read_excel(file_name)

# Connect to an in-memory SQLite database
conn = sqlite3.connect(':memory:')

# Write the data to an SQLite table named "superstore"
df.to_sql('superstore', conn, index=False, if_exists='replace')

# Now you can run SQL queries. For example, to fetch the first 5 rows:
result = pd.read_sql("SELECT * FROM superstore LIMIT 5", conn)

# Display the result
print(result)


   Row ID        Order ID           Order Date            Ship Date  \
0       1  CA-2020-152156  2020-11-08 00:00:00  2020-11-11 00:00:00   
1       2  CA-2020-152156  2020-11-08 00:00:00  2020-11-11 00:00:00   
2       3  CA-2020-138688  2020-06-12 00:00:00  2020-06-16 00:00:00   
3       4  US-2019-108966  2019-10-11 00:00:00  2019-10-18 00:00:00   
4       5  US-2019-108966  2019-10-11 00:00:00  2019-10-18 00:00:00   

        Ship Mode Customer ID    Customer Name    Segment Country/Region  \
0    Second Class    CG-12520      Claire Gute   Consumer  United States   
1    Second Class    CG-12520      Claire Gute   Consumer  United States   
2    Second Class    DV-13045  Darrin Van Huff  Corporate  United States   
3  Standard Class    SO-20335   Sean O'Donnell   Consumer  United States   
4  Standard Class    SO-20335   Sean O'Donnell   Consumer  United States   

              City  ... Postal Code  Region       Product ID         Category  \
0        Henderson  ...     42420.0

  sql.to_sql(


In [3]:
# Step 1: Create a reference dictionary of cities and their postal codes
postal_ref = df.dropna(subset=['Postal Code']).groupby('City')['Postal Code'].first().to_dict()

# Step 2: Fill missing postal codes using the reference dictionary
def fill_missing_postal(row):
    if pd.isnull(row['Postal Code']) and row['City'] in postal_ref:
        return postal_ref[row['City']]
    return row['Postal Code']

df['Postal Code'] = df.apply(fill_missing_postal, axis=1)

# Update the SQLite table with the filled data
df.to_sql('superstore', conn, index=False, if_exists='replace')

# Optional: Check if there are still missing postal codes after the filling process
remaining_missing_postal = df['Postal Code'].isnull().sum()
print(f"Remaining missing postal codes: {remaining_missing_postal}")


Remaining missing postal codes: 0


  sql.to_sql(


In [4]:
# Display columns and their data types, non-null counts, and other information
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Row ID          9994 non-null   int64         
 1   Order ID        9994 non-null   object        
 2   Order Date      9994 non-null   datetime64[ns]
 3   Ship Date       9994 non-null   datetime64[ns]
 4   Ship Mode       9994 non-null   object        
 5   Customer ID     9994 non-null   object        
 6   Customer Name   9994 non-null   object        
 7   Segment         9994 non-null   object        
 8   Country/Region  9994 non-null   object        
 9   City            9994 non-null   object        
 10  State           9994 non-null   object        
 11  Postal Code     9994 non-null   float64       
 12  Region          9994 non-null   object        
 13  Product ID      9994 non-null   object        
 14  Category        9994 non-null   object        
 15  Sub-

In [5]:
# Load the "Orders" and "Returns" spreadsheets into separate DataFrames
df_orders = pd.read_excel(file_name, sheet_name="Orders")
df_returns = pd.read_excel(file_name, sheet_name="Returns")

# Merge the two DataFrames based on the "Order ID" column
merged_df = pd.merge(df_orders, df_returns, on="Order ID", how="left")

# Create a new column "Returned" which will indicate if an order was returned or not
merged_df['Returned'] = merged_df['Returned'].notna()

# If needed, you can save the merged dataframe back to an Excel file
# merged_df.to_excel("Merged_Superstore.xls", index=False)

# Display the first few rows of the merged dataframe
print(merged_df.head())


   Row ID        Order ID Order Date  Ship Date       Ship Mode Customer ID  \
0       1  CA-2020-152156 2020-11-08 2020-11-11    Second Class    CG-12520   
1       2  CA-2020-152156 2020-11-08 2020-11-11    Second Class    CG-12520   
2       3  CA-2020-138688 2020-06-12 2020-06-16    Second Class    DV-13045   
3       4  US-2019-108966 2019-10-11 2019-10-18  Standard Class    SO-20335   
4       5  US-2019-108966 2019-10-11 2019-10-18  Standard Class    SO-20335   

     Customer Name    Segment Country/Region             City  ... Region  \
0      Claire Gute   Consumer  United States        Henderson  ...  South   
1      Claire Gute   Consumer  United States        Henderson  ...  South   
2  Darrin Van Huff  Corporate  United States      Los Angeles  ...   West   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...  South   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...  South   

        Product ID         Category Sub-Category  \
0  FUR-BO-

In [6]:
# Export the merged dataframe to a CSV file
file_name_export = "Merged_Superstore.csv"
merged_df.to_csv(file_name_export, index=False)

print(f"Data exported to {file_name_export}")


Data exported to Merged_Superstore.csv
