In [6]:
# Loading the packages I will need:

print("Loading packages...", end = '')

import numpy as np
from pyunpack import Archive 
import pandas as pd 
import missingno as msno
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error 

import zipfile
import shutil  # For handling directories


print("Successful.")

Loading packages...Successful.


In [7]:
# Paths
main_dir = "data"  # Main directory containing year folders
output_dir = "./temp_extracted_files"  # Temporary directory for extracted files

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# List to store dataframes
dataframes = []

# Iterate over year folders
for year in os.listdir(main_dir):
    year_path = os.path.join(main_dir, year)
    if os.path.isdir(year_path):  # Ensure it's a folder
        print(f"Processing year folder: {year}")
        
        # Process each .zip file within the year folder
        for month_zip in os.listdir(year_path):
            month_zip_path = os.path.join(year_path, month_zip)
            
            if month_zip.endswith(".zip"):
                print(f"Extracting: {month_zip}")
                # Extract the .zip file
                with zipfile.ZipFile(month_zip_path, 'r') as zip_ref:
                    zip_ref.extractall(output_dir)
                
                # Process the extracted files
                extracted_files = os.listdir(output_dir)
                print(f"Extracted files: {extracted_files}")
                
                for extracted_file in extracted_files:
                    extracted_file_path = os.path.join(output_dir, extracted_file)
                    
                    # Check for .csv and .xlsx files
                    if extracted_file.endswith(".csv"):
                        print(f"Reading CSV file: {extracted_file}")
                        df = pd.read_csv(extracted_file_path)
                        df['Year'] = year  # Add a column for the year
                        dataframes.append(df)
                    
                    elif extracted_file.endswith((".xlsx", ".xls")):
                        print(f"Reading Excel file: {extracted_file}")
                        df = pd.read_excel(extracted_file_path)
                        df['Year'] = year  # Add a column for the year
                        dataframes.append(df)
                
                # Clean up the temporary directory
                for temp_file in extracted_files:
                    temp_file_path = os.path.join(output_dir, temp_file)
                    if os.path.isfile(temp_file_path):
                        os.remove(temp_file_path)  # Remove files
                    elif os.path.isdir(temp_file_path):
                        shutil.rmtree(temp_file_path)  # Remove directories

# Combine dataframes
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
    combined_file = "combined_data.csv"
    combined_df.to_csv(combined_file, index=False)
    print(f"Combined data saved as {combined_file}")
else:
    print("No valid files found to combine.")




Processing year folder: 2020
Extracting: April2020TransBorderRawData.zip
Extracted files: ['Apr 2020']
Extracting: August2020TransBorderRawData.zip
Extracted files: ['August2020TransBorderRawData']
Extracting: February2020TransBorderRawData.zip
Extracted files: ['Feb 2020']
Extracting: January2020TransBorderRawData.zip
Extracted files: ['Jan 2020']
Extracting: July2020TransBorderRawData.zip
Extracted files: ['dot1_0720.csv', 'dot1_ytd_0720.csv', 'dot2_0720.csv', 'dot2_ytd_0720.csv', 'dot3_0720.csv', 'dot3_ytd_0720.csv', '__MACOSX']
Reading CSV file: dot1_0720.csv
Reading CSV file: dot1_ytd_0720.csv
Reading CSV file: dot2_0720.csv


  df = pd.read_csv(extracted_file_path)


Reading CSV file: dot2_ytd_0720.csv
Reading CSV file: dot3_0720.csv
Reading CSV file: dot3_ytd_0720.csv
Extracting: June2020TransBorderRawData.zip
Extracted files: ['dot1_0620.csv', 'dot1_ytd_0620.csv', 'dot2_0620.csv', 'dot2_ytd_0620.csv', 'dot3_0620.csv', 'dot3_ytd_0620.csv']
Reading CSV file: dot1_0620.csv
Reading CSV file: dot1_ytd_0620.csv
Reading CSV file: dot2_0620.csv
Reading CSV file: dot2_ytd_0620.csv


  df = pd.read_csv(extracted_file_path)


Reading CSV file: dot3_0620.csv
Reading CSV file: dot3_ytd_0620.csv
Extracting: March2020TransBorderRawData.zip
Extracted files: ['Mar 2020']
Extracting: May2020TransBorderRawData.zip
Extracted files: ['May 2020']
Extracting: September2020TransBorderRawData.zip
Extracted files: ['September2020TransBorderRawData']
Processing year folder: 2021
Extracting: April2021TransBorderRawData.zip
Extracted files: ['April2021TransBorderRawData']
Extracting: February2021TransBorderRawData.zip
Extracted files: ['February2021TransBorderRawData']
Extracting: January2021TransBorderRawData.zip
Extracted files: ['January2021TransBorderRawData']
Extracting: July-to-Dec-2021.zip
Extracted files: ['New folder']
Extracting: July2021TransBorderRawData.zip
Extracted files: ['July2021TransBorderRawData']
Extracting: June2021TransBorderRawData.zip
Extracted files: ['June2021TransBorderRawData']
Extracting: March2021TransBorderRawData.zip
Extracted files: ['March2021TransBorderRawData']
Extracting: May2021TransBor

  df = pd.read_csv(extracted_file_path)


Reading CSV file: dot3_1223.csv
Extracting: Feb2023.zip
Extracted files: ['Feb2023']
Extracting: Jan2023.zip
Extracted files: ['Jan2023']
Extracting: July2023.zip
Extracted files: ['July2023']
Extracting: June2023.zip
Extracted files: ['June2023']
Extracting: March2023.zip
Extracted files: ['March2023']
Extracting: May2023.zip
Extracted files: ['May2023']
Extracting: Nov2023.zip
Extracted files: ['Nov2023']
Extracting: Oct2023.zip
Extracted files: ['Oct2023']
Extracting: sept2023.zip
Extracted files: ['sept2023']
Processing year folder: 2024
Extracting: April2024.zip
Extracted files: ['April2024']
Extracting: August2024.zip
Extracted files: ['dot1_0824.csv', 'dot2_0824.csv', 'dot3_0824.csv']
Reading CSV file: dot1_0824.csv
Reading CSV file: dot2_0824.csv


  df = pd.read_csv(extracted_file_path)


Reading CSV file: dot3_0824.csv
Extracting: Feb2024.zip
Extracted files: ['Feb2024']
Extracting: Jan2024.zip
Extracted files: ['Jan2024']
Extracting: July2024.zip
Extracted files: ['July2024']
Extracting: June2024.zip
Extracted files: ['June2024']
Extracting: March2024.zip
Extracted files: ['March2024']
Extracting: May2024.zip
Extracted files: ['May2024']
Extracting: september2024.zip
Extracted files: ['dot1_0924.csv', 'dot2_0924.csv', 'dot3_0924.csv']
Reading CSV file: dot1_0924.csv
Reading CSV file: dot2_0924.csv


  df = pd.read_csv(extracted_file_path)


Reading CSV file: dot3_0924.csv
Combined data saved as combined_data.csv


In [8]:
# Read the CSV file from the root directory
combined_df = pd.read_csv("combined_data.csv")

# Display the first five rows to inspect
print(combined_df.head())


  combined_df = pd.read_csv("combined_data.csv")


   TRDTYPE USASTATE  DEPE  DISAGMOT MEXSTATE CANPROV  COUNTRY  VALUE  SHIPWT  \
0        1       AK  0712         5      NaN      XQ     1220  12182       0   
1        1       AK  20XX         3      NaN      XA     1220  29921    1209   
2        1       AK  20XX         3      NaN      XA     1220   2590      16   
3        1       AK  20XX         3      NaN      XC     1220  58967    7843   
4        1       AK  20XX         3      NaN      XC     1220   7201       1   

   FREIGHT_CHARGES   DF CONTCODE  MONTH  YEAR  Year  COMMODITY2  
0              461  1.0        X      7  2020  2020         NaN  
1              202  1.0        X      7  2020  2020         NaN  
2               74  2.0        X      7  2020  2020         NaN  
3              857  1.0        X      7  2020  2020         NaN  
4              133  2.0        X      7  2020  2020         NaN  


In [9]:
# Check the columns and their data types
combined_df.dtypes


TRDTYPE              int64
USASTATE            object
DEPE                object
DISAGMOT             int64
MEXSTATE            object
CANPROV             object
COUNTRY              int64
VALUE                int64
SHIPWT               int64
FREIGHT_CHARGES      int64
DF                 float64
CONTCODE            object
MONTH                int64
YEAR                 int64
Year                 int64
COMMODITY2         float64
dtype: object

In [10]:
# Display the last five rows of the DataFrame
combined_df.tail()


Unnamed: 0,TRDTYPE,USASTATE,DEPE,DISAGMOT,MEXSTATE,CANPROV,COUNTRY,VALUE,SHIPWT,FREIGHT_CHARGES,DF,CONTCODE,MONTH,YEAR,Year,COMMODITY2
2051246,2,,55XX,5,,,1220,4384342,7399,1883,,0,9,2024,2024,98.0
2051247,2,,55XX,8,,,1220,50211,6350,3500,,0,9,2024,2024,98.0
2051248,2,,60XX,8,,,1220,793390,80,500,,0,9,2024,2024,89.0
2051249,2,,70XX,8,,,1220,233990301,0,0,,0,9,2024,2024,99.0
2051250,2,,70XX,8,,,2010,224981722,0,0,,0,9,2024,2024,99.0


In [11]:
# check first five rows

combined_df.head()

Unnamed: 0,TRDTYPE,USASTATE,DEPE,DISAGMOT,MEXSTATE,CANPROV,COUNTRY,VALUE,SHIPWT,FREIGHT_CHARGES,DF,CONTCODE,MONTH,YEAR,Year,COMMODITY2
0,1,AK,0712,5,,XQ,1220,12182,0,461,1.0,X,7,2020,2020,
1,1,AK,20XX,3,,XA,1220,29921,1209,202,1.0,X,7,2020,2020,
2,1,AK,20XX,3,,XA,1220,2590,16,74,2.0,X,7,2020,2020,
3,1,AK,20XX,3,,XC,1220,58967,7843,857,1.0,X,7,2020,2020,
4,1,AK,20XX,3,,XC,1220,7201,1,133,2.0,X,7,2020,2020,


In [12]:
# Check the total number of rows and columns in the DataFrame
combined_df.shape


(2051251, 16)