In [1]:
import pandas as pd
from typing import List, Optional

def load_pickle(pickle_path: str) -> pd.DataFrame:
    """
    Load a DataFrame from a pickle file.

    :param pickle_path: Path to the pickle file.
    :return: Loaded pandas DataFrame.
    """
    df = pd.read_pickle(pickle_path)
    print(f"Loaded data from {pickle_path} with shape {df.shape}")
    return df

def basic_info(df: pd.DataFrame):
    """
    Display basic information about the DataFrame.

    :param df: pandas DataFrame.
    """
    print("\n--- Basic Info ---")
    print(df.info())
    print("\n--- First 5 rows ---")
    print(df.head())
    print("\n--- Data Description ---")
    print(df.describe(include='all'))

def missing_values_summary(df: pd.DataFrame):
    """
    Print a summary of missing values per column.

    :param df: pandas DataFrame.
    """
    print("\n--- Missing Values ---")
    missing = df.isna().sum()
    missing_percent = (missing / len(df)) * 100
    summary = pd.DataFrame({"missing_count": missing, "missing_percent": missing_percent})
    print(summary[summary["missing_count"] > 0])

def duplicates_summary(df: pd.DataFrame):
    """
    Print the number of duplicate rows.

    :param df: pandas DataFrame.
    """
    dup_count = df.duplicated().sum()
    print(f"\n--- Duplicates ---\nNumber of duplicate rows: {dup_count}")

def unique_values_summary(df: pd.DataFrame, columns: Optional[List[str]] = None):
    """
    Print the number of unique values per column or for selected columns.

    :param df: pandas DataFrame.
    :param columns: List of columns to check unique values for. If None, checks all columns.
    """
    columns_to_check = columns if columns else df.columns
    print("\n--- Unique Values ---")
    for col in columns_to_check:
        print(f"{col}: {df[col].nunique()} unique values")

def value_counts_summary(df: pd.DataFrame, columns: Optional[List[str]] = None, top_n: int = 10):
    """
    Print the top N frequent values for categorical columns.

    :param df: pandas DataFrame.
    :param columns: List of columns to analyze. If None, checks all object or category columns.
    :param top_n: Number of top frequent values to show.
    """
    if columns is None:
        columns = df.select_dtypes(include=['object', 'category']).columns

    print("\n--- Top Value Counts ---")
    for col in columns:
        print(f"\nColumn: {col}")
        print(df[col].value_counts().head(top_n))

def correlation_summary(df: pd.DataFrame):
    """
    Print correlation matrix for numeric columns only.
    """
    print("\n--- Correlation Matrix (Numeric Columns Only) ---")
    numeric_df = df.select_dtypes(include=['number'])
    if numeric_df.empty:
        print("No numeric columns to compute correlation.")
    else:
        print(numeric_df.corr())

In [2]:
# Load pickle
df_bluebikes = load_pickle("../data_pipeline/data/processed/bluebikes/raw_data.pkl")


Loaded data from ../data_pipeline/data/processed/bluebikes/raw_data.pkl with shape (3565820, 14)


In [24]:
basic_info(df_bluebikes)


--- Basic Info ---
<class 'pandas.core.frame.DataFrame'>
Index: 3565820 entries, 0 to 3568111
Data columns (total 14 columns):
 #   Column                   Dtype              
---  ------                   -----              
 0   ride_id                  object             
 1   rideable_type            object             
 2   start_time               datetime64[ns, UTC]
 3   stop_time                datetime64[ns, UTC]
 4   start_station_name       string             
 5   start_station_id         string             
 6   end_station_name         string             
 7   end_station_id           string             
 8   start_station_latitude   float64            
 9   start_station_longitude  float64            
 10  end_station_latitude     float64            
 11  end_station_longitude    float64            
 12  user_type                string             
 13  ëÀ¼ï                     object             
dtypes: datetime64[ns, UTC](2), float64(4), object(3), string(5)
memory 

In [3]:
missing_values_summary(df_bluebikes)


--- Missing Values ---
                  missing_count  missing_percent
start_station_id           1237         0.034690
end_station_id             3915         0.109792
ëÀ¼ï                    3565820       100.000000


In [4]:
duplicates_summary(df_bluebikes)


--- Duplicates ---
Number of duplicate rows: 0


In [8]:
unique_values_summary(df_bluebikes)


--- Unique Values ---
ride_id: 3565820 unique values
rideable_type: 2 unique values
start_time: 3565244 unique values
stop_time: 3564052 unique values
start_station_name: 599 unique values
start_station_id: 591 unique values
end_station_name: 600 unique values
end_station_id: 592 unique values
start_station_latitude: 178015 unique values
start_station_longitude: 199308 unique values
end_station_latitude: 655 unique values
end_station_longitude: 659 unique values
user_type: 2 unique values
ëÀ¼ï: 0 unique values


In [9]:
value_counts_summary(df_bluebikes)


--- Top Value Counts ---

Column: ride_id
ride_id
D39FCF4FF35223A0    1
215E35C14D594F94    1
FFCD2D3BA2776BEE    1
48EE2AAAB3450EED    1
CEBF456BE80209BF    1
F42C7FC3627D16D3    1
57C96B6DB6A2DA59    1
1B6BA97C1D513C60    1
6E4A19B4F9EA5957    1
3B1114528642D4F3    1
Name: count, dtype: int64

Column: rideable_type
rideable_type
classic_bike     2570910
electric_bike     994910
Name: count, dtype: int64

Column: ëÀ¼ï
Series([], Name: count, dtype: int64)


In [10]:
correlation_summary(df_bluebikes)


--- Correlation Matrix (Numeric Columns Only) ---
                         start_station_latitude  start_station_longitude  \
start_station_latitude                 1.000000                -0.001213   
start_station_longitude               -0.001213                 1.000000   
end_station_latitude                   0.749335                 0.019133   
end_station_longitude                  0.018261                 0.717820   

                         end_station_latitude  end_station_longitude  
start_station_latitude               0.749335               0.018261  
start_station_longitude              0.019133               0.717820  
end_station_latitude                 1.000000              -0.005286  
end_station_longitude               -0.005286               1.000000  


In [11]:
# Load pickle
df = load_pickle("../data_pipeline/data/processed/boston_clg/raw_data.pkl")


Loaded data from ../data_pipeline/data/processed/boston_clg/raw_data.pkl with shape (60, 28)


In [13]:
basic_info(df)


--- Basic Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 28 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   OBJECTID       60 non-null     int64  
 1   Match_type     60 non-null     object 
 2   Ref_ID         60 non-null     int64  
 3   ID1            44 non-null     object 
 4   Id             60 non-null     object 
 5   SchoolId       60 non-null     object 
 6   Name           60 non-null     object 
 7   Address        60 non-null     object 
 8   City           60 non-null     object 
 9   Zipcode        58 non-null     float64
 10  Contact        60 non-null     object 
 11  PhoneNumbe     60 non-null     object 
 12  YearBuilt      60 non-null     object 
 13  NumStories     60 non-null     object 
 14  Cost           60 non-null     float64
 15  NumStudent     60 non-null     float64
 16  BackupPowe     60 non-null     float64
 17  ShelterCap     60 non-null     objec

In [12]:
missing_values_summary(df)


--- Missing Values ---
              missing_count  missing_percent
ID1                      16        26.666667
Zipcode                   2         3.333333
X                         3         5.000000
Y                         3         5.000000
NumStudent12             22        36.666667
CampusHous               13        21.666667
URL                      12        20.000000
Address2013              60       100.000000


In [14]:
duplicates_summary(df)


--- Duplicates ---
Number of duplicate rows: 0


In [15]:
unique_values_summary(df)


--- Unique Values ---
OBJECTID: 60 unique values
Match_type: 4 unique values
Ref_ID: 1 unique values
ID1: 39 unique values
Id: 60 unique values
SchoolId: 44 unique values
Name: 59 unique values
Address: 60 unique values
City: 19 unique values
Zipcode: 53 unique values
Contact: 47 unique values
PhoneNumbe: 56 unique values
YearBuilt: 22 unique values
NumStories: 15 unique values
Cost: 56 unique values
NumStudent: 42 unique values
BackupPowe: 1 unique values
ShelterCap: 1 unique values
Latitude: 58 unique values
Longitude: 58 unique values
Comment: 58 unique values
X: 57 unique values
Y: 56 unique values
NumStudent12: 38 unique values
CampusHous: 17 unique values
NumStudents13: 43 unique values
URL: 46 unique values
Address2013: 0 unique values


In [16]:
value_counts_summary(df)


--- Top Value Counts ---

Column: Match_type
Match_type
NCES & Consortium    28
Consortium           16
NCES                 13
                      3
Name: count, dtype: int64

Column: ID1
ID1
                                                      5
300 The Fenway, Boston, Massachusetts 02115-5898      2
Fruit St, Boston, Massachusetts 02114-2696            1
250 New Rutherford Ave, Boston, Massachusetts 021*    1
36 1st Avenue, Boston, Massachusetts 02129-4557       1
8 Ashburton Place, Boston, Massachusetts 02108-27*    1
41 Berkeley St, Boston, Massachusetts 02116           1
230 The Fenway, Boston, Massachusetts 02115-5596      1
400 The Fenway, Boston, Massachusetts 02115           1
8 the Fenway, Boston, Massachusetts 02215             1
Name: count, dtype: int64

Column: Id
Id
98627    1
11959    1
16632    1
1949     1
9109     1
86640    1
86302    1
86125    1
15422    1
22285    1
Name: count, dtype: int64

Column: SchoolId
SchoolId
0         15
164845     2
167543     2
4

In [17]:
correlation_summary(df)


--- Correlation Matrix (Numeric Columns Only) ---
               OBJECTID  Ref_ID   Zipcode      Cost  NumStudent  BackupPowe  \
OBJECTID       1.000000     NaN -0.171385 -0.076258   -0.107168         NaN   
Ref_ID              NaN     NaN       NaN       NaN         NaN         NaN   
Zipcode       -0.171385     NaN  1.000000  0.190170    0.458679         NaN   
Cost          -0.076258     NaN  0.190170  1.000000    0.405653         NaN   
NumStudent    -0.107168     NaN  0.458679  0.405653    1.000000         NaN   
BackupPowe          NaN     NaN       NaN       NaN         NaN         NaN   
Latitude      -0.620160     NaN  0.188255  0.093444    0.098388         NaN   
Longitude      0.619804     NaN -0.188061 -0.093593   -0.098415         NaN   
X             -0.256362     NaN  0.103841  0.069055    0.011929         NaN   
Y             -0.361367     NaN  0.016144 -0.120060   -0.021006         NaN   
NumStudent12  -0.274917     NaN  0.600122  0.720564    0.820517         NaN   
N

In [18]:
# Load pickle
df = load_pickle("../data_pipeline/data/processed/NOAA_weather/raw_data.pkl")


Loaded data from ../data_pipeline/data/processed/NOAA_weather/raw_data.pkl with shape (3946, 4)


In [19]:
basic_info(df) 


--- Basic Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3946 entries, 0 to 3945
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    3946 non-null   object 
 1   PRCP    3946 non-null   float64
 2   TMAX    3946 non-null   float64
 3   TMIN    3946 non-null   float64
dtypes: float64(3), object(1)
memory usage: 123.4+ KB
None

--- First 5 rows ---
         date  PRCP  TMAX  TMIN
0  2015-01-01   0.0   0.6  -5.5
1  2015-01-02   0.0   5.0  -0.5
2  2015-01-03  15.7   2.8  -5.5
3  2015-01-04  14.5  11.1   1.7
4  2015-01-05   0.0  10.0  -8.2

--- Data Description ---
              date         PRCP         TMAX         TMIN
count         3946  3946.000000  3946.000000  3946.000000
unique        3946          NaN          NaN          NaN
top     2025-10-20          NaN          NaN          NaN
freq             1          NaN          NaN          NaN
mean           NaN     2.983730    16.118550     7.839128
std  

In [20]:
missing_values_summary(df)


--- Missing Values ---
Empty DataFrame
Columns: [missing_count, missing_percent]
Index: []


In [21]:
duplicates_summary(df)


--- Duplicates ---
Number of duplicate rows: 0


In [22]:
value_counts_summary(df)


--- Top Value Counts ---

Column: date
date
2025-10-20    1
2015-01-01    1
2015-01-02    1
2015-01-03    1
2015-01-04    1
2015-01-05    1
2015-01-06    1
2015-01-07    1
2015-01-08    1
2015-01-09    1
Name: count, dtype: int64


In [23]:
correlation_summary(df)


--- Correlation Matrix (Numeric Columns Only) ---
          PRCP      TMAX      TMIN
PRCP  1.000000 -0.026179  0.017909
TMAX -0.026179  1.000000  0.944070
TMIN  0.017909  0.944070  1.000000
