In [1]:
# Reload the dataset
import pandas as pd

# Load the feature-engineered dataset from the processed folder
feature_engineered_data_path = "C:/project/Freight_Analysis_Framework-/data/processed/freight_analysis_feature_engineered.csv"
regional_data = pd.read_csv(feature_engineered_data_path)

# Step 1: Identify Missing Values
print("Missing values per column (before handling):")
print(regional_data.isnull().sum())

# Step 2: Handle Missing Values
# For numerical columns, fill with the column mean
numerical_columns = regional_data.select_dtypes(include=['float64', 'int64']).columns
regional_data[numerical_columns] = regional_data[numerical_columns].fillna(regional_data[numerical_columns].mean())

# For categorical columns, fill with the mode
categorical_columns = regional_data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    regional_data[col].fillna(regional_data[col].mode()[0], inplace=True)

# Step 3: Verify Missing Values
print("\nMissing values per column (after handling):")
print(regional_data.isnull().sum().sum())

# Save the cleaned dataset back for modeling
cleaned_data_path = "C:/project/Freight_Analysis_Framework-/data/processed/freight_analysis_cleaned.csv"
regional_data.to_csv(cleaned_data_path, index=False)
print(f"\nCleaned dataset saved to {cleaned_data_path}.")


Missing values per column (before handling):
fr_orig           0
dms_orig          0
dms_dest          0
fr_dest           0
dms_mode          0
                 ..
fr_outmode_3.0    0
fr_outmode_4.0    0
fr_outmode_5.0    0
fr_outmode_6.0    0
fr_outmode_7.0    0
Length: 64, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  regional_data[col].fillna(regional_data[col].mode()[0], inplace=True)



Missing values per column (after handling):
0

Cleaned dataset saved to C:/project/Freight_Analysis_Framework-/data/processed/freight_analysis_cleaned.csv.


In [2]:
# Reload the cleaned dataset
cleaned_data_path = "C:/project/Freight_Analysis_Framework-/data/processed/freight_analysis_cleaned.csv"
regional_data_cleaned = pd.read_csv(cleaned_data_path)

# Check the first few rows to ensure it loaded correctly
print("First few rows of the cleaned dataset:")
print(regional_data_cleaned.head())



First few rows of the cleaned dataset:
   fr_orig  dms_orig  dms_dest  fr_dest  dms_mode  sctg2  trade_type  \
0    804.0        11        11    801.0         1      1           1   
1    804.0        11        19    801.0         1      1           1   
2    804.0        11       129    801.0         1      1           1   
3    804.0        11       131    801.0         1      1           1   
4    804.0        11       139    801.0         1      1           1   

   tons_2012  tons_2013  tons_2014  ...  fr_inmode_4.0  fr_inmode_5.0  \
0     0.3231    18.2865    19.7980  ...          False          False   
1     0.3231   218.1548   220.2783  ...          False          False   
2     0.3231     0.8870     0.8371  ...          False          False   
3     0.3231     6.5007     5.7015  ...          False          False   
4     0.3231     2.8717     2.4586  ...          False          False   

   fr_inmode_6.0  fr_inmode_7.0  fr_outmode_2.0  fr_outmode_3.0  \
0          False      

In [3]:
col_dtypes=regional_data_cleaned.dtypes
object_columns=col_dtypes[col_dtypes=='object'].index.tolist()
object_columns

['Volume_Category_2012', 'Value_Category_2012']

In [4]:
# One-hot encode the object columns
regional_data_cleaned = pd.get_dummies(regional_data_cleaned, columns=['Volume_Category_2012', 'Value_Category_2012'], drop_first=True)

# Verify the changes
print("Dataset after encoding object columns:")
print(regional_data_cleaned.info())


Dataset after encoding object columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1660972 entries, 0 to 1660971
Data columns (total 62 columns):
 #   Column                             Non-Null Count    Dtype  
---  ------                             --------------    -----  
 0   fr_orig                            1660972 non-null  float64
 1   dms_orig                           1660972 non-null  int64  
 2   dms_dest                           1660972 non-null  int64  
 3   fr_dest                            1660972 non-null  float64
 4   dms_mode                           1660972 non-null  int64  
 5   sctg2                              1660972 non-null  int64  
 6   trade_type                         1660972 non-null  int64  
 7   tons_2012                          1660972 non-null  float64
 8   tons_2013                          1660972 non-null  float64
 9   tons_2014                          1660972 non-null  float64
 10  tons_2015                          1660972 non-null

In [5]:
col_dtypes=regional_data_cleaned.dtypes
object_columns=col_dtypes[col_dtypes=='object'].index.tolist()
object_columns

[]

In [6]:
# Define features (X) and target (y)
X = regional_data_cleaned.drop(columns=['value_2012'])  # Assuming 'value_2012' is the target
y = regional_data_cleaned['value_2012']

# Perform train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Output shapes of the datasets
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"Training set shape: X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Testing set shape: X_test: {X_test.shape}, y_test: {y_test.shape}")


Features (X) shape: (1660972, 61)
Target (y) shape: (1660972,)
Training set shape: X_train: (1328777, 61), y_train: (1328777,)
Testing set shape: X_test: (332195, 61), y_test: (332195,)


In [8]:
import numpy as np

# Check for infinite values in X_train
print("Infinite values in X_train:", np.isinf(X_train).sum().sum())

# Replace infinite values with NaN, then fill NaN with the column mean
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill NaN values with the mean of the column
X_train.fillna(X_train.mean(), inplace=True)
X_test.fillna(X_test.mean(), inplace=True)

# Verify the data again
print("Infinite values in X_train (after handling):", np.isinf(X_train).sum().sum())
print("Missing values in X_train (after handling):", X_train.isnull().sum().sum())


Infinite values in X_train: 979115
Infinite values in X_train (after handling): 0
Missing values in X_train (after handling): 0


In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the Linear Regression model
lr_model = LinearRegression()

# Train the model
lr_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test)

# Evaluate the model (fixing the error)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # Calculate RMSE directly
r2 = r2_score(y_test, y_pred)  # Calculate R-squared

# Output the evaluation metrics
print(f"Linear Regression RMSE: {rmse:.4f}")  # Format RMSE to 4 decimal places
print(f"Linear Regression R²: {r2:.4f}")

Linear Regression RMSE: 0.0597
Linear Regression R²: 0.9948
