In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [2]:
df=pd.read_csv('final_data2.csv')

In [3]:
df.head()

Unnamed: 0,delay,truck_age,load_capacity_pounds,mileage_mpg,age,experience,ratings,average_speed_mph,distance,average_hours,...,city_description_y_Patchy light rain,city_description_y_Patchy light rain with thunder,city_description_y_Patchy light snow,city_description_y_Patchy moderate snow,city_description_y_Patchy rain possible,city_description_y_Patchy sleet possible,city_description_y_Patchy snow possible,city_description_y_Sunny,city_description_y_Thundery outbreaks possible,city_description_y_Torrential rain shower
0,0,9.0,3000.0,23.0,48.0,9.0,7.0,57.36,310.75,6.22,...,False,False,False,False,False,False,False,False,False,False
1,1,10.0,15000.0,27.0,45.0,8.0,3.0,60.05,1231.985998,24.6398,...,False,False,False,False,False,False,False,False,False,False
2,0,11.0,20000.0,26.0,50.0,7.0,8.0,60.5,1231.985998,24.6398,...,False,False,False,False,False,False,False,False,False,False
3,0,8.0,4000.0,28.0,53.0,22.0,9.0,63.94,1231.985998,24.6398,...,False,False,False,False,False,False,False,False,False,False
4,0,12.0,15000.0,23.0,52.0,18.0,7.0,56.28,1231.985998,24.6398,...,False,False,False,False,False,False,False,False,False,False


In [4]:
print(df.isnull().sum())  # Sum of NaNs


delay                                             0
truck_age                                         0
load_capacity_pounds                              0
mileage_mpg                                       0
age                                               0
                                                 ..
city_description_y_Patchy sleet possible          0
city_description_y_Patchy snow possible           0
city_description_y_Sunny                          0
city_description_y_Thundery outbreaks possible    0
city_description_y_Torrential rain shower         0
Length: 117, dtype: int64


In [5]:
df.info(verbose=True)  # Shows all columns and their counts

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12607 entries, 0 to 12606
Data columns (total 117 columns):
 #    Column                                                       Dtype  
---   ------                                                       -----  
 0    delay                                                        int64  
 1    truck_age                                                    float64
 2    load_capacity_pounds                                         float64
 3    mileage_mpg                                                  float64
 4    age                                                          float64
 5    experience                                                   float64
 6    ratings                                                      float64
 7    average_speed_mph                                            float64
 8    distance                                                     float64
 9    average_hours                                              

truck_id, vechicle_no, route_id, 

In [6]:
from sklearn.preprocessing import StandardScaler

numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_columns.remove('delay')
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Selecting features and target variable
X = df.drop(columns=['delay'])  # Features (all except 'delay')
y = df['delay']                 # Target variable

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10085 entries, 6934 to 7270
Columns: 116 entries, truck_age to city_description_y_Torrential rain shower
dtypes: bool(76), float64(40)
memory usage: 3.9 MB


In [9]:
y

0        0
1        1
2        0
3        0
4        0
        ..
12602    0
12603    1
12604    0
12605    0
12606    0
Name: delay, Length: 12607, dtype: int64

In [10]:
import mlflow
mlflow.set_experiment("Truck Delay prediction")
with mlflow.start_run():
    mlflow.log_param("max_iter",1000)
    mlflow.log_param("random_state",42)

    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    log_reg_accuracy = accuracy_score(y_test, y_pred)

    mlflow.log_metric("log_reg_accuracy",log_reg_accuracy)
    mlflow.sklearn.log_model(model,"Logistic Regression model") # model, foldername



In [11]:
# Logistic Regression Model
model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

log_reg_accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {log_reg_accuracy:.4f}")



Logistic Regression Accuracy: 0.7379


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

# Define the model
model = LogisticRegression()

# Define the parameter grid
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'max_iter': [200, 500],
}

random_search = RandomizedSearchCV(
    estimator=model, 
    param_distributions=param_grid, 
    n_iter=10,  # number of parameter settings sampled
    scoring='accuracy', 
    cv=3, 
    verbose=1, 
    n_jobs=-1,
    random_state=42
)

# Perform the random search
random_search.fit(X_train, y_train)

# Best parameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

print("Best Parameters:", best_params)

# Evaluate the best model on test data
y_pred = best_model.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy with Random Search: {log_reg_accuracy:.4f}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'solver': 'saga', 'penalty': 'l1', 'max_iter': 200, 'C': 1}
Logistic Regression Accuracy with Random Search: 0.7375




In [13]:
import mlflow

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300],
}

random_search = RandomizedSearchCV(
    estimator=model, 
    param_distributions=param_grid, 
    n_iter=10,  # number of parameter settings sampled
    scoring='accuracy', 
    cv=3, 
    verbose=1, 
    n_jobs=-1,
    random_state=42
)

mlflow.set_experiment("Truck Delay prediction")
with mlflow.start_run():
    random_search.fit(X_train, y_train)

    best_params = random_search.best_params_
    best_model = random_search.best_estimator_
    mlflow.log_param("penalty", best_params['penalty'])
    mlflow.log_param("C",best_params['C'])
    mlflow.log_param("solver",best_params['solver'])
    mlflow.log_param("max_iter",best_params['max_iter'])

    # Evaluate the best model on test data
    y_pred = best_model.predict(X_test)
    log_reg_accuracy = accuracy_score(y_test, y_pred)
    

    mlflow.log_metric("log_reg_accuracy",log_reg_accuracy)
    mlflow.sklearn.log_model(best_model,"Logistic Regression model") # model, foldername

Fitting 3 folds for each of 10 candidates, totalling 30 fits




DONE

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Selecting features and target variable
X = df.drop(columns=['delay'])  # Features (all except 'delay')
y = df['delay']                 # Target variable

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Get the feature names (from X columns)
selected_features = X.columns

# Feature importance
importances = pd.DataFrame({'Feature': selected_features, 'Importance': model.feature_importances_})

# Print the importance sorted in descending order
print(importances.sort_values(by="Importance", ascending=False))


                                               Feature  Importance
7                                             distance    0.096919
8                                        average_hours    0.079429
26                                   city_visibility_y    0.059004
15                                         city_temp_x    0.043627
22                                         city_temp_y    0.039111
..                                                 ...         ...
73           city_description_x_Torrential rain shower    0.000004
98   city_description_y_Moderate or heavy snow with...    0.000000
95   city_description_y_Moderate or heavy showers o...    0.000000
94   city_description_y_Moderate or heavy rain with...    0.000000
111           city_description_y_Patchy sleet possible    0.000000

[116 rows x 2 columns]
