In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib

# Read the raw CSV; low_memory=False turns off pandas' chunked parsing.
df = pd.read_csv('C:\\Users\\poorn\\OneDrive\\Desktop\\deployment\\sample.csv', low_memory=False)

# Columns removed from the raw frame before any modelling work.
columns_to_drop = [
    'Route Type', 'Road Name', 'Report Number',
    'Vehicle First Impact Location', 'Vehicle Second Impact Location',
    'Crash Date/Time', 'Local Case Number', 'Agency Name',
    'ACRS Report Type', 'Municipality', 'Latitude', 'Longitude',
    'Location', 'Driverless Vehicle', 'Parked Vehicle',
    'Vehicle Year', 'Vehicle Make', 'Vehicle Model',
    'Equipment Problems', 'Non-Motorist Substance Abuse',
    'Person ID', 'Circumstance', 'Vehicle ID',
    'Vehicle Body Type', 'Drivers License State',
    'Cross-Street Name', 'Off-Road Description', 'Related Non-Motorist',
]
df1 = df.drop(columns=columns_to_drop)

# Columns kept for modelling, in the order they will appear in the frame.
important_columns = [
    'Cross-Street Type',
    'Collision Type',
    'Weather',
    'Surface Condition',
    'Light',
    'Traffic Control',
    'Driver Substance Abuse',
    'Driver At Fault',
    'Injury Severity',
    'Driver Distracted By',
    'Vehicle Damage Extent',
    'Vehicle Movement',
    'Vehicle Continuing Dir',
    'Vehicle Going Dir',
    'Speed Limit',
]

# Take an independent copy so later column assignments on df2 do not touch df1.
df2 = df1.loc[:, important_columns].copy()

# Fill null values column by column: median for numeric columns, most frequent
# value for everything else.
for col in df2.columns:
    column = df2[col]
    # is_numeric_dtype recognises every integer/float width (int32, float32,
    # nullable Int64, ...) rather than only float64/int64. Booleans are
    # excluded so they keep being filled with their most frequent value.
    if pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column):
        df2[col] = column.fillna(column.median())
    else:
        modes = column.mode()
        if modes.empty:
            # mode() is empty when every value is missing; indexing it would
            # raise a bare IndexError that does not name the offending column.
            raise ValueError(
                "Column {!r} has no non-null values to impute from.".format(col)
            )
        df2[col] = column.fillna(modes.iloc[0])

# Encode categorical columns. Each column gets its OWN fitted encoder, kept in
# label_encoders, so the category <-> integer mapping of every feature (and of
# the 'Injury Severity' target, when it is a text column) can be reproduced at
# prediction time. Re-fitting one shared encoder would discard every mapping
# except the last column's.
label_encoders = {}
for col in df2.select_dtypes(include='object').columns:
    label_encoder = LabelEncoder()
    df2[col] = label_encoder.fit_transform(df2[col])
    label_encoders[col] = label_encoder

# Persist the fitted encoders so they can be loaded wherever the trained model
# is used: raw category strings must be mapped to the same integer codes, and
# predicted codes mapped back to labels via inverse_transform.
encoders_path = 'label_encoders.pkl'
with open(encoders_path, 'wb') as encoders_file:
    joblib.dump(label_encoders, encoders_file)

# Separate the label column from the predictor columns.
target_column = 'Injury Severity'
y = df2[target_column]
X = df2.drop(columns=[target_column])

# Hold out one third of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier on the scaled training data; fit() returns the estimator itself.
logistic_regression_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Destination files for the trained model, the fitted scaler and the ordered
# list of feature column names.
model_path = 'model.pkl'
scaler_path = 'scaler.pkl'
features_path = 'features.pkl'

# Pair every destination with the object written to it, then dump them in order.
artifacts = [
    (model_path, logistic_regression_model),
    (scaler_path, scaler),
    (features_path, X.columns.tolist()),
]
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as artifact_file:
        joblib.dump(artifact, artifact_file)

print("Model, scaler, and feature names saved successfully.")


Model, scaler, and feature names saved successfully.


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import joblib

# Read the raw CSV; low_memory=False turns off pandas' chunked parsing.
df = pd.read_csv('C:\\Users\\poorn\\OneDrive\\Desktop\\deployment\\sample.csv', low_memory=False)

# Columns removed from the raw frame before any modelling work.
columns_to_drop = [
    'Route Type', 'Road Name', 'Report Number',
    'Vehicle First Impact Location', 'Vehicle Second Impact Location',
    'Crash Date/Time', 'Local Case Number', 'Agency Name',
    'ACRS Report Type', 'Municipality', 'Latitude', 'Longitude',
    'Location', 'Driverless Vehicle', 'Parked Vehicle',
    'Vehicle Year', 'Vehicle Make', 'Vehicle Model',
    'Equipment Problems', 'Non-Motorist Substance Abuse',
    'Person ID', 'Circumstance', 'Vehicle ID',
    'Vehicle Body Type', 'Drivers License State',
    'Cross-Street Name', 'Off-Road Description', 'Related Non-Motorist',
]
df1 = df.drop(columns=columns_to_drop)

# Columns kept for modelling, in the order they will appear in the frame.
important_columns = [
    'Cross-Street Type',
    'Collision Type',
    'Weather',
    'Surface Condition',
    'Light',
    'Traffic Control',
    'Driver Substance Abuse',
    'Driver At Fault',
    'Injury Severity',
    'Driver Distracted By',
    'Vehicle Damage Extent',
    'Vehicle Movement',
    'Vehicle Continuing Dir',
    'Vehicle Going Dir',
    'Speed Limit',
]

# Take an independent copy so later column assignments on df2 do not touch df1.
df2 = df1.loc[:, important_columns].copy()

# Fill null values column by column: median for numeric columns, most frequent
# value for everything else.
for col in df2.columns:
    column = df2[col]
    # is_numeric_dtype recognises every integer/float width (int32, float32,
    # nullable Int64, ...) rather than only float64/int64. Booleans are
    # excluded so they keep being filled with their most frequent value.
    if pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column):
        df2[col] = column.fillna(column.median())
    else:
        modes = column.mode()
        if modes.empty:
            # mode() is empty when every value is missing; indexing it would
            # raise a bare IndexError that does not name the offending column.
            raise ValueError(
                "Column {!r} has no non-null values to impute from.".format(col)
            )
        df2[col] = column.fillna(modes.iloc[0])

# Encode categorical columns. Each column gets its OWN fitted encoder, kept in
# label_encoders, so the category <-> integer mapping of every feature (and of
# the 'Injury Severity' target, when it is a text column) can be reproduced at
# prediction time. Re-fitting one shared encoder would discard every mapping
# except the last column's.
label_encoders = {}
for col in df2.select_dtypes(include='object').columns:
    label_encoder = LabelEncoder()
    df2[col] = label_encoder.fit_transform(df2[col])
    label_encoders[col] = label_encoder

# Persist the fitted encoders so they can be loaded wherever the trained model
# is used: raw category strings must be mapped to the same integer codes, and
# predicted codes mapped back to labels via inverse_transform.
encoders_path = 'label_encoders.pkl'
with open(encoders_path, 'wb') as encoders_file:
    joblib.dump(label_encoders, encoders_file)

# Split data into features and target variable
X = df2.drop('Injury Severity', axis=1)
y = df2['Injury Severity']

# Split data into training and testing sets (one third held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=42)

# Scale BEFORE oversampling, and fit the scaler on the real training rows only.
# SMOTE builds synthetic rows from nearest neighbours, so on unscaled data the
# neighbour search is dominated by whichever columns have the largest numeric
# range. Fitting the scaler on real rows also means the saved scaler describes
# the distribution the model will actually see at prediction time, not the
# artificially rebalanced one.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to balance the classes of the (already scaled) training set.
# The test set is deliberately left untouched.
# NOTE(review): the label-encoded categorical columns are still interpolated
# as if they were continuous; SMOTENC may be a better fit - confirm.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# Initialize and train Logistic Regression model on the scaled, balanced data.
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train_balanced, y_train_balanced)

# Destination files for the trained model, the fitted scaler and the ordered
# list of feature column names.
model_path = 'model.pkl'
scaler_path = 'scaler.pkl'
features_path = 'features.pkl'

# Pair every destination with the object written to it, then dump them in order.
artifacts = [
    (model_path, logistic_regression_model),
    (scaler_path, scaler),
    (features_path, X.columns.tolist()),
]
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as artifact_file:
        joblib.dump(artifact, artifact_file)

print("Model, scaler, and feature names saved successfully.")


Model, scaler, and feature names saved successfully.
