In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Load the dataset
file_path = 'Resources/US_Accidents_2023_Phoenix_Metro2.csv'
df = pd.read_csv(file_path, low_memory=False)
df.head()

Unnamed: 0,Accident_Date,Accident_Time,ID,Severity,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Description,...,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Stop,Traffic_Calming,Traffic_Signal,Day_or_Night
0,2022-05-04,00:35:00,A-4620667,2,33.457822,-112.364291,33.457752,-112.352981,0.652,Crash on the right shoulder on I-10 Eastbound ...,...,False,False,False,False,False,False,False,False,False,Night
1,2022-01-10,08:02:00,A-752668,2,33.494709,-112.056328,,,0.0,Accident on 12th St at Indian School Rd.,...,False,True,False,False,False,False,False,False,True,Day
2,2022-04-24,23:29:00,A-5410011,2,33.406713,-112.061288,33.4041,-112.061327,0.181,Incident on S 8TH ST near E BROADWAY RD Expect...,...,False,False,False,False,False,False,True,False,False,Night
3,2022-01-03,15:00:00,A-4452285,2,33.497898,-112.186309,33.498668,-112.186296,0.053,Incident on N 59TH AVE near W GLENROSA AVE Exp...,...,False,False,False,False,False,False,False,False,False,Day
4,2022-04-06,05:21:00,A-658916,2,33.552662,-112.209618,,,0.0,Crash on Northern Ave at 70th Ave.,...,False,False,False,False,False,False,True,False,False,Night


In [3]:
# combining the loan statuses and balancing both to 2500 to see 
# difference in accuracy 
# df = pd.concat([df[df.loan_status==0][:2500],df[df.loan_status==1]])

In [4]:
nan_counts = df.isna().sum()
nan_counts

Accident_Date          0
Accident_Time          0
ID                     0
Severity               0
Start_Lat              0
Start_Lng              0
End_Lat              197
End_Lng              197
Distance(mi)           0
Description            0
Street                 7
City                   0
County                 0
State                  0
Zipcode                0
Country                0
Timezone               0
Airport_Code           1
Temperature(F)         4
Wind_Chill(F)         18
Humidity(%)            5
Pressure(in)           4
Visibility(mi)         2
Wind_Direction        16
Wind_Speed(mph)       16
Precipitation(in)     89
Weather_Condition     13
Amenity                0
Bump                   0
Crossing               0
Give_Way               0
Junction               0
No_Exit                0
Railway                0
Stop                   0
Traffic_Calming        0
Traffic_Signal         0
Day_or_Night           0
dtype: int64

In [5]:
## Drop End Lat&Long Columns 197 - Not relevent to the data
df.drop(columns=['End_Lat','End_Lng'], inplace=True)
df

Unnamed: 0,Accident_Date,Accident_Time,ID,Severity,Start_Lat,Start_Lng,Distance(mi),Description,Street,City,...,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Stop,Traffic_Calming,Traffic_Signal,Day_or_Night
0,2022-05-04,00:35:00,A-4620667,2,33.457822,-112.364291,0.652,Crash on the right shoulder on I-10 Eastbound ...,I-10,Goodyear,...,False,False,False,False,False,False,False,False,False,Night
1,2022-01-10,08:02:00,A-752668,2,33.494709,-112.056328,0.000,Accident on 12th St at Indian School Rd.,N 12th St,Phoenix,...,False,True,False,False,False,False,False,False,True,Day
2,2022-04-24,23:29:00,A-5410011,2,33.406713,-112.061288,0.181,Incident on S 8TH ST near E BROADWAY RD Expect...,S 8th Pl,Phoenix,...,False,False,False,False,False,False,True,False,False,Night
3,2022-01-03,15:00:00,A-4452285,2,33.497898,-112.186309,0.053,Incident on N 59TH AVE near W GLENROSA AVE Exp...,N 59th Ave,Phoenix,...,False,False,False,False,False,False,False,False,False,Day
4,2022-04-06,05:21:00,A-658916,2,33.552662,-112.209618,0.000,Crash on Northern Ave at 70th Ave.,N 70th Ave,Glendale,...,False,False,False,False,False,False,True,False,False,Night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,2022-04-22,21:49:00,A-3730924,2,33.457907,-112.316140,0.728,Crash on the right shoulder on I-10 Westbound ...,I-10 W,Avondale,...,False,False,False,False,False,False,False,False,False,Night
872,2022-05-13,18:52:00,A-4975964,2,33.461436,-112.077271,1.554,Stationary traffic on Papago Fwy E - I-10 E - ...,Hance Deck Park Tunnel,Phoenix,...,False,False,False,False,False,False,False,False,False,Day
873,2022-05-03,22:51:00,A-5275186,2,33.671411,-112.113000,0.930,Incident on N I-17 near W ROSE GARDEN LN Expec...,Black Canyon Fwy N,Phoenix,...,False,False,False,True,False,False,False,False,False,Night
874,2022-02-21,13:41:00,A-5368012,2,33.424577,-111.891336,1.490,Slow traffic on Price Fwy S - Loop 101 S - AZ-...,AZ-101 Loop S,Tempe,...,False,False,False,False,False,False,False,False,False,Day


In [6]:
nan_counts = df.isna().sum()
nan_counts

Accident_Date         0
Accident_Time         0
ID                    0
Severity              0
Start_Lat             0
Start_Lng             0
Distance(mi)          0
Description           0
Street                7
City                  0
County                0
State                 0
Zipcode               0
Country               0
Timezone              0
Airport_Code          1
Temperature(F)        4
Wind_Chill(F)        18
Humidity(%)           5
Pressure(in)          4
Visibility(mi)        2
Wind_Direction       16
Wind_Speed(mph)      16
Precipitation(in)    89
Weather_Condition    13
Amenity               0
Bump                  0
Crossing              0
Give_Way              0
Junction              0
No_Exit               0
Railway               0
Stop                  0
Traffic_Calming       0
Traffic_Signal        0
Day_or_Night          0
dtype: int64

In [7]:
# df.info()

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [8]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df.Severity

# Separate the X variable, the features
X = df.drop('Severity', axis=1)

In [9]:
# Review the y variable Series
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Severity, dtype: int64

In [10]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,Accident_Date,Accident_Time,ID,Start_Lat,Start_Lng,Distance(mi),Description,Street,City,County,...,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Stop,Traffic_Calming,Traffic_Signal,Day_or_Night
0,2022-05-04,00:35:00,A-4620667,33.457822,-112.364291,0.652,Crash on the right shoulder on I-10 Eastbound ...,I-10,Goodyear,Maricopa,...,False,False,False,False,False,False,False,False,False,Night
1,2022-01-10,08:02:00,A-752668,33.494709,-112.056328,0.0,Accident on 12th St at Indian School Rd.,N 12th St,Phoenix,Maricopa,...,False,True,False,False,False,False,False,False,True,Day
2,2022-04-24,23:29:00,A-5410011,33.406713,-112.061288,0.181,Incident on S 8TH ST near E BROADWAY RD Expect...,S 8th Pl,Phoenix,Maricopa,...,False,False,False,False,False,False,True,False,False,Night
3,2022-01-03,15:00:00,A-4452285,33.497898,-112.186309,0.053,Incident on N 59TH AVE near W GLENROSA AVE Exp...,N 59th Ave,Phoenix,Maricopa,...,False,False,False,False,False,False,False,False,False,Day
4,2022-04-06,05:21:00,A-658916,33.552662,-112.209618,0.0,Crash on Northern Ave at 70th Ave.,N 70th Ave,Glendale,Maricopa,...,False,False,False,False,False,False,True,False,False,Night


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [11]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
# shift+tab for example
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [12]:
X_train

Unnamed: 0,Accident_Date,Accident_Time,ID,Start_Lat,Start_Lng,Distance(mi),Description,Street,City,County,...,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Stop,Traffic_Calming,Traffic_Signal,Day_or_Night
350,2022-04-03,19:21:00,A-5177383,33.385948,-111.863818,0.950,Slow traffic on Superstition Fwy W - US-60 W f...,Superstition Fwy W,Mesa,Maricopa,...,False,False,False,False,False,False,False,False,False,Night
0,2022-05-04,00:35:00,A-4620667,33.457822,-112.364291,0.652,Crash on the right shoulder on I-10 Eastbound ...,I-10,Goodyear,Maricopa,...,False,False,False,False,False,False,False,False,False,Night
685,2022-12-02,09:37:00,A-4710393,33.643446,-112.356318,0.654,Stationary traffic on US-60 W from Bell Rd (Gr...,W Grand Ave,Surprise,Maricopa,...,False,False,False,False,False,False,False,False,False,Day
224,2022-05-26,06:49:00,A-4080919,33.462231,-112.062905,0.129,Crash in the median on I-10 Westbound near 3rd...,I-10 W,Phoenix,Maricopa,...,False,False,False,False,False,False,False,False,False,Day
74,2022-01-21,18:06:00,A-5049073,33.461903,-112.076528,0.799,Slow traffic on I-10 W - Pearl Harbor Memorial...,I-10,Phoenix,Maricopa,...,False,False,False,False,False,False,False,False,False,Night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-01-27,08:48:00,A-4009509,33.386060,-111.849189,1.794,Slow traffic on Superstition Fwy W - US-60 W f...,Superstition Fwy W,Mesa,Maricopa,...,False,False,False,False,False,False,False,False,False,Day
767,2022-06-28,07:57:00,A-5245786,33.455761,-112.203412,0.043,Incident on N 67TH AVE near W MCKINLEY ST Expe...,N 67th Ave,Phoenix,Maricopa,...,False,False,False,False,False,False,False,False,False,Day
72,2022-12-17,09:25:00,A-5461269,33.335144,-111.859060,0.007,Incident on N ALMA SCHOOL RD near W WARNER RD ...,N Alma School Rd,Chandler,Maricopa,...,False,True,False,False,False,False,False,False,True,Day
235,2022-06-12,21:49:00,A-4018583,33.435827,-112.583481,2.042,Slow traffic on Papago Fwy E - I-10 E - Pearl ...,I-10,Buckeye,Maricopa,...,False,False,False,False,False,False,False,False,False,Night


In [13]:
y_train

350    2
0      2
685    2
224    2
74     2
      ..
715    2
767    2
72     2
235    2
37     2
Name: Severity, Length: 657, dtype: int64

In [14]:
#display ration
len(X_train)/len(X)

0.75

---

In [15]:
#display ration
len(X_test)/len(X)

0.25

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [19]:
print(df.dtypes)

Accident_Date          int64
Accident_Time         object
ID                    object
Severity               int64
Start_Lat            float64
Start_Lng            float64
Distance(mi)         float64
Description           object
Street                object
City                  object
County                object
State                 object
Zipcode                int64
Country               object
Timezone              object
Airport_Code          object
Temperature(F)       float64
Wind_Chill(F)        float64
Humidity(%)          float64
Pressure(in)         float64
Visibility(mi)       float64
Wind_Direction        object
Wind_Speed(mph)      float64
Precipitation(in)    float64
Weather_Condition     object
Amenity                 bool
Bump                    bool
Crossing                bool
Give_Way                bool
Junction                bool
No_Exit                 bool
Railway                 bool
Stop                    bool
Traffic_Calming         bool
Traffic_Signal

In [20]:
# Identify non-numeric columns
non_numeric_columns = df.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

# One-hot encode categorical columns (if any)
df = pd.get_dummies(df, columns=non_numeric_columns, drop_first=True)

# Verify all columns are numeric now
print(df.dtypes)

Non-numeric columns: Index(['Accident_Time', 'ID', 'Description', 'Street', 'City', 'County',
       'State', 'Country', 'Timezone', 'Airport_Code', 'Wind_Direction',
       'Weather_Condition', 'Day_or_Night'],
      dtype='object')
Accident_Date                                int64
Severity                                     int64
Start_Lat                                  float64
Start_Lng                                  float64
Distance(mi)                               float64
                                            ...   
Weather_Condition_Partly Cloudy / Windy      uint8
Weather_Condition_Rain / Windy               uint8
Weather_Condition_T-Storm                    uint8
Weather_Condition_Thunder                    uint8
Day_or_Night_Night                           uint8
Length: 2658, dtype: object


In [16]:
# response to ValueError: could not convert string to float: '2022-04-03'
# Convert date columns to datetime and then to numerical values (e.g., ordinal)
df['Accident_Date'] = pd.to_datetime(df['Accident_Date']).map(pd.Timestamp.toordinal)
print(df['Accident_Date'])

0      738279
1      738165
2      738269
3      738158
4      738251
        ...  
871    738267
872    738288
873    738278
874    738207
875    738246
Name: Accident_Date, Length: 876, dtype: int64


In [18]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the 
# LogisticRegression shift+tab to learn and example
clf = LogisticRegression(random_state=1)

# Fit the model using training data
clf.fit(X_train, y_train)

ValueError: could not convert string to float: '2022-04-03'

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [None]:
y_test

In [None]:
# Make a prediction using the testing data
# prior to this, model only knows X_train... value of X_test = y_test (0/1)
# when pred on X_test, goal is to give the exact values of what's in y_test
pred = clf.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Generate a confusion matrix for the model
# matrix to see model accuracy
# true = y_test, prediction = pred

# top left = true positive (0)
# bottom right = true negative (1)
# the are accurate prediction values 

# if pred was 100% accurate, top right and bottom left values would be 0


print(confusion_matrix(y_test, pred))

In [None]:
# Print the classification report for the model
# Use recall and f1-score to determine accuracy
print(classification_report(y_test, pred))

In [None]:
# why is loan_status of 1 accuracy less than loan_status of 0
# imbalanced data (more to test for 0 vs 1)
y.value_counts()

### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** Based on the current state of data, the logistic regression model predicts at a 99-100% accuracy for loan_status 0 (healthy loan) according to the recall and f1-score and it predicts at a 88-91% accuracy for loan_status 1 (high-risk loan) according to the recall and f1-score. For the healthy loan, these number shows that the model predicts very well and for the high-risk loan, the prediction is not as strong.

---