In [59]:
pip install scikit-learn pandas matplotlib

Note: you may need to restart the kernel to use updated packages.


In [60]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, classification_report, confusion_matrix)

In [61]:
import pandas as pd

file_path = "/Users/ibrahim/Desktop/US_Accidents_March23 2.csv"

# NOTE: because chunksize is set, read_csv returns a lazy TextFileReader (an
# iterator yielding DataFrames of up to 100000 rows each), not a DataFrame.
# Drop the chunksize argument to load everything at once if it fits in memory.
df = pd.read_csv(filepath_or_buffer=file_path, chunksize=100_000)



In [64]:
import pandas as pd

chunk_size = 100000  # rows parsed per chunk; adjust based on memory capacity
file_path = "/Users/ibrahim/Desktop/US_Accidents_March23 2.csv"

# Read the CSV in chunks of chunk_size rows and stitch them into one DataFrame.
# - The with-block closes the underlying file handle even if parsing fails midway.
# - pd.concat consumes the reader directly, so no list of chunk frames stays
#   referenced next to df once the cell finishes.
# - To clean or filter per chunk, pass a generator expression over `chunks`
#   (applying the transform to each chunk) to pd.concat instead of `chunks` itself.
with pd.read_csv(file_path, chunksize=chunk_size) as chunks:
    df = pd.concat(chunks, ignore_index=True)  # ignore_index yields a clean 0..n-1 index

In [65]:
# Report the size of the loaded frame: row count first, then column count.
rows = len(df.index)
columns = len(df.columns)
print(f"Number of rows: {rows}", f"Number of columns: {columns}", sep="\n")


Number of rows: 7728394
Number of columns: 46


In [66]:
# Print a structural summary of the frame: index range, column names and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 46 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   Severity               int64  
 3   Start_Time             object 
 4   End_Time               object 
 5   Start_Lat              float64
 6   Start_Lng              float64
 7   End_Lat                float64
 8   End_Lng                float64
 9   Distance(mi)           float64
 10  Description            object 
 11  Street                 object 
 12  City                   object 
 13  County                 object 
 14  State                  object 
 15  Zipcode                object 
 16  Country                object 
 17  Timezone               object 
 18  Airport_Code           object 
 19  Weather_Timestamp      object 
 20  Temperature(F)         float64
 21  Wind_Chill(F)          float64
 22  Humidity(%)       

In [67]:
# Per-column count of missing entries, computed once and reused for the total.
missing_counts = df.isnull().sum()
print("Missing values per column:")
print(missing_counts)
print("Total of Null Values:")
# Summing the per-column counts gives the grand total without scanning the
# whole frame for nulls a second time.
print(missing_counts.sum())

Missing values per column:
ID                             0
Source                         0
Severity                       0
Start_Time                     0
End_Time                       0
Start_Lat                      0
Start_Lng                      0
End_Lat                  3402762
End_Lng                  3402762
Distance(mi)                   0
Description                    5
Street                     10869
City                         253
County                         0
State                          0
Zipcode                     1915
Country                        0
Timezone                    7808
Airport_Code               22635
Weather_Timestamp         120228
Temperature(F)            163853
Wind_Chill(F)            1999019
Humidity(%)               174144
Pressure(in)              140679
Visibility(mi)            177098
Wind_Direction            175206
Wind_Speed(mph)           571233
Precipitation(in)        2203586
Weather_Condition         173459
Amenity         

In [74]:
# keep=False flags every member of a duplicated group, not only the repeats.
# All columns take part in the comparison.
# NOTE(review): that includes ID; if ID is a unique key this check can never
# find anything — confirm whether ID should be excluded.
dup_mask = df.duplicated(keep=False)
duplicates = df.loc[dup_mask]
print(f"Duplicates found: {len(duplicates)}")


Duplicates found: 0


In [75]:
# Numeric weather readings: fill each column's gaps with that column's own mean.
mean_filled_cols = ['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)']
column_means = {name: df[name].mean() for name in mean_filled_cols}
df[mean_filled_cols] = df[mean_filled_cols].fillna(value=column_means)

In [76]:
# Text/categorical columns: fill gaps with each column's most frequent value.
# mode() can return several values on a tie; [0] takes the first one.
mode_filled_cols = ['Street', 'City', 'Zipcode', 'Timezone', 'Airport_Code', 'Wind_Direction', 'Weather_Condition']
most_common = {name: df[name].mode()[0] for name in mode_filled_cols}
df[mode_filled_cols] = df[mode_filled_cols].fillna(value=most_common)

In [77]:
# Where no weather timestamp was recorded, fall back to the row's Start_Time.
weather_ts = df['Weather_Timestamp']
df['Weather_Timestamp'] = weather_ts.where(weather_ts.notna(), df['Start_Time'])

In [78]:
# Re-check the per-column null counts after the mean/mode/timestamp fills.
print(df.isna().sum())

ID                             0
Source                         0
Severity                       0
Start_Time                     0
End_Time                       0
Start_Lat                      0
Start_Lng                      0
End_Lat                  3402762
End_Lng                  3402762
Distance(mi)                   0
Description                    5
Street                         0
City                           0
County                         0
State                          0
Zipcode                        0
Country                        0
Timezone                       0
Airport_Code                   0
Weather_Timestamp              0
Temperature(F)                 0
Wind_Chill(F)            1999019
Humidity(%)                    0
Pressure(in)                   0
Visibility(mi)                 0
Wind_Direction                 0
Wind_Speed(mph)                0
Precipitation(in)        2203586
Weather_Condition              0
Amenity                        0
Bump      

In [79]:
# Day/night indicator columns: fill gaps with each column's most frequent value.
# mode() can return several values on a tie; [0] takes the first one.
twilight_cols = ['Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
twilight_modes = {name: df[name].mode()[0] for name in twilight_cols}
df[twilight_cols] = df[twilight_cols].fillna(value=twilight_modes)


In [80]:
# Missing free-text descriptions get an explicit placeholder instead of NaN.
df['Description'] = df['Description'].mask(df['Description'].isna(), 'Unknown')


In [81]:
# Per-column null counts after the twilight and Description fills.
print(df.isna().sum())

ID                             0
Source                         0
Severity                       0
Start_Time                     0
End_Time                       0
Start_Lat                      0
Start_Lng                      0
End_Lat                  3402762
End_Lng                  3402762
Distance(mi)                   0
Description                    0
Street                         0
City                           0
County                         0
State                          0
Zipcode                        0
Country                        0
Timezone                       0
Airport_Code                   0
Weather_Timestamp              0
Temperature(F)                 0
Wind_Chill(F)            1999019
Humidity(%)                    0
Pressure(in)                   0
Visibility(mi)                 0
Wind_Direction                 0
Wind_Speed(mph)                0
Precipitation(in)        2203586
Weather_Condition              0
Amenity                        0
Bump      

In [82]:
# These columns are dropped rather than imputed: the preceding null report
# shows each still has roughly 2.0-3.4 million missing values.
sparse_columns = ['End_Lat', 'End_Lng', 'Wind_Chill(F)', 'Precipitation(in)']
df = df.drop(columns=sparse_columns)

In [83]:
# Final per-column null counts once the sparse columns are gone.
print(df.isna().sum())

ID                       0
Source                   0
Severity                 0
Start_Time               0
End_Time                 0
Start_Lat                0
Start_Lng                0
Distance(mi)             0
Description              0
Street                   0
City                     0
County                   0
State                    0
Zipcode                  0
Country                  0
Timezone                 0
Airport_Code             0
Weather_Timestamp        0
Temperature(F)           0
Humidity(%)              0
Pressure(in)             0
Visibility(mi)           0
Wind_Direction           0
Wind_Speed(mph)          0
Weather_Condition        0
Amenity                  0
Bump                     0
Crossing                 0
Give_Way                 0
Junction                 0
No_Exit                  0
Railway                  0
Roundabout               0
Station                  0
Stop                     0
Traffic_Calming          0
Traffic_Signal           0
T