In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import os

# Location of the accident dataset. The default is the original local path;
# set the ACCIDENT_CSV environment variable to point somewhere else so the
# notebook can run on another machine without editing this cell.
ACCIDENT_CSV = os.environ.get('ACCIDENT_CSV', '/Users/sudin.giri/Downloads/accident.csv')
data = pd.read_csv(ACCIDENT_CSV)

In [3]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              200 non-null    int64  
 1   Gender           199 non-null    object 
 2   Speed_of_Impact  197 non-null    float64
 3   Helmet_Used      200 non-null    object 
 4   Seatbelt_Used    200 non-null    object 
 5   Survived         200 non-null    int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 9.5+ KB
None


In [5]:
# Count the missing (NaN) entries in each column.
data.isna().sum()


Age                0
Gender             1
Speed_of_Impact    3
Helmet_Used        0
Seatbelt_Used      0
Survived           0
dtype: int64

In [8]:
# Count the rows that pandas flags as duplicates of another row.
data.duplicated().sum()

np.int64(0)

This means our data has a few NaN values but no duplicated rows. Since only a handful of rows contain NaN values, we will simply drop those rows.

In [10]:
# Discard every row that has at least one missing value ...
data = data.dropna(axis=0, how='any')

# ... then confirm that no column has NaN entries left.
remaining_nulls = data.isna().sum()
print(remaining_nulls)

Age                0
Gender             0
Speed_of_Impact    0
Helmet_Used        0
Seatbelt_Used      0
Survived           0
dtype: int64


In [11]:
# Screen the Speed_of_Impact column for outliers with the 1.5 * IQR rule.
speed = data["Speed_of_Impact"]

Q1 = speed.quantile(0.25)  # lower quartile
Q3 = speed.quantile(0.75)  # upper quartile
IQR = Q3 - Q1              # spread of the middle half of the values

# Anything further than 1.5 IQRs outside the quartiles counts as an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

too_low = speed < lower_bound
too_high = speed > upper_bound
outliers = data[too_low | too_high]
print(outliers)

Empty DataFrame
Columns: [Age, Gender, Speed_of_Impact, Helmet_Used, Seatbelt_Used, Survived]
Index: []


In [13]:
# Display the column labels of the frame.
data.columns

Index(['Age', 'Gender', 'Speed_of_Impact', 'Helmet_Used', 'Seatbelt_Used',
       'Survived'],
      dtype='object')

In [14]:
#### Encode the categorical columns as integers so the model can consume them.

# Integer code for every label we expect to see in each categorical column.
CATEGORY_CODES = {
    'Gender': {'Male': 0, 'Female': 1},
    'Helmet_Used': {'Yes': 1, 'No': 0},
    'Seatbelt_Used': {'Yes': 1, 'No': 0},
}

# Work on an independent frame so the column assignments below do not write
# into an object derived from an earlier selection (presumably what would
# trigger pandas' SettingWithCopyWarning on some versions -- confirm).
data = data.copy()

for column, codes in CATEGORY_CODES.items():
    # Re-running this cell must be harmless: Series.map() turns every value
    # that is not a key of the dict into NaN, so mapping an already-encoded
    # 0/1 column a second time would wipe it out.
    already_encoded = data[column].isin(list(codes.values())).all()
    if already_encoded:
        continue

    encoded = data[column].map(codes)

    # Fail loudly on unexpected labels (typos, different casing, ...) instead
    # of silently carrying NaN into the model.
    unmapped = data.loc[encoded.isna(), column].unique()
    if len(unmapped):
        raise ValueError(f"Unexpected values in {column!r}: {list(unmapped)}")

    data[column] = encoded

In [15]:
pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
from sklearn.model_selection import train_test_split

In [20]:
# Separate the label column from the predictor columns.
TARGET = 'Survived'
y = data[TARGET]
X = data.drop(columns=[TARGET])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression is used because the column we predict is binary
# (Yes/No, which can be encoded as 1/0).
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [23]:
# Predict a label for every row of the held-out test features.
y_predict = model.predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score, classification_report

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print(f"Accuracy Rate : {accuracy * 100}%")

# Per-class precision, recall and F1 for the same predictions.
print("Classification Report:")
report = classification_report(y_true=y_test, y_pred=y_predict)
print(report)

Accuracy Rate : 50.0%
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.32      0.44        25
           1       0.41      0.80      0.55        15

    accuracy                           0.50        40
   macro avg       0.57      0.56      0.49        40
weighted avg       0.61      0.50      0.48        40



CONCLUSION:
This was my first ever hands-on project with Machine Learning using Logistic Regression. The accuracy rate is 50%, which means the model got half of the test predictions right and half of them wrong. Since 25 of the 40 test rows belong to class 0, always predicting 0 would already score 62.5%, so the model still needs improvement.
I will keep learning to improve my ML skills and broaden my knowledge of other models too.