# Day -01


(March 26 - Day 1)
Start by:
✅ Loading and exploring the dataset.
✅ Checking for missing values, duplicates, and incorrect formats.
✅ Cleaning and preprocessing data for consistency.

In [2]:
import pandas as pd

In [3]:
# Load dataset (update the file path if needed)
file_path = "master-data-year-and-broad-cause-wise-trend-of-consequential-train-accidents-on-indian-railways.xlsx.xlsx"
df = pd.read_excel(file_path)

In [9]:
display(df.head())

Unnamed: 0,fiscal_year,state,cause\n,value,units\n,note
0,2021-22,All India,Failure of Equipment,5,value in absolute number,
1,2021-22,All India,Failure of other than Railway staff,4,value in absolute number,
2,2021-22,All India,Failure of Railway Staff,20,value in absolute number,
3,2021-22,All India,Incidental,3,value in absolute number,
4,2021-22,All India,Sabotage,1,value in absolute number,


In [5]:
# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

Missing values per column:
fiscal_year      0
state            0
cause\n          0
value            0
units\n          0
note           131
dtype: int64


In [6]:
# Drop duplicates
df.drop_duplicates(inplace=True)

In [7]:
# Convert fiscal year to datetime format (if applicable)
df['fiscal_year'] = df['fiscal_year'].astype(str)

In [10]:
# Standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [12]:
# Standardize categorical values (e.g., cause names)
df['cause'] = df['cause'].str.strip().str.lower().str.replace(' ', '_')

In [13]:
# Convert numerical columns to appropriate types (if needed)
df['value'] = pd.to_numeric(df['value'], errors='coerce')

In [14]:
# Save cleaned dataset
df.to_csv("cleaned_train_accident_data.csv", index=False)

print("Data cleaning complete. Saved as cleaned_train_accident_data.csv")

Data cleaning complete. Saved as cleaned_train_accident_data.csv


### Handle missing values


In [18]:
# Option 1: Fill missing numerical values with median
df.fillna(df.median(numeric_only=True), inplace=True)


In [19]:
# Verify data consistency
print("Unique accident causes:")
print(df['cause'].unique())

Unique accident causes:
['failure_of_equipment' 'failure_of_other_than_railway_staff'
 'failure_of_railway_staff' 'incidental' 'sabotage' 'under_investigation'
 'combination_of_factors'
 'could_not_be_established/none_held_responsible_failure_of_equipment\nfailure_of_other_than_railway_staff'
 'could_not_be_established/none_held_responsible'
 'failure_of_other_than_railway_staff_failure_of_railway_staff\nincidental\nsabotage'
 'incidental_sabotage\nunder_investigation\ncombination_of_factors'
 'combination_of_factors\ncould_not_be_established/none_held_responsible_failure_of_equipment'
 'under_investigation_combination_of_factors\ncould_not_be_established/none_held_responsible'
 'incidental_sabotage\nunder_investigation']


In [20]:
# Option 2: Fill missing categorical values with 'unknown'
df.fillna("unknown", inplace=True)

In [21]:
# Verify data consistency
print("Unique accident causes:")
print(df['cause'].unique())

Unique accident causes:
['failure_of_equipment' 'failure_of_other_than_railway_staff'
 'failure_of_railway_staff' 'incidental' 'sabotage' 'under_investigation'
 'combination_of_factors'
 'could_not_be_established/none_held_responsible_failure_of_equipment\nfailure_of_other_than_railway_staff'
 'could_not_be_established/none_held_responsible'
 'failure_of_other_than_railway_staff_failure_of_railway_staff\nincidental\nsabotage'
 'incidental_sabotage\nunder_investigation\ncombination_of_factors'
 'combination_of_factors\ncould_not_be_established/none_held_responsible_failure_of_equipment'
 'under_investigation_combination_of_factors\ncould_not_be_established/none_held_responsible'
 'incidental_sabotage\nunder_investigation']


In [22]:
# Summary statistics
print("Basic statistics:")
print(df.describe())

Basic statistics:
            value
count  131.000000
mean    17.641221
std     38.657484
min      0.000000
25%      1.000000
50%      5.000000
75%     13.500000
max    293.000000


In [23]:
# Display dataset information
print("Dataset Info:")
print(df.info())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   fiscal_year  131 non-null    object 
 1   state        131 non-null    object 
 2   cause        131 non-null    object 
 3   value        131 non-null    float64
 4   units        131 non-null    object 
 5   note         131 non-null    object 
dtypes: float64(1), object(5)
memory usage: 6.3+ KB
None


### 