# DATA OVERVIEW

In [3]:
import pandas as pd
import plotly.express as px

In [4]:
df=pd.read_csv('saudi-arabia-traffic-accidents.csv')
df.head()

Unnamed: 0,Year,Region,Indicator,Value,latitude,longitude
0,(2016),Makkah,No. of Casualties - Injured,12383n,21.628963,41.26369
1,(2016),Eastern Region,No. of Casualties - Dead,1113n,23.288331,50.14783
2,(2016),Tabouk,No. of Casualties - Dead,434n,27.901655,37.259858
3,(2016),Hail,No. of Casualties - Dead,299n,27.400846,41.440696
4,(2016),Hail,No. of Accidents,10199n,27.400846,41.440696


In [5]:
df.dtypes

Unnamed: 0,0
Year,object
Region,object
Indicator,object
Value,object
latitude,float64
longitude,float64


In [6]:
df['Value'] = df['Value'].str.replace('n', '').astype(int)
df['Year'] = df['Year'].str.replace(r'[\(\)]', '', regex=True).astype(int)

In [7]:
df.shape

(84, 6)

In [8]:
df.isnull().sum()

Unnamed: 0,0
Year,0
Region,0
Indicator,0
Value,0
latitude,6
longitude,6


# DATA CLEANING

In [9]:
df['latitude'].fillna(df['latitude'].median(), inplace=True)
df['longitude'].fillna(df['longitude'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['latitude'].fillna(df['latitude'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['longitude'].fillna(df['longitude'].median(), inplace=True)


In [10]:
df.isnull().sum()

Unnamed: 0,0
Year,0
Region,0
Indicator,0
Value,0
latitude,0
longitude,0


# EDA

In [11]:
# Count totaal in region coloumn
total_count = df[df['Region'] == 'Total'].shape[0]
print(f"Number of rows with 'Total' in Region: {total_count}")


Number of rows with 'Total' in Region: 6


In [12]:
df = df[df['Region'] != 'Total']

In [13]:
df.shape

(78, 6)

In [14]:
# Number of Accidents per Region
accidents = df[df['Indicator'] == 'No. of Accidents']


fig = px.bar(accidents, x='Region', y='Value',
             title='Number of Accidents per Region',
             labels={'Value': 'Number of Accidents'},
             color='Region')
fig.show()

In [15]:
# Number of Injuries per Region
injuries = df[df['Indicator'] == 'No. of Casualties - Injured']

fig = px.bar(injuries, x='Region', y='Value',
             title='Number of Injuries per Region',
             labels={'Value': 'Number of Injuries'},
             color='Region')
fig.show()


In [16]:
# Geographic Distribution of Accidents
accidents_df = df[df['Indicator'] == 'No. of Accidents']
fig = px.scatter_mapbox(accidents_df,
                        lat='latitude', lon='longitude',
                        size='Value', hover_name='Region',
                        color='Value', size_max=50, zoom=4,
                        title='Geographic Distribution of Accidents',
                        color_continuous_scale='OrRd')
fig.update_layout(mapbox_style='carto-positron', margin={'r':0,'t':40,'l':0,'b':0})
fig.show()

In [17]:
# Proportion of Accidents by Region
fig = px.pie(accidents_df, names='Region', values='Value',
             title='Proportion of Accidents by Region')
fig.show()


In [18]:
# Accidents by Location
fig = px.scatter(accidents_df, x='longitude', y='latitude',
                 size='Value', color='Region',
                 title='Accidents by Location (Bubble Size = Number of Accidents)',
                 labels={'Value': 'Number of Accidents', 'longitude': 'Longitude', 'latitude': 'Latitude'},
                 hover_name='Region')
fig.show()

In [19]:
fig = px.pie(df, names='Indicator', title='Distribution of Indicator Types')
fig.show()

In [20]:
# Total Accidents per Year
df['Value'] = df['Value'].replace('n', '').astype(float)
yearly_accidents = df[df['Indicator'] == 'No. of Accidents'].groupby('Year')['Value'].sum().reset_index()

fig = px.bar(yearly_accidents, x='Year', y='Value',
             title='Total Accidents per Year',
             labels={'Value': 'Total Accidents'})
fig.show()


In [21]:
# Accidents, Injuries, and Deaths by Region
df['Value'] = df['Value'].replace('n', '').astype(float)
fig = px.bar(df, x='Region', y='Value', color='Indicator',
             barmode='group',
             title='Accidents, Injuries, and Deaths by Region')
fig.show()

In [22]:
# Deaths Distribution by Location
deaths_df = df[df['Indicator'] == 'No. of Casualties - Dead']
deaths_df['Value'] = deaths_df['Value'].replace('n', '').astype(float)

fig = px.scatter_mapbox(deaths_df, lat='latitude', lon='longitude',
                        size='Value', color='Value',
                        hover_name='Region', zoom=4,
                        title='Deaths Distribution by Location',
                        color_continuous_scale='Reds')
fig.update_layout(mapbox_style='carto-positron', margin={'r':0,'t':30,'l':0,'b':0})
fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Outliers

In [23]:

fig = px.box(df, x='Indicator', y='Value',
             title='Outlier Detection in Value by Indicator',
             labels={'Value': 'Value', 'Indicator': 'Indicator Type'})
fig.show()

In [24]:
outliers_count = {}

for indicator in df['Indicator'].unique():
    values = df[df['Indicator'] == indicator]['Value']
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = values[(values < lower_bound) | (values > upper_bound)]
    outliers_count[indicator] = len(outliers)


for k, v in outliers_count.items():
    print(f"{k}: {v} outliers")


No. of Casualties - Injured: 2 outliers
No. of Casualties - Dead: 1 outliers
No. of Accidents: 6 outliers
