In [3]:
# modules 
import pandas as pd
import numpy as np

# The raw data files and their format

In [4]:
# Load both pipe-delimited CSV files, using the "state" column as the index.
# The accident file is read with skiprows=9, so its first nine lines are skipped.
accident_data = pd.read_csv(
    "../input/accidentdata/road-accidents.csv",
    sep="|",
    index_col="state",
    skiprows=9,
)
miles_data = pd.read_csv(
    '../input/accidentdata/miles-driven.csv',
    sep="|",
    index_col="state",
)


# Read in and get an overview of the data

In [5]:
# Preview the first five rows of the accident table
accident_data.head(5)


In [6]:
# Preview the first five rows of the miles-driven table
miles_data.head(5)

In [7]:
# Attach the miles-driven columns to the accident table; both frames are
# indexed by state, so the (default) left join aligns rows on that index.
global_data = accident_data.join(miles_data, how="left")
global_data.head(5)

**setup**

In [8]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


# Create a textual and a graphical summary of the data

#   - bar charts

In [9]:
# Textual summary: descriptive statistics of the accident table's columns
accident_data.describe()

In [10]:
# Horizontal bar chart of drvr_fatl_col_bmiles, one bar per state
fig, ax = plt.subplots(figsize=(16, 10))
sns.barplot(y=accident_data.index, x=accident_data['drvr_fatl_col_bmiles'], ax=ax)
ax.set_title("drvr_fatl_col_bmiles")

# States run along the vertical axis
ax.set_ylabel("state")
# The plotted measure runs along the horizontal axis
ax.set_xlabel("drvr_fatl_col_bmiles")

In [11]:
# Horizontal bar chart of perc_fatl_speed, one bar per state
fig, ax = plt.subplots(figsize=(16, 10))
sns.barplot(y=accident_data.index, x=accident_data['perc_fatl_speed'], ax=ax)
ax.set_title("perc_fatl_speed")

# States run along the vertical axis
ax.set_ylabel("state")
# The plotted measure runs along the horizontal axis
ax.set_xlabel("perc_fatl_speed")

In [12]:
# Horizontal bar chart of perc_fatl_alcohol, one bar per state
fig, ax = plt.subplots(figsize=(16, 10))
sns.barplot(y=accident_data.index, x=accident_data['perc_fatl_alcohol'], ax=ax)
ax.set_title("perc_fatl_alcohol")

# States run along the vertical axis
ax.set_ylabel("state")
# The plotted measure runs along the horizontal axis
ax.set_xlabel("perc_fatl_alcohol")

In [13]:
# Horizontal bar chart of perc_fatl_1st_time, one bar per state
fig, ax = plt.subplots(figsize=(16, 10))
sns.barplot(y=accident_data.index, x=accident_data['perc_fatl_1st_time'], ax=ax)
ax.set_title("perc_fatl_1st_time")

# States run along the vertical axis
ax.set_ylabel("state")
# The plotted measure runs along the horizontal axis
ax.set_xlabel("perc_fatl_1st_time")

#      - scatter plots

In [14]:
# perc_fatl_speed (x) against drvr_fatl_col_bmiles (y)
sns.scatterplot(data=accident_data, x='perc_fatl_speed', y='drvr_fatl_col_bmiles')

In [15]:
# perc_fatl_alcohol (x) against drvr_fatl_col_bmiles (y)
sns.scatterplot(data=accident_data, x='perc_fatl_alcohol', y='drvr_fatl_col_bmiles')

In [16]:
# perc_fatl_1st_time (x) against drvr_fatl_col_bmiles (y)
sns.scatterplot(data=accident_data, x='perc_fatl_1st_time', y='drvr_fatl_col_bmiles')

# The Pearson correlation coefficient

In [17]:
# Pairwise Pearson correlation matrix of the accident table's columns
accident_data.corr(method="pearson")

* The correlation between `drvr_fatl_col_bmiles` and `perc_fatl_speed` is: weak
* The correlation between `drvr_fatl_col_bmiles` and `perc_fatl_alcohol` is: weak
* The correlation between `drvr_fatl_col_bmiles` and `perc_fatl_1st_time` is: weak
* The correlation between `perc_fatl_speed` and `perc_fatl_alcohol` is: medium
* The correlation between `perc_fatl_speed` and `perc_fatl_1st_time` is: weak
* The correlation between `perc_fatl_alcohol` and `perc_fatl_1st_time` is: medium
    


# Fit a multivariate linear regression

In [20]:
from sklearn.linear_model import LinearRegression

# Predictors are columns 1-3 of accident_data; the target is column 0.
X = accident_data.iloc[:, 1:4]
Y = accident_data.iloc[:, 0]

# fit() returns the estimator itself, so construction and fitting can be chained.
linear_regression = LinearRegression().fit(X, Y)

print(X.columns)
print(linear_regression.coef_)
print("Intercept : ", linear_regression.intercept_)


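After fitting this multivariate regression, the relation found between the variables is:
* $Y = -0.04180041\,X_1 + 0.19086404\,X_2 + 0.02473301\,X_3 + 9.06498048$

where $Y$ is `drvr_fatl_col_bmiles` and $X_1$, $X_2$, $X_3$ are `perc_fatl_speed`, `perc_fatl_alcohol` and `perc_fatl_1st_time` respectively.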


## Just testing a linear regression with one variable

In [19]:
import matplotlib.pyplot as plt
from scipy import stats

# Single-predictor regression: column 2 of accident_data is the predictor (x),
# column 1 is the response (y).
x = accident_data.iloc[:, 2]
y = accident_data.iloc[:, 1]

slope, intercept, r, p, std_err = stats.linregress(x, y)
print(slope, intercept, r, p, std_err)


def myfunc(x):
    """Return the value of the fitted line at x, using the slope and intercept computed above."""
    return intercept + slope * x


# Fitted value for every observed x
mymodel = [myfunc(value) for value in x]

# Observations as points, fitted line drawn on top
plt.scatter(x, y)
plt.plot(x, mymodel)
plt.show()

# Perform PCA on standardized data

In [21]:
###Perform PCA on standardized data

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [22]:
# Standardize the three feature columns (positions 1-3 of global_data)
scl = StandardScaler().fit(global_data.iloc[:, 1:4])
scaled_data = scl.transform(global_data.iloc[:, 1:4])

# Wrap the scaled array in a DataFrame that keeps the state index
scaled = pd.DataFrame(
    scaled_data,
    index=global_data.index,
    columns=['perc_fatl_speed', 'perc_fatl_alcohol', 'perc_fatl_1st_time'],
)
print(scaled)

In [23]:
# Fit a 3-component PCA on the standardized features and project the data onto it.
principal_features = PCA(n_components=3)
principal_features.fit(scaled_data)
pca_data = principal_features.transform(scaled_data)

# The transformed columns are principal components (combinations of the
# original features), so they are labelled PC1..PC3 rather than reusing the
# original feature names, which would misrepresent what each column holds.
pca = pd.DataFrame(pca_data, columns=['PC1', 'PC2', 'PC3'], index=global_data.index)
print(pca)
print(pca.shape)

In [24]:
# Show the fitted PCA's components_ array (per scikit-learn, one row per
# principal component, one column per input feature).
principal_features.components_


# Visualize the first two principal components

In [25]:
# Visualize the first two principal components (columns 0 and 1 of pca_data).
plt.figure(figsize=(14,8))
# No `cmap` here: without a `c` argument matplotlib ignores the colormap and
# emits a warning, so passing it had no effect.
plt.scatter(pca_data[:,0], pca_data[:,1])
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.show()

In [26]:
# 3-D scatter of all three principal components.
from mpl_toolkits.mplot3d import Axes3D  # kept: registers the '3d' projection on older matplotlib versions
fig = plt.figure(figsize=(16,8))
# add_subplot already attaches the axes to the figure, so no separate
# fig.add_axes(axis) call is needed.
axis = fig.add_subplot(111, projection='3d')
# `cmap` is dropped: without a `c` argument it is ignored.
axis.scatter(pca_data[:,0], pca_data[:,1], pca_data[:,2])
axis.set_xlabel("Principal component 1")
axis.set_ylabel("Principal component 2")
axis.set_zlabel("Principal component 3")

# The original `plt.show` lacked parentheses, so the function was never called.
plt.show()

# Find clusters of similar states in the data

In [27]:
# Find clusters of similar states in the standardized data (first attempt: 2 clusters).
from sklearn.cluster import KMeans

# A fixed random_state makes the centroid initialisation, and therefore the
# cluster assignment and the colours in the plot, reproducible across runs.
modl = KMeans(n_clusters=2, random_state=42)
modl.fit(scaled_data)
result = modl.predict(scaled_data)

plt.figure(figsize=(16,8))
# Points: first two standardized features, coloured by assigned cluster
plt.scatter(scaled_data[:,0], scaled_data[:,1], c=result, s=50, cmap='viridis')
# Cluster centres, drawn in the same two feature dimensions
centers = modl.cluster_centers_
plt.scatter(centers[:,0], centers[:,1], c='black', s=200, alpha=0.5)
plt.xlabel("perc_fatl_speed (standardized)")
plt.ylabel("perc_fatl_alcohol (standardized)")
plt.show()

In [28]:
# Elbow method: fit KMeans for k = 1..9 and record each model's inertia.
distortions = []
for i in range(1, 10):
    # Same random_state for every k so the inertia curve is reproducible
    # and differences between k values are not due to random initialisation.
    mod = KMeans(n_clusters=i, random_state=42)
    mod.fit(scaled_data)
    distortions.append(mod.inertia_)

print(distortions)

In [29]:
# Elbow plot: recorded inertia against the number of clusters tried (1 through 9)
K = range(1, 10)
plt.figure(figsize=(19, 8))
plt.plot(K, distortions)


**We can see that from K=3 onward the distortion keeps decreasing in an almost linear way, so the elbow is at K=3.**

> The optimal number of clusters is 3.

# KMeans to visualize clusters in the PCA scatter plot

In [30]:
from sklearn.cluster import KMeans

# Cluster the standardized data into 3 groups. A fixed random_state keeps the
# labels in `marks` reproducible for the cells that reuse them.
clusters = KMeans(n_clusters=3, random_state=42)
clusters.fit(scaled_data)
marks = clusters.predict(scaled_data)

plt.figure(figsize=(19,8))
# Points: standardized feature 0 against feature 2, coloured by cluster label
plt.scatter(scaled_data[:,0], scaled_data[:,2], c=marks, s=50, cmap='viridis')

# Centres must be drawn in the SAME two dimensions as the points (0 and 2).
# The original used columns 0 and 1, which put the centres in a different
# plane from the data.
cens = clusters.cluster_centers_
plt.scatter(cens[:,0], cens[:,2], c='Red', s=200, alpha=0.5)
plt.xlabel("perc_fatl_speed (standardized)")
plt.ylabel("perc_fatl_1st_time (standardized)")
plt.show()

# Visualize the feature differences between the clusters

In [32]:
# Visualize the feature differences between the clusters:
# 3-D scatter of columns 1-3 of global_data, coloured by cluster label.
from mpl_toolkits.mplot3d import Axes3D  # kept: registers the '3d' projection on older matplotlib versions
fig = plt.figure(figsize=(19,8))
ax = plt.axes(projection='3d')
col = global_data.columns
ax.scatter3D(global_data.iloc[:,1], global_data.iloc[:,2], global_data.iloc[:,3], c=marks, cmap='viridis')
# Each axis is labelled with the column actually plotted on it (the original
# labelled all three axes with col[1]).
ax.set_xlabel(col[1])
ax.set_ylabel(col[2])
ax.set_zlabel(col[3])
plt.title('Visualize the feature differences between the clusters')
plt.show()

# Compute the number of accidents within each cluster

In [36]:
# Compute the number of accidents within each cluster.
# Rebuild the combined table from the two source frames (left join on the
# shared state index) before adding any derived columns.
global_data = accident_data.join(miles_data, how="left")
global_data.head(5)

In [39]:
# Label every row of the standardized data with its cluster and store the
# labels in a new 'clusters' column of the combined table.
marks = clusters.predict(scaled_data)
global_data = global_data.assign(clusters=marks)
global_data.tail(5)

In [40]:
import seaborn as sns

plt.figure(figsize=(18,7))
# Judging by the column names, drvr_fatl_col_bmiles is a rate per BILLION miles
# while million_miles_annually is in MILLIONS of miles. Converting millions to
# billions means dividing by 1000; the original divided by 10000, which
# understated every total by a factor of ten.
global_data['total accident'] = (
    global_data['drvr_fatl_col_bmiles'] * global_data['million_miles_annually'] / 1000
)

# Distribution of the estimated totals within each cluster
used_data = global_data.loc[:, ['total accident', 'clusters']]
sns.violinplot(data=used_data, x='clusters', y='total accident')
plt.show()