# US ARRESTS DATASET

## Modules

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbs
import scipy.stats as sts

import warnings
warnings.filterwarnings('ignore')

## Initializing the dataframe

In [7]:
df = pd.read_excel('USArrests.xlsx')
df.head()

Unnamed: 0.1,Unnamed: 0,Murder,Assault,UrbanPop,Fraud
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
4,California,9.0,276,91,40.6


### Changing the column names

In [10]:
df.columns
df.rename(columns={'Unnamed: 0':'State'}, inplace=True)
df.head()

Unnamed: 0,State,Murder,Assault,UrbanPop,Fraud
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
4,California,9.0,276,91,40.6


### Analyzing the Five Points Summary

In [12]:
df.describe()

Unnamed: 0,Murder,Assault,UrbanPop,Fraud
count,50.0,50.0,50.0,50.0
mean,7.788,170.76,663.84,21.232
std,4.35551,83.337661,4233.45191,9.366385
min,0.8,45.0,32.0,7.3
25%,4.075,109.0,54.5,15.075
50%,7.25,159.0,66.0,20.1
75%,11.25,249.0,77.75,26.175
max,17.4,337.0,30000.0,46.0


### Measures of Shape and Position for 'Assault' & 'Fraud' columns

In [27]:
# IQR
sts.iqr(df[['Assault', 'Fraud']], axis=0)

# Skewness
sts.skew(df[['Assault', 'Fraud']], axis=0)

# Kurtosis
sts.kurtosis(df[['Assault', 'Fraud']], axis=0, fisher=True)
sts.kurtosis(df[['Assault', 'Fraud']], axis=0, fisher=False)

array([1.93097995, 3.20189779])

### Z-score for 'Assault' column

In [23]:
sts.zscore(df['Assault'])

0     0.790787
1     1.118060
2     1.493817
3     0.233212
4     1.275635
5     0.402909
6    -0.736484
7     0.815030
8     1.990786
9     0.487757
10   -1.512241
11   -0.615272
12    0.948363
13   -0.700121
14   -1.391029
15   -0.675878
16   -0.748605
17    0.948363
18   -1.063757
19    1.566544
20   -0.263757
21    1.021090
22   -1.197090
23    1.069575
24    0.087757
25   -0.748605
26   -0.833454
27    0.984726
28   -1.378908
29   -0.142545
30    1.384726
31    1.008969
32    2.015028
33   -1.524362
34   -0.615272
35   -0.239515
36   -0.142545
37   -0.784969
38    0.039273
39    1.311999
40   -1.027393
41    0.208970
42    0.366545
43   -0.615272
44   -1.487999
45   -0.178909
46   -0.312242
47   -1.087999
48   -1.427393
49   -0.118303
Name: Assault, dtype: float64

### Measures of Variability for 'UrbanPop' & 'Assault' column

In [54]:
# Range
np.ptp(df[['Assault', 'UrbanPop']])

# Standard Deciation
df[['Assault', 'UrbanPop']].std()

# Variance
df[['Assault', 'UrbanPop']].var()

Assault     6.945166e+03
UrbanPop    1.792212e+07
dtype: float64

### Detect Outliers for 'UrbanPop' column using IQR

In [43]:
def outlier_finder(column):
    Q3 = np.quantile(df[column], 0.75)
    Q1 = np.quantile(df[column], 0.25)
    
    IQR = Q3 - Q1
    
    lower_limit = Q1 - 1.5*IQR
    upper_limit = Q3 + 1.5*IQR
    
    return df[ (df[column] < lower_limit) | (df[column] > upper_limit) ]

In [45]:
outlier_finder('Fraud')
outlier_finder('UrbanPop')

Unnamed: 0,State,Murder,Assault,UrbanPop,Fraud
20,Massachusetts,4.4,149,30000,16.3


### Detect Outliers for 'UrbanPop' column using Z-score

In [48]:
z = np.abs(sts.zscore(df['UrbanPop']))
np.where(z > 2)

(array([20]),)

In [49]:
df.iloc[20]

State       Massachusetts
Murder                4.4
Assault               149
UrbanPop            30000
Fraud                16.3
Name: 20, dtype: object