# Communities and Crimes Dataset

## Import Libraries

In [24]:
import numpy as np
import pandas as pd
import re
#for EDA
from ydata_profiling import ProfileReport

#for visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#for preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder, OneHotEncoder

#for model
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.naive_bayes import GaussianNB as nb
from sklearn.linear_model import LogisticRegression as lr
from sklearn.tree import DecisionTreeClassifier as dt 
from sklearn.svm import SVC as vc
from sklearn.ensemble import RandomForestClassifier as rcl
import sklearn.metrics as sm

## Loading the Dataset

### Importing data

In [25]:
Crime_data = pd.read_csv('data/communities.data')
Crime_data.head()

Unnamed: 0,8,?,?.1,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12.2,0.26.1,0.2.1,0.06.3,0.04.2,0.9.1,0.5.2,0.32.2,0.14.3,0.2.2
0,53,?,?,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,?,?,?,?,0.0,?,0.67
1,24,?,?,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,?,?,?,?,0.0,?,0.43
2,34,5,81440,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,?,?,?,?,0.0,?,0.12
3,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,?,?,?,?,0.0,?,0.03
4,6,?,?,SouthPasadenacity,1,0.02,0.28,0.06,0.54,1.0,...,0.01,0.58,0.1,?,?,?,?,0.0,?,0.14


As we can see, the data currently has no column name. We need to take them from other file called 'attributes.csv'.

In [26]:
col_names = pd.read_csv('data/attributes.csv', delim_whitespace=True)
col_names = col_names['attributes']
Crime_data = pd.read_csv('data/communities.data', names = col_names)
Crime_data.head()

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8,?,?,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,?,?,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,?,?,?,?,0.0,?,0.67
2,24,?,?,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,?,?,?,?,0.0,?,0.43
3,34,5,81440,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,?,?,?,?,0.0,?,0.12
4,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,?,?,?,?,0.0,?,0.03


In [27]:
Crime_data.shape

(1994, 128)

In [28]:
Crime_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
state,1994.0,28.683551,16.397553,1.0,12.00,34.00,42.00,56.0
fold,1994.0,5.493982,2.873694,1.0,3.00,5.00,8.00,10.0
population,1994.0,0.057593,0.126906,0.0,0.01,0.02,0.05,1.0
householdsize,1994.0,0.463395,0.163717,0.0,0.35,0.44,0.54,1.0
racepctblack,1994.0,0.179629,0.253442,0.0,0.02,0.06,0.23,1.0
...,...,...,...,...,...,...,...,...
LandArea,1994.0,0.065231,0.109459,0.0,0.02,0.04,0.07,1.0
PopDens,1994.0,0.232854,0.203092,0.0,0.10,0.17,0.28,1.0
PctUsePubTrans,1994.0,0.161685,0.229055,0.0,0.02,0.07,0.19,1.0
LemasPctOfficDrugUn,1994.0,0.094052,0.240328,0.0,0.00,0.00,0.00,1.0


In [29]:
Crime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994 entries, 0 to 1993
Columns: 128 entries, state to ViolentCrimesPerPop
dtypes: float64(100), int64(2), object(26)
memory usage: 1.9+ MB


In [30]:
Crime_data.duplicated().sum()

0

In [31]:
Crime_data.isna().sum()

state                  0
county                 0
community              0
communityname          0
fold                   0
                      ..
LemasPctPolicOnPatr    0
LemasGangUnitDeploy    0
LemasPctOfficDrugUn    0
PolicBudgPerPop        0
ViolentCrimesPerPop    0
Length: 128, dtype: int64

It can be observed that there are no columns containing NaN values. However, this does not prove that the dataset is free from data missings, and it is very possible that the dataset uses some specical characters to represent missing values. We will attempt to explore this by extracting unique values from a categorical column.

In [32]:
Crime_data['PolicCars'].unique()

array(['0.06', '?', '0.09', '1', '0.04', '0.05', '0.03', '0.41', '0.07',
       '0.25', '0', '0.02', '0.18', '0.21', '0.1', '0.01', '0.22', '0.11',
       '0.55', '0.08', '0.47', '0.16', '0.29', '0.14', '0.43', '0.15',
       '0.28', '0.2', '0.62', '0.36', '0.37', '0.67', '0.12', '0.27',
       '0.3', '0.58', '0.35', '0.71', '0.23', '0.17', '0.26', '0.82',
       '0.51', '0.92', '0.24', '0.13', '0.32', '0.19', '0.49', '0.38',
       '0.61', '0.65', '0.73', '0.89', '0.72', '0.64', '0.31', '0.68',
       '0.4', '0.44', '0.98', '0.53', '0.69', '0.45'], dtype=object)

From the above result, we see that there is an unique value '?' in the dataset and it definitely represents a missing value. So we will replace all of '?' with NaN values for the ease of further analysis.

In [33]:
print('Before:')
print(Crime_data.isna().sum())
Crime_data = Crime_data.replace(to_replace='?', value=np.nan)
print('After:')
Crime_data.isna().sum()

Before:
state                  0
county                 0
community              0
communityname          0
fold                   0
                      ..
LemasPctPolicOnPatr    0
LemasGangUnitDeploy    0
LemasPctOfficDrugUn    0
PolicBudgPerPop        0
ViolentCrimesPerPop    0
Length: 128, dtype: int64
After:


state                     0
county                 1174
community              1177
communityname             0
fold                      0
                       ... 
LemasPctPolicOnPatr    1675
LemasGangUnitDeploy    1675
LemasPctOfficDrugUn       0
PolicBudgPerPop        1675
ViolentCrimesPerPop       0
Length: 128, dtype: int64

### Data Dictionary

1. Categorical Attributes

In [34]:
categorical_columns = ['state', 'county', 'community', 'communityname']

2. Numerical Attributes

In [35]:
numerical_columns = list(set(Crime_data.columns) - set(categorical_columns))
numerical_columns.remove('ViolentCrimesPerPop')
numerical_columns

['PctPopUnderPov',
 'OwnOccLowQuart',
 'PctVacantBoarded',
 'HousVacant',
 'PctPolicMinor',
 'NumInShelters',
 'LemasTotReqPerPop',
 'PctYoungKids2Par',
 'PctSameCity85',
 'PctOccupManu',
 'PctImmigRec5',
 'PctIlleg',
 'racePctAsian',
 'PctNotSpeakEnglWell',
 'pctWPubAsst',
 'PctLess9thGrade',
 'MedRent',
 'agePct12t21',
 'pctWFarmSelf',
 'LemasGangUnitDeploy',
 'PersPerOccupHous',
 'PctUsePubTrans',
 'pctWWage',
 'population',
 'PctRecentImmig',
 'pctWRetire',
 'MalePctDivorce',
 'PolicBudgPerPop',
 'NumUnderPov',
 'PctPolicAsian',
 'PctNotHSGrad',
 'PctTeen2Par',
 'PctPolicHisp',
 'whitePerCap',
 'PctRecImmig10',
 'numbUrban',
 'OtherPerCap',
 'perCapInc',
 'PctKids2Par',
 'fold',
 'pctUrban',
 'agePct12t29',
 'PctHousNoPhone',
 'MedYrHousBuilt',
 'householdsize',
 'PctWorkMomYoungKids',
 'PctHousLess3BR',
 'OwnOccHiQuart',
 'PctOccupMgmtProf',
 'agePct65up',
 'NumKindsDrugsSeiz',
 'racePctHisp',
 'racepctblack',
 'PctForeignBorn',
 'medIncome',
 'PctImmigRec8',
 'NumImmig',
 'PersPe

## Data Exploration and Visualization

In [36]:
profile = ProfileReport(Crime_data, title = "Pandas Profiling Report for Communities and Crime dataset")
profile.to_file(output_file="Crime_profiling.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicates_key)
  .reset_index(name=duplicat

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

MemoryError: 

In [None]:
profile