### Define all packages and modules needed for the notebook.

In [1]:
import pandas as pd

### Read in the primary raw NCVS dataset.

In [2]:
file = '../DataCleaning/Raw Data/NCVS_dirty.csv'
df = pd.read_csv(file)

### Display sample of the raw NCVS dataset.
Notably, we see that the data has some NaN values and column values require the documentation to interpret.

In [3]:
df.shape

(60034, 23)

In [4]:
df

Unnamed: 0,year,weight,gender,race1r,hispanic,ethnic1r,ager,marital2,hincome,popsize,...,notify,weapon,weapcat,newcrime,newoff,seriousviolent,injury,treatment,vicservices,locationr
0,1993,2418.22376,1,1,2,1,2,1,5,,...,2,2,0,2,5,3,0,0.0,2.0,4
1,1993,2258.95421,1,1,2,1,5,2,3,,...,1,2,0,1,4,2,1,1.0,2.0,3
2,1993,2258.95421,1,1,2,1,5,2,3,,...,1,2,0,1,4,2,1,1.0,2.0,3
3,1993,2258.95421,1,1,2,1,5,2,3,,...,1,2,0,1,4,2,1,1.0,2.0,3
4,1993,2258.95421,1,1,2,1,5,2,3,,...,1,1,3,1,3,1,1,1.0,2.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60029,2019,1419.98739,1,1,2,1,7,2,7,0.0,...,2,2,0,1,4,2,0,0.0,2.0,3
60030,2019,3103.83207,1,1,2,1,7,2,5,1.0,...,2,2,0,1,1,1,0,0.0,2.0,3
60031,2019,1581.41881,2,3,2,3,5,2,1,2.0,...,1,2,0,1,4,2,0,0.0,1.0,1
60032,2019,1556.29342,1,1,2,1,8,2,6,0.0,...,1,3,5,1,4,2,0,0.0,2.0,3


### Dropping the following years:  1993, 1994, and 1995.
We go from a (60034,23) dataset to (47060, 23), losing approximately 13,000 rows.

In [5]:
df = df[df.year != 1993]
df = df[df.year != 1994]
df = df[df.year != 1995]
df.shape

(47060, 23)

### Identify all the values in each column.
We need to identify all the values in each column.
While there are some 'nan' values, they exist in columns that we are not observing. This will have no consequence on our results.
While there is documentation for '88' regarding the  hincome variable, it is not explicitly stated for 'hispanic' and 'marital2' but we will assume that it is also unknown. Either way, it will have no consequence on our results

In [6]:
# df.columns # Run this obtain column names in df

print(df["gender"].unique())
print(df["race1r"].unique())
print(df["hispanic"].unique())
# hispanic has 88
print(df["ethnic1r"].unique())
print(df["ager"].unique())
print(df["marital2"].unique())
# marital2 has 88
print(df["hincome"].unique())
# hincome has 88
print(df["popsize"].unique())
print(df["region"].unique())
print(df["msa"].unique())
print(df["direl"].unique())
print(df["notify"].unique())
print(df["weapon"].unique())
print(df["weapcat"].unique())
print(df["newcrime"].unique())
print(df["newoff"].unique())
print(df["seriousviolent"].unique())
print(df["injury"].unique())
print(df["treatment"].unique())
# Has nan values
print(df["vicservices"].unique())
# Has nan values
print(df["locationr"].unique())

[1 2]
[1 2 3]
[ 2  1 88]
[1 2 4 3]
[5 6 3 1 4 2 7 8]
[ 2  1  4  5  3 88]
[ 4  6  3  5 88  2  7  1]
[0. 1. 2. 3. 4. 5.]
[3 1 2 4]
[2 1 3]
[4 3 1 6 2 5]
[1 2 3 8]
[2 3 1]
[0 5 3 1 4 2]
[1 2]
[4 3 2 5 1]
[2 1 3]
[0 1]
[ 0.  2.  1. nan]
[ 2.  1. nan]
[5 2 3 4 1]


### Display a sample of edited code.

In [7]:
df.head()

Unnamed: 0,year,weight,gender,race1r,hispanic,ethnic1r,ager,marital2,hincome,popsize,...,notify,weapon,weapcat,newcrime,newoff,seriousviolent,injury,treatment,vicservices,locationr
12974,1996,3451.4374,1,1,2,1,5,2,4,0.0,...,1,2,0,1,4,2,0,0.0,2.0,5
12975,1996,2377.30718,2,1,2,1,6,2,4,0.0,...,2,2,0,1,4,2,0,0.0,2.0,2
12976,1996,2687.23144,1,1,2,1,5,2,4,1.0,...,1,3,5,1,4,2,0,0.0,2.0,3
12977,1996,2555.83758,2,1,2,1,5,2,6,2.0,...,2,2,0,1,4,2,0,0.0,2.0,3
12978,1996,3017.3611,1,1,2,1,5,1,3,1.0,...,2,2,0,1,4,2,0,0.0,2.0,3


### Define dictionaries to replace all values in the columns.

In [8]:
rep_ager = {1 : "12 to 14",
            2 : "15 to 17",
            3 : "18 to 20",
            4 : "21 to 24",
            5 : "25 to 34",
            6 : "35 to 49",
            7 : "50 to 64",
            8 : "65 or older"}

rep_newcrime = {1 : "Violent victimization",
                2 : "Personal theft/larceny"}

rep_hispanic = {1 : "Hispanic",
                2 : "Non-Hispanic"}

rep_hincome = {1 : "Less than $7,500",
               2 : "$7,500 to $14,999",
               3 : "$15,000 to $24,999",
               4 : "$25,000 to $34,999",
               5 : "$35,000 to $49,999",
               6 : "$50,000 to $74,999",
               7 : "$75,000 or more",
               88 : "Unknown"}

rep_injury = {0 : "Not Injured",
              1 : "Injured"}

rep_locationr = {1 : "At or near victim's home",
                 2 : "At or near friend, neighbor, or relative's home",
                 3 : "Commercial place, parking lot, or other public area",
                 4 : "School",
                 5 : "Other location"}

rep_msa = {1 : "Principal city within MSA",
           2 : "Not part of principal city within MSA",
           3 : "Outside MSA"}

rep_marital2 = {1 : "Never Married",
                2 : "Married",
                3 : "Widowed",
                4 : "Divorced",
                5 : "Separated"}

rep_treatment = {0 : "Not injured",
                 1 : "Not treated for injury",
                 2 : "Treated at scene, home, medical office, or other location"}

rep_popsize = {0 : "Not a place",
               1 : "Under 100,000",
               2 : "100,000 to 249,999",
               3 : "250,000 to 499,999",
               4 : "500,000 to 999,999",
               5 : "1 million or more"}

rep_weapon = {1 : "Yes, offender had weapon",
              2 : "No, offender did not have weapon",
              3 : "Do not know if offender had weapon"}

rep_race1r = {1 : "White",
              2 : "Black",
              3 : "Other"}

rep_ethnic1r = {1 : "Non-Hispanic white",
                2 : "Non-Hispanic black",
                3 : "Non-Hispanic other",
                4 : "Hispanic"}

rep_region = {1 : "Northeast",
              2 : "Midwest",
              3 : "South",
              4 : "West"}

rep_notify = {1 : "Yes, reported to the police",
              2 : "No, did not report to the police",
              3 : "Do not know"}

rep_gender = {1 : "Male",
              2 : "Female"}

rep_newoff = {1 : "Rape/sexual assault",
              2 : "Robbery",
              3 : "Aggravated assault",
              4 : "Simple assault",
              5 : "Personal theft"}

rep_vicservices = {1 : "Services received from victim service agencies",
                   2 : "No services received from victim service agencies"}

rep_direl = {1 : "Intimates",
             2 : "Other relatives",
             3 : "Well-known/casual acquaintances",
             4 : "Stranger",
             5 : "Do not know relationship",
             6 : "Do not know number of offenders"}

rep_seriousviolent = {1 : "Violent crime excluding simple assault",
                      2 : "Simple assault",
                      3 : "Personal theft",
                      4 : "Property crime"}

rep_weapcat = {0 : "No weapon",
               1 : "Firearm",
               2 : "Knife",
               3 : "Other type weapon",
               4 : "Type weapon unknown",
               5 : "Do not know if offender had weapon"}

### Call in dictionaries to replace values in the columns.
Using inplace to update the dataframe in place.

In [9]:
df["ager"].replace(rep_ager ,inplace = True)
df["newcrime"].replace(rep_newcrime ,inplace = True)
df["hispanic"].replace(rep_hispanic ,inplace = True)
df["hincome"].replace(rep_hincome ,inplace = True)
df["injury"].replace(rep_injury ,inplace = True)
df["locationr"].replace(rep_locationr ,inplace = True)
df["msa"].replace(rep_msa ,inplace = True)
df["marital2"].replace(rep_marital2 ,inplace = True)
df["treatment"].replace(rep_treatment ,inplace = True)
df["popsize"].replace(rep_popsize ,inplace = True)
df["weapon"].replace(rep_weapon ,inplace = True)
df["race1r"].replace(rep_race1r ,inplace = True)
df["ethnic1r"].replace(rep_ethnic1r ,inplace = True)
df["region"].replace(rep_region ,inplace = True)
df["notify"].replace(rep_notify ,inplace = True)
df["gender"].replace(rep_gender ,inplace = True)
df["newoff"].replace(rep_newoff ,inplace = True)
df["vicservices"].replace(rep_vicservices ,inplace = True)
df["direl"].replace(rep_direl ,inplace = True)
df["seriousviolent"].replace(rep_seriousviolent ,inplace = True)
df["weapcat"].replace(rep_weapcat ,inplace = True)

### Display a sample of edited code.

In [10]:
df.head()

Unnamed: 0,year,weight,gender,race1r,hispanic,ethnic1r,ager,marital2,hincome,popsize,...,notify,weapon,weapcat,newcrime,newoff,seriousviolent,injury,treatment,vicservices,locationr
12974,1996,3451.4374,Male,White,Non-Hispanic,Non-Hispanic white,25 to 34,Married,"$25,000 to $34,999",Not a place,...,"Yes, reported to the police","No, offender did not have weapon",No weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,No services received from victim service agencies,Other location
12975,1996,2377.30718,Female,White,Non-Hispanic,Non-Hispanic white,35 to 49,Married,"$25,000 to $34,999",Not a place,...,"No, did not report to the police","No, offender did not have weapon",No weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,No services received from victim service agencies,"At or near friend, neighbor, or relative's home"
12976,1996,2687.23144,Male,White,Non-Hispanic,Non-Hispanic white,25 to 34,Married,"$25,000 to $34,999","Under 100,000",...,"Yes, reported to the police",Do not know if offender had weapon,Do not know if offender had weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,No services received from victim service agencies,"Commercial place, parking lot, or other public..."
12977,1996,2555.83758,Female,White,Non-Hispanic,Non-Hispanic white,25 to 34,Married,"$50,000 to $74,999","100,000 to 249,999",...,"No, did not report to the police","No, offender did not have weapon",No weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,No services received from victim service agencies,"Commercial place, parking lot, or other public..."
12978,1996,3017.3611,Male,White,Non-Hispanic,Non-Hispanic white,25 to 34,Never Married,"$15,000 to $24,999","Under 100,000",...,"No, did not report to the police","No, offender did not have weapon",No weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,No services received from victim service agencies,"Commercial place, parking lot, or other public..."


### Resets the index to start at 0.

In [11]:
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,year,weight,gender,race1r,hispanic,ethnic1r,ager,marital2,hincome,popsize,...,notify,weapon,weapcat,newcrime,newoff,seriousviolent,injury,treatment,vicservices,locationr
0,1996,3451.43740,Male,White,Non-Hispanic,Non-Hispanic white,25 to 34,Married,"$25,000 to $34,999",Not a place,...,"Yes, reported to the police","No, offender did not have weapon",No weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,No services received from victim service agencies,Other location
1,1996,2377.30718,Female,White,Non-Hispanic,Non-Hispanic white,35 to 49,Married,"$25,000 to $34,999",Not a place,...,"No, did not report to the police","No, offender did not have weapon",No weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,No services received from victim service agencies,"At or near friend, neighbor, or relative's home"
2,1996,2687.23144,Male,White,Non-Hispanic,Non-Hispanic white,25 to 34,Married,"$25,000 to $34,999","Under 100,000",...,"Yes, reported to the police",Do not know if offender had weapon,Do not know if offender had weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,No services received from victim service agencies,"Commercial place, parking lot, or other public..."
3,1996,2555.83758,Female,White,Non-Hispanic,Non-Hispanic white,25 to 34,Married,"$50,000 to $74,999","100,000 to 249,999",...,"No, did not report to the police","No, offender did not have weapon",No weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,No services received from victim service agencies,"Commercial place, parking lot, or other public..."
4,1996,3017.36110,Male,White,Non-Hispanic,Non-Hispanic white,25 to 34,Never Married,"$15,000 to $24,999","Under 100,000",...,"No, did not report to the police","No, offender did not have weapon",No weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,No services received from victim service agencies,"Commercial place, parking lot, or other public..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47055,2019,1419.98739,Male,White,Non-Hispanic,Non-Hispanic white,50 to 64,Married,"$75,000 or more",Not a place,...,"No, did not report to the police","No, offender did not have weapon",No weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,No services received from victim service agencies,"Commercial place, parking lot, or other public..."
47056,2019,3103.83207,Male,White,Non-Hispanic,Non-Hispanic white,50 to 64,Married,"$35,000 to $49,999","Under 100,000",...,"No, did not report to the police","No, offender did not have weapon",No weapon,Violent victimization,Rape/sexual assault,Violent crime excluding simple assault,Not Injured,Not injured,No services received from victim service agencies,"Commercial place, parking lot, or other public..."
47057,2019,1581.41881,Female,Other,Non-Hispanic,Non-Hispanic other,25 to 34,Married,"Less than $7,500","100,000 to 249,999",...,"Yes, reported to the police","No, offender did not have weapon",No weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,Services received from victim service agencies,At or near victim's home
47058,2019,1556.29342,Male,White,Non-Hispanic,Non-Hispanic white,65 or older,Married,"$50,000 to $74,999",Not a place,...,"Yes, reported to the police",Do not know if offender had weapon,Do not know if offender had weapon,Violent victimization,Simple assault,Simple assault,Not Injured,Not injured,No services received from victim service agencies,"Commercial place, parking lot, or other public..."


### Calculates the total value count.
This value will effectively used as the denominator when calculating certain proportions.

In [19]:
total_count = len(df.index)
total_count

47060

### Creates a data frame that returns the count and proportion for the 8 age groups.

In [18]:
group_ager_df = pd.DataFrame(df["ager"].value_counts())
group_ager_df["1"] = pd.DataFrame(df["ager"].value_counts("ager"))
group_ager_df.reset_index(inplace=True)
group_ager_df.sort_values(by = ["index"], inplace = True)
group_ager_df.reset_index(drop = True, inplace=True)
group_ager_df.rename(columns = {"index" : "ager", "ager" : "count", "1" : "proportion"}, inplace = True)
group_ager_df

Unnamed: 0,ager,count,proportion
0,12 to 14,4424,0.094008
1,15 to 17,3975,0.084467
2,18 to 20,4081,0.086719
3,21 to 24,4568,0.097068
4,25 to 34,9770,0.207607
5,35 to 49,11781,0.25034
6,50 to 64,6614,0.140544
7,65 or older,1847,0.039248


### Creates a data frame that returns the count and proportion for the 2 major crime types.

In [14]:
group_newcrime_df = pd.DataFrame(df["newcrime"].value_counts())
group_newcrime_df["1"] = pd.DataFrame(df["newcrime"].value_counts("newcrime"))
group_newcrime_df.reset_index(inplace=True)
group_newcrime_df.sort_values(by = ["index"], inplace = True)
group_newcrime_df.reset_index(drop = True, inplace=True)
group_newcrime_df.rename(columns = {"index" : "newcrime", "newcrime" : "count", "1" : "proportion"}, inplace = True)
group_newcrime_df

Unnamed: 0,newcrime,count,proportion
0,Personal theft/larceny,1489,0.03164
1,Violent victimization,45571,0.96836


### Creates a data frame that returns the count and proportion for the 8 age groups and 2 major crime types.
- 'count' and 'proportion' are in respect to the total count
- 'count_age' is the count for that age group ignoring the major crime type
- 'proportion_age' is the proportion of the major crime type in that age group

In [15]:
group_ager_newcrime_df = pd.DataFrame(df[["ager", "newcrime"]].value_counts())
group_ager_newcrime_df["1"] = group_ager_newcrime_df[0] / total_count
group_ager_newcrime_df["2"] = df[["ager", "newcrime"]].value_counts(["ager"])
group_ager_newcrime_df.rename(columns = {0 : "count", "1" : "proportion", "2" : "count_age"}, inplace = True)
group_ager_newcrime_df["proportion_age"] = group_ager_newcrime_df["count"]/group_ager_newcrime_df["count_age"]
group_ager_newcrime_df.reset_index(inplace=True)
group_ager_newcrime_df.sort_values(by = ["ager", "newcrime"], inplace = True)
group_ager_newcrime_df.reset_index(drop = True, inplace=True)
group_ager_newcrime_df

AttributeError: 'DataFrame' object has no attribute 'value_counts'

### Creates a data frame that returns the count and proportion for the year,  8 age groups and 2 major crime types.
- 'count' and 'proportion' are in respect to the total count
- 'count_year' is the total count in that year
- 'proportion_year' is the proportion of victimizations in that year
- 'count_year_ager' is the count for that age group in that year ignoring the major crime type
- 'proportion_year_ager' is the proportion of victimization for that age group in the year ignoring the major crime type

In [None]:
group_year_ager_newcrime_df = pd.DataFrame(df[["year","ager", "newcrime"]].value_counts())
group_year_ager_newcrime_df["1"] = group_year_ager_newcrime_df[0] / total_count
group_year_ager_newcrime_df["2"] = df[["year", "ager", "newcrime"]].value_counts(["year"])
group_year_ager_newcrime_df.rename(columns = {0 : "count", "1" : "proportion", "2" : "count_year"}, inplace = True)
group_year_ager_newcrime_df["proportion_year"] = group_year_ager_newcrime_df["count"]/group_year_ager_newcrime_df["count_year"]
group_year_ager_newcrime_df["count_year_ager"] = df[["year", "ager", "newcrime"]].value_counts(["year", "ager"])
group_year_ager_newcrime_df["proportion_year_ager"] = group_year_ager_newcrime_df["count"]/group_year_ager_newcrime_df["count_year_ager"]
group_year_ager_newcrime_df.reset_index(inplace=True)
group_year_ager_newcrime_df.sort_values(by = ["year", "ager", "newcrime"], inplace = True)
group_year_ager_newcrime_df.reset_index(drop = True, inplace=True)
group_year_ager_newcrime_df

### Exporting the datasets and dataframes to be read into 'Analysis.ipynb'

In [None]:
group_ager_df.to_csv("Dataframe_Outputs/group_ager_df")
group_newcrime_df.to_csv("Dataframe_Outputs/group_newcrime_df")
group_ager_newcrime_df.to_csv("Dataframe_Outputs/group_ager_newcrime_df")
group_year_ager_newcrime_df.to_csv("Dataframe_Outputs/group_year_ager_newcrime_df")
df.to_csv("DataFrame_Outputs/NCVS_Clean")