In [1]:
import seaborn as sns
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,average_precision_score,accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [3]:
data=pd.read_csv("database.csv",dtype={'Perpetrator Age': 'int64'})

In [4]:
data.head()

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,...,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Victim Count,Perpetrator Count,Record Source
0,1,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,...,Unknown,Male,15,Native American/Alaska Native,Unknown,Acquaintance,Blunt Object,0,0,FBI
1,2,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,...,Unknown,Male,42,White,Unknown,Acquaintance,Strangulation,0,0,FBI
2,3,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,...,Unknown,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,0,FBI
3,4,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,...,Unknown,Male,42,White,Unknown,Acquaintance,Strangulation,0,0,FBI
4,5,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,...,Unknown,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,1,FBI


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 638453 entries, 0 to 638452
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   Record ID              638453 non-null  int64 
 1   Agency Code            638453 non-null  object
 2   Agency Name            638453 non-null  object
 3   Agency Type            638453 non-null  object
 4   City                   638453 non-null  object
 5   State                  638453 non-null  object
 6   Year                   638453 non-null  int64 
 7   Month                  638453 non-null  object
 8   Incident               638453 non-null  int64 
 9   Crime Type             638453 non-null  object
 10  Crime Solved           638453 non-null  object
 11  Victim Sex             638453 non-null  object
 12  Victim Age             638453 non-null  int64 
 13  Victim Race            638453 non-null  object
 14  Victim Ethnicity       638453 non-null  object
 15  

In [119]:
data.isnull().sum()

Record ID                0
Agency Code              0
Agency Name              0
Agency Type              0
City                     0
State                    0
Year                     0
Month                    0
Incident                 0
Crime Type               0
Crime Solved             0
Victim Sex               0
Victim Age               0
Victim Race              0
Victim Ethnicity         0
Perpetrator Sex          0
Perpetrator Age          0
Perpetrator Race         0
Perpetrator Ethnicity    0
Relationship             0
Weapon                   0
Victim Count             0
Perpetrator Count        0
Record Source            0
dtype: int64

# Data Preprocessing

## Encoding of Categorical variables

In [120]:
category_columns=data.select_dtypes(exclude='number')
numeric_columns=data.select_dtypes(include='number')
numeric_columns.columns

Index(['Record ID', 'Year', 'Incident', 'Victim Age', 'Perpetrator Age',
       'Victim Count', 'Perpetrator Count'],
      dtype='object')

In [121]:
le = preprocessing.LabelEncoder()
# use df.apply() to apply le.fit_transform to all columns
c = category_columns.apply(le.fit_transform)
c.head()

Unnamed: 0,Agency Code,Agency Name,Agency Type,City,State,Month,Crime Type,Crime Solved,Victim Sex,Victim Race,Victim Ethnicity,Perpetrator Sex,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Record Source
0,0,149,1,35,1,4,1,1,1,2,2,1,2,2,0,0,0
1,0,149,1,35,1,7,1,1,1,4,2,1,4,2,0,13,0
2,0,149,1,35,1,7,1,0,0,2,2,2,3,2,26,15,0
3,0,149,1,35,1,0,1,1,1,4,2,1,4,2,0,13,0
4,0,149,1,35,1,0,1,0,0,2,2,2,3,2,26,15,0


In [122]:
data=pd.concat([c,numeric_columns],ignore_index=False,axis=1)

In [123]:
data.head()

Unnamed: 0,Agency Code,Agency Name,Agency Type,City,State,Month,Crime Type,Crime Solved,Victim Sex,Victim Race,...,Relationship,Weapon,Record Source,Record ID,Year,Incident,Victim Age,Perpetrator Age,Victim Count,Perpetrator Count
0,0,149,1,35,1,4,1,1,1,2,...,0,0,0,1,1980,1,14,15,0,0
1,0,149,1,35,1,7,1,1,1,4,...,0,13,0,2,1980,1,43,42,0,0
2,0,149,1,35,1,7,1,0,0,2,...,26,15,0,3,1980,2,30,0,0,0
3,0,149,1,35,1,0,1,1,1,4,...,0,13,0,4,1980,1,43,42,0,0
4,0,149,1,35,1,0,1,0,0,2,...,26,15,0,5,1980,2,30,0,0,1


In [124]:
data.shape

(638453, 24)

In [125]:
data.drop(data[data['Perpetrator Sex'] == 0 ].index, inplace = True)

In [126]:
X1=data.drop(['Perpetrator Age'],axis=1)

In [127]:
Y1=data[['Perpetrator Age']]

In [128]:
X2=data.drop(['Perpetrator Sex'],axis=1)

In [129]:
Y2=data[['Perpetrator Sex']]

## Scaling of Numeric DATA

In [130]:
scaler=preprocessing.StandardScaler()
X1=scaler.fit_transform(X1)
X2=scaler.fit_transform(X2)

# Data spliting into 30% test and 70% train set

In [131]:
x_train_age,x_test_age,y_train_age,y_test_age=train_test_split(X1,Y1,test_size=0.30,random_state=12)

In [132]:
print(f'Shape of X_train {x_train_age.shape}, y_train {y_train_age.shape}, X_test {x_test_age.shape} and y_test {y_test_age.shape}')

Shape of X_train (412933, 23), y_train (412933, 1), X_test (176972, 23) and y_test (176972, 1)


In [133]:
x_train_sex,x_test_sex,y_train_sex,y_test_sex=train_test_split(X2,Y2,test_size=0.30,random_state=12)

In [134]:
print(f'Shape of X_train {x_train_sex.shape}, y_train {y_train_sex.shape}, X_test {x_test_sex.shape} and y_test {y_test_sex.shape}')

Shape of X_train (412933, 23), y_train (412933, 1), X_test (176972, 23) and y_test (176972, 1)


In [135]:
y_train_age=y_train_age.values.ravel()
y_test_age=y_test_age.values.ravel()
y_train_sex=y_train_sex.values.ravel()
y_test_sex=y_test_sex.values.ravel()

## Prediction of Preparator Age 

### Linear Regression Model

In [136]:
linreg_model=LinearRegression()

### Hyperparameter Tuning

In [137]:
print(linreg_model.get_params())

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': 'deprecated', 'positive': False}


In [138]:
Parameter={
          'positive':[False,True],
          'fit_intercept': [True,False],
        'copy_X': [True,False]  
          }

In [139]:
linreg_model = GridSearchCV(linreg_model, Parameter)
searchResults=linreg_model.fit(x_train_age,y_train_age)
# summarize grid search information
bestScore = searchResults.best_score_
bestParams = searchResults.best_params_
print("Best Score is {:.2f} using {}".format(bestScore,bestParams))

Best Score is 0.60 using {'copy_X': True, 'fit_intercept': True, 'positive': False}


In [140]:
y_pred_age=linreg_model.predict(x_test_age)
print(f'Linear Regression Mean absolute error: {mean_absolute_error(y_test_age, y_pred_age)}, Mean Squared error: {mean_squared_error(y_test_age, y_pred_age)},Adjusted R Square Error: {r2_score(y_test_age, y_pred_age)}')

Linear Regression Mean absolute error: 7.4125287058549665, Mean Squared error: 128.17421453858591,Adjusted R Square Error: 0.597934519365203


## Random Forest Regressor

In [167]:
rfr=RandomForestRegressor()

## Hyperparameter

In [168]:
print(rfr.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [169]:
Parameter={
          'max_depth':[None,3,5,7],
          'max_samples': [2,4,6], 
        'min_samples_leaf': [1,3,4],
        'n_estimators':[50,100,150,200,300]
          }

In [170]:
rfr = GridSearchCV(rfr, Parameter)
searchResults=rfr.fit(x_train_age,y_train_age)
# summarize grid search information
bestScore = searchResults.best_score_
bestParams = searchResults.best_params_
print("Best Score is {:.2f} using {}".format(bestScore,bestParams))

Best Score is 0.40 using {'max_depth': 5, 'max_samples': 6, 'min_samples_leaf': 1, 'n_estimators': 50}


In [171]:
y_pred_age=rfr.predict(x_test_age)
print(f'Random Forest Mean absolute error: {mean_absolute_error(y_test_age, y_pred_age)}, Mean Squared error: {mean_squared_error(y_test_age, y_pred_age)},Adjusted R Square Error: {r2_score(y_test_age, y_pred_age)}')

Linear Regression Mean absolute error: 11.360330786791131, Mean Squared error: 207.19060644395717,Adjusted R Square Error: 0.3500705967827361


## Classification of Preparator Gender

### Logistic Regression

In [172]:
logreg_model= LogisticRegression(max_iter=400)

In [173]:
logreg_model.fit(x_train_sex,y_train_sex)

In [174]:
y_pred_sex=logreg_model.predict(x_test_sex)
print(classification_report(y_test_sex,y_pred_sex))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00    119787
           2       1.00      1.00      1.00     57185

    accuracy                           1.00    176972
   macro avg       1.00      1.00      1.00    176972
weighted avg       1.00      1.00      1.00    176972



In [175]:
rfc=RandomForestClassifier()

In [176]:
rfc.fit(x_train_sex,y_train_sex)

In [177]:
y_pred_sex=rfc.predict(x_test_sex)
print(classification_report(y_test_sex,y_pred_sex))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00    119787
           2       1.00      1.00      1.00     57185

    accuracy                           1.00    176972
   macro avg       1.00      1.00      1.00    176972
weighted avg       1.00      1.00      1.00    176972



# Data spliting into 20% test and 80% train set

In [178]:
x_train_age,x_test_age,y_train_age,y_test_age=train_test_split(X1,Y1,test_size=0.20,random_state=12)
print(f'Shape of X_train {x_train_age.shape}, y_train {y_train_age.shape}, X_test {x_test_age.shape} and y_test {y_test_age.shape}')
x_train_sex,x_test_sex,y_train_sex,y_test_sex=train_test_split(X2,Y2,test_size=0.30,random_state=12)
print(f'Shape of X_train {x_train_sex.shape}, y_train {y_train_sex.shape}, X_test {x_test_sex.shape} and y_test {y_test_sex.shape}')

Shape of X_train (471924, 23), y_train (471924, 1), X_test (117981, 23) and y_test (117981, 1)
Shape of X_train (412933, 23), y_train (412933, 1), X_test (176972, 23) and y_test (176972, 1)


In [179]:
y_train_age=y_train_age.values.ravel()
y_test_age=y_test_age.values.ravel()
y_train_sex=y_train_sex.values.ravel()
y_test_sex=y_test_sex.values.ravel()

## Prediction of Preparator Age 

In [183]:
linreg_model.fit(x_train_age,y_train_age)
y_pred_age=linreg_model.predict(x_test_age)
print(f'Linear Regression Mean absolute error: {mean_absolute_error(y_test_age, y_pred_age)}, Mean Squared error: {mean_squared_error(y_test_age, y_pred_age)},Adjusted R Square Error: {r2_score(y_test_age, y_pred_age)}')

Linear Regression Mean absolute error: 7.419340609901713, Mean Squared error: 128.86705031644303,Adjusted R Square Error: 0.5981173291493506


## Random Forest

In [185]:
rfr=RandomForestRegressor()
rfr.fit(x_train_age,y_train_age)
y_pred_age=rfr.predict(x_test_age)
print(f'Random Forest Mean absolute error: {mean_absolute_error(y_test_age, y_pred_age)}, Mean Squared error: {mean_squared_error(y_test_age, y_pred_age)},Adjusted R Square Error: {r2_score(y_test_age, y_pred_age)}')

Random Forest Mean absolute error: 5.52952899195633, Mean Squared error: 87.68724688212511,Adjusted R Square Error: 0.726539989159415


## Classification of Preparator Sex

In [186]:
logreg_model.fit(x_train_sex,y_train_sex)
y_pred_sex=logreg_model.predict(x_test_sex)
print(classification_report(y_test_sex,y_pred_sex))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00    119787
           2       1.00      1.00      1.00     57185

    accuracy                           1.00    176972
   macro avg       1.00      1.00      1.00    176972
weighted avg       1.00      1.00      1.00    176972



### Random Forest

In [187]:
rfc=RandomForestClassifier()

In [188]:
rfc.fit(x_train_sex,y_train_sex)
y_pred_sex=rfc.predict(x_test_sex)
print(classification_report(y_test_sex,y_pred_sex))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00    119787
           2       1.00      1.00      1.00     57185

    accuracy                           1.00    176972
   macro avg       1.00      1.00      1.00    176972
weighted avg       1.00      1.00      1.00    176972



# Data spliting into 15% test and 85% train set

In [189]:
x_train_age,x_test_age,y_train_age,y_test_age=train_test_split(X1,Y1,test_size=0.15,random_state=12)
print(f'Shape of X_train {x_train_age.shape}, y_train {y_train_age.shape}, X_test {x_test_age.shape} and y_test {y_test_age.shape}')
x_train_sex,x_test_sex,y_train_sex,y_test_sex=train_test_split(X2,Y2,test_size=0.30,random_state=12)
print(f'Shape of X_train {x_train_sex.shape}, y_train {y_train_sex.shape}, X_test {x_test_sex.shape} and y_test {y_test_sex.shape}')

Shape of X_train (501419, 23), y_train (501419, 1), X_test (88486, 23) and y_test (88486, 1)
Shape of X_train (412933, 23), y_train (412933, 1), X_test (176972, 23) and y_test (176972, 1)


In [190]:
y_train_age=y_train_age.values.ravel()
y_test_age=y_test_age.values.ravel()
y_train_sex=y_train_sex.values.ravel()
y_test_sex=y_test_sex.values.ravel()

## Prediction of Preparator Age 

In [191]:
linreg_model.fit(x_train_age,y_train_age)
y_pred_age=linreg_model.predict(x_test_age)
print(f'Linear Regression Mean absolute error: {mean_absolute_error(y_test_age, y_pred_age)}, Mean Squared error: {mean_squared_error(y_test_age, y_pred_age)},Adjusted R Square Error: {r2_score(y_test_age, y_pred_age)}')

Linear Regression Mean absolute error: 7.409334870842719, Mean Squared error: 128.38455424653722,Adjusted R Square Error: 0.5988887861170482


### Random Forest

In [192]:
rfr=RandomForestRegressor()

In [193]:
rfr.fit(x_train_age,y_train_age)
y_pred_age=rfr.predict(x_test_age)
print(f'Random Forest Mean absolute error: {mean_absolute_error(y_test_age, y_pred_age)}, Mean Squared error: {mean_squared_error(y_test_age, y_pred_age)},Adjusted R Square Error: {r2_score(y_test_age, y_pred_age)}')

Random Forest Mean absolute error: 5.512055240377009, Mean Squared error: 87.21779588861514,Adjusted R Square Error: 0.7275058811677767


## Classification of Preparator Sex

In [194]:
logreg_model.fit(x_train_sex,y_train_sex)
y_pred_sex=logreg_model.predict(x_test_sex)
print(classification_report(y_test_sex,y_pred_sex))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00    119787
           2       1.00      1.00      1.00     57185

    accuracy                           1.00    176972
   macro avg       1.00      1.00      1.00    176972
weighted avg       1.00      1.00      1.00    176972



In [195]:
rfc=RandomForestClassifier()

In [196]:
rfc.fit(x_train_sex,y_train_sex)
y_pred_sex=rfc.predict(x_test_sex)
print(classification_report(y_test_sex,y_pred_sex))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00    119787
           2       1.00      1.00      1.00     57185

    accuracy                           1.00    176972
   macro avg       1.00      1.00      1.00    176972
weighted avg       1.00      1.00      1.00    176972

