In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/crime-cast-forecasting-crime-categories/sample.csv
/kaggle/input/crime-cast-forecasting-crime-categories/train.csv
/kaggle/input/crime-cast-forecasting-crime-categories/test.csv


# **About:**

This dataset offers a comprehensive snapshot of criminal activities within the city. It encompasses various aspects of each incident, including date, time, location, victim demographics, and more.

# **Goal:**

By leveraging machine learning techniques, participants can analyze this rich dataset to predict crime categories, enhance law enforcement strategies, and bolster public safety measures.
Machine learning model must get a accuracy of 80% on the test data.

# **Columns information:**

1. Location: Location of the incident.
2. Cross_Street: Cross street near the incident.
3. Latitude: Latitude coordinate.
4. Longitude: Longitude coordinate.
5. Date_Reported: Date when the crime was reported.
6. Date_Occurred: Date when the crime occurred.
7. Time_Occurred: Time when the crime occurred.
8. Area_ID: ID of the area where the crime occurred.
9. Area_Name: Name of the area where the crime occurred.
10. Reporting_District_no: Reporting district number.
11. Part 1-2: Part classification of the crime (1 or 2).
12. Modus_Operandi: Method of operation used in the crime.
13. Victim_Age: Age of the victim.
14. Victim_Sex: Sex of the victim.
15. Victim_Descent: Descent of the victim.
16. Premise_Code: Code representing the type of premise where the crime occurred.
17.Premise_Description: Description of the premise.
18.Weapon_Used_Code: Code of the weapon used.
19.Weapon_Description: Description of the weapon used.
20.Status: Status of the investigation.
21. Status_Description: Description of the status.
22. Crime_Category: Category of the crime (target variable).

**Importing the Data & extracting basic information**

In [5]:
import pandas as pd
import numpy as np

In [6]:
train_df=pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/train.csv")
test_df=pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/test.csv")

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
train_df.shape,test_df.shape

In [7]:
def Basic_info(data):
    output=pd.DataFrame(columns=["count","missing","no_unique","dtype","mode","v_mode"])

    for col in data :
      output.loc[col]=[data[col].count(),
                      data[col].isna().sum(),
                      data[col].nunique(),
                      data[col].dtypes,
                      data[col].mode().values[0],
                      data[col].value_counts().max(),
                       ]

    return  output

In [None]:
Basic_info(train_df)

In [None]:
Basic_info(test_df)

In [8]:
date_format = '%m/%d/%Y %I:%M:%S %p'
train_df['Date_Reported'] = pd.to_datetime(train_df['Date_Reported'], format=date_format)
train_df['Date_Occurred'] = pd.to_datetime(train_df['Date_Occurred'], format=date_format)

In [10]:
train_df['Year_Reported'] = train_df['Date_Reported'].dt.year
train_df['Month_Reported'] = train_df['Date_Reported'].dt.month
train_df['Day_Reported'] = train_df['Date_Reported'].dt.day
train_df['Hour_Occurred'] = train_df['Time_Occurred']
train_df['Year_Occurred'] = train_df['Date_Occurred'].dt.year
train_df['Month_Occurred'] = train_df['Date_Occurred'].dt.month
train_df['Day_Occurred'] = train_df['Date_Occurred'].dt.day

In [7]:
Basic_info(train_df)

Unnamed: 0,count,missing,no_unique,dtype,mode,v_mode
Location,20000,0,12399,object,6TH,33
Cross_Street,3448,16552,1495,object,BROADWAY,56
Latitude,20000,0,3622,float64,34.2012,90
Longitude,20000,0,3578,float64,-118.2739,168
Date_Reported,20000,0,811,datetime64[ns],2020-07-13 00:00:00,87
Date_Occurred,20000,0,366,datetime64[ns],2020-01-01 00:00:00,137
Time_Occurred,20000,0,996,float64,1200.0,850
Area_ID,20000,0,21,float64,12.0,1345
Area_Name,20000,0,21,object,77th Street,1345
Reporting_District_no,20000,0,1120,float64,645.0,99


**Explortory Data Analysis [EDA]**

In [None]:
train_df.Crime_Category.value_counts()

In [25]:
train_df.Victim_Age.value_counts()

Victim_Age
 0.0     4828
 30.0     448
 31.0     446
 26.0     442
 29.0     425
         ... 
 98.0       2
-2.0        2
 92.0       2
 96.0       2
 94.0       2
Name: count, Length: 100, dtype: int64

In [26]:
train_df.Victim_Age.unique()

array([75., 41., 67., 61.,  0., 50., 68., 22., 31., 46., 72., 26., 38.,
       37., 42., 40., 53., 60., 29., 13., 33., 27., 15., 23., 74., 63.,
       78., 51., 44., 34., 69., 36., 52., 25., 49., 48., 32., 18., 35.,
       24., 39., 16., 28., 47., 30., 64., 76.,  5., 58., 45., 57., 19.,
       55., 54., 21., 65., 17., 20., 77., 82., 56., -2., 84., 59., 43.,
        7., 70., 66., 62., 14., 80., 71., 81., 96., 12., 11.,  4., 83.,
       10.,  8.,  6., 88., 86., 73.,  9., 87., 85., 93., 95., 99., 79.,
        3.,  2., 91., 92., 90., 89., 98., -1., 94.])

In [24]:
def cross_table(data,col1,col2) :

      return pd.crosstab(data[col1],data[col2],margins=True)

In [None]:
cross_table(train_df,"Victim_Sex","Crime_Category")

Highest no of Property Crimes are done against Men.

Highest no of Violent Crimes are done against Women.

In [None]:
cross_table(train_df,"Victim_Descent","Crime_Category")

Highest no of Property Crimes & Violent Crimes are done against Victim_Descent "H"

In [None]:
cross_table(train_df,"Weapon_Used_Code","Crime_Category")

Hoghest no of Violent Crimes & Property Crimes are done using Weopen_used_code 400 which is STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)

In [None]:
cross_table(train_df,"Area_ID","Crime_Category")

Highest no of Violent Crimes are done in Area_ID 12 and Property Crimes in 14

In [None]:
cross_table(train_df,"Status","Crime_Category")

Most of the Property Crimes are under Status "IC" which is Invest Cont

In [None]:
cross_table(train_df,"Year_Reported","Crime_Category")

Most of the crimes Year_Reported is 2020

In [None]:
cross_table(train_df,"Month_Reported","Crime_Category")

Highest no of Property Crimes are reported in the 1st month & Violent Crime in 6th month.

In [None]:
cross_table(train_df,"Day_Reported","Crime_Category")

Highest no of Property Crimes are reported in the 21th day & Violent Crime in 5th day.

In [None]:
pd.set_option('display.max_rows', 300)
cross_table(train_df,"Premise_Code","Crime_Category")

Property Crime are done in Premise_Code 101.0 (Street).

Violent crime are done in 501.0 (SINGLE FAMILY DWELLING).

In [None]:
cross_table(train_df,"Victim_Age","Crime_Category")

People of age 25 are the highest victim of Violent Crime.

In [None]:
cross_table(train_df,"Weapon_Used_Code","Victim_Sex")

Highest no of crimes against female & male are done using Weapon_uesd_Code 400 ['STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE']

In [None]:
cross_table(train_df,"Weapon_Used_Code","Victim_Descent")

Highest no of crimes against Victim_Descent "H" are done using Weapon_uesd_Code 400 ['STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE']

In [None]:
pd.pivot_table(train_df, index=["Area_ID", "Month_Reported"], columns="Crime_Category",
                              aggfunc="size", fill_value=0)

Highest no of Property Crimes happen in Area_ID 14 on Month_Reported 8 .

In [None]:
pd.pivot_table(train_df, index=["Status","Victim_Sex"], columns="Crime_Category", aggfunc="size", fill_value=0)

Highest no of Property Crimes & Violent Crimes are against Victim_Sex "M"[male] are in Status "IC"["Invest Cont"].

Most of Property Crimes & Violent Crimes are against Victim_Sex "F"[Female] are also in Status "IC"["Invest Cont"].

In [None]:
 pd.pivot_table(train_df, index=["Victim_Sex","Victim_Descent"], columns="Crime_Category", aggfunc="size", fill_value=0,)

Violent Crimes on Female are highest against Victim_Descent "H" .

Property Crimes on male are highest against Victim_Descent "W" 

Most of the on female are against Victim_Descent "H" & "B" .


**Data Visualization**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
plt.figure(figsize=(16,9))
sns.scatterplot(x="Latitude",y="Longitude", data=train_df, hue="Crime_Category")
plt.xlim(33.6,34.5)
plt.ylim(-118.12,-118.7)
plt.legend(loc=2)

In [None]:
plt.figure(figsize=(16,9))
sns.histplot(train_df.Longitude,color="b")
plt.xlim(-118,-119)
plt.show()

In [None]:
plt.figure(figsize=(16,9))
sns.histplot(train_df.Latitude,color="g")
plt.xlim(33.6,34.7)

In [None]:
plt.figure(figsize=(16,9))
sns.histplot(train_df.Weapon_Used_Code,color="g")

In [None]:
plt.figure(figsize=(20,8))
sns.countplot(x="Area_ID",hue="Crime_Category", data=train_df)

plt.show()

In [None]:
plt.figure(figsize=(20,10))
sns.countplot(x="Month_Reported",hue="Crime_Category", data=train_df)

plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(x="Status",hue="Crime_Category", data=train_df)

plt.show()

In [None]:
plt.figure(figsize=(20,10))
sns.countplot(x="Area_ID",hue="Status", data=train_df)

plt.show()

In [None]:
plt.figure(figsize=(20,15))
sns.countplot(x="Area_ID",hue="Victim_Sex", data=train_df)

plt.show()

In [None]:
plt.figure(figsize=(20,12))
sns.countplot(x="Month_Reported",hue="Status", data=train_df)

plt.show()

In [None]:
def hist_plot(col, bins=50, figsize=(8, 6)) :
        plt.figure(figsize=figsize)
        train_df[col].hist(bins=bins)
        plt.title(f'Histogram of {col}')
        plt.xlabel('Value')
        plt.ylabel('Frequency')
        plt.show()

In [None]:
hist_plot('Time_Occurred')

In [None]:
hist_plot('Victim_Age')

In [None]:
hist_plot('Premise_Code')

**Data Cleaning**

In [11]:
new_df=train_df.Modus_Operandi

In [12]:
new_df

0                                           0385
1        0906 0352 0371 0446 1822 0344 0416 0417
2                                      0329 1202
3                                      0329 1300
4                  0416 0945 1822 0400 0417 0344
                          ...                   
19995                        0416 0446 2004 0913
19996                   1822 0209 0344 1414 1420
19997                             0344 1822 1420
19998              1202 2038 0913 0602 1242 0553
19999                                        NaN
Name: Modus_Operandi, Length: 20000, dtype: object

In [13]:
new_df.str.len().max(),new_df.str.len().median(),new_df.str.len().min(),new_df.str.len().mean()

(49.0, 14.0, 4.0, 16.95729764181007)

In [15]:
n_df=pd.DataFrame()

In [17]:
n_df['M_d_1']=new_df.str.slice(0, 4)
n_df['M_d_2']=new_df.str.slice(4, 9)
n_df['M_d_3']=new_df.str.slice(9, 14)
n_df['M_d_4']=new_df.str.slice(14, 19)
n_df['M_d_5']=new_df.str.slice(19,24)
n_df['M_d_6']=new_df.str.slice(24, 29)
n_df['M_d_7']=new_df.str.slice(29,34)
n_df['M_d_8']=new_df.str.slice(34,39)
n_df['M_d_9']=new_df.str.slice(39,44)
n_df['M_d_10']=new_df.str.slice(44, 49)

In [18]:
n_df

Unnamed: 0,M_d_1,M_d_2,M_d_3,M_d_4,M_d_5,M_d_6,M_d_7,M_d_8,M_d_9,M_d_10
0,0385,,,,,,,,,
1,0906,0352,0371,0446,1822,0344,0416,0417,,
2,0329,1202,,,,,,,,
3,0329,1300,,,,,,,,
4,0416,0945,1822,0400,0417,0344,,,,
...,...,...,...,...,...,...,...,...,...,...
19995,0416,0446,2004,0913,,,,,,
19996,1822,0209,0344,1414,1420,,,,,
19997,0344,1822,1420,,,,,,,
19998,1202,2038,0913,0602,1242,0553,,,,


In [19]:
n_df.fillna('unknown',inplace=True)
n_df.replace('','finish', inplace=True)

In [20]:
n_df

Unnamed: 0,M_d_1,M_d_2,M_d_3,M_d_4,M_d_5,M_d_6,M_d_7,M_d_8,M_d_9,M_d_10
0,0385,finish,finish,finish,finish,finish,finish,finish,finish,finish
1,0906,0352,0371,0446,1822,0344,0416,0417,finish,finish
2,0329,1202,finish,finish,finish,finish,finish,finish,finish,finish
3,0329,1300,finish,finish,finish,finish,finish,finish,finish,finish
4,0416,0945,1822,0400,0417,0344,finish,finish,finish,finish
...,...,...,...,...,...,...,...,...,...,...
19995,0416,0446,2004,0913,finish,finish,finish,finish,finish,finish
19996,1822,0209,0344,1414,1420,finish,finish,finish,finish,finish
19997,0344,1822,1420,finish,finish,finish,finish,finish,finish,finish
19998,1202,2038,0913,0602,1242,0553,finish,finish,finish,finish


In [21]:
n_train_df=pd.concat([train_df,n_df],axis=1)

In [22]:
n_train_df.shape

(20000, 39)

In [30]:
n_train_df["Victim_Age"].replace(-2,0,)
n_train_df["Victim_Age"].replace(-1,0,)

0        75.0
1        41.0
2        67.0
3        61.0
4         0.0
         ... 
19995    51.0
19996     0.0
19997    42.0
19998    76.0
19999     0.0
Name: Victim_Age, Length: 20000, dtype: float64

In [31]:
Basic_info(n_train_df)

Unnamed: 0,count,missing,no_unique,dtype,mode,v_mode
Location,20000,0,12399,object,6TH,33
Cross_Street,3448,16552,1495,object,BROADWAY,56
Latitude,20000,0,3622,float64,34.2012,90
Longitude,20000,0,3578,float64,-118.2739,168
Date_Reported,20000,0,811,datetime64[ns],2020-07-13 00:00:00,87
Date_Occurred,20000,0,366,datetime64[ns],2020-01-01 00:00:00,137
Time_Occurred,20000,0,996,float64,1200.0,850
Area_ID,20000,0,21,float64,12.0,1345
Area_Name,20000,0,21,object,77th Street,1345
Reporting_District_no,20000,0,1120,float64,645.0,99


In [33]:
drop=["Location","Cross_Street","Date_Reported","Date_Occurred","Area_Name","Premise_Description","Weapon_Description","Status_Description","Hour_Occurred","Modus_Operandi",'M_d_5','M_d_6','M_d_7','M_d_8','M_d_9','M_d_10']

In [34]:
new_train_df=n_train_df.drop(drop,axis=1)

In [35]:
new_train_df["Victim_Descent"].fillna('unknown', inplace=True)
new_train_df["Victim_Sex"].fillna('unknown', inplace=True)
new_train_df["Weapon_Used_Code"].fillna(new_train_df["Weapon_Used_Code"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_train_df["Victim_Descent"].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_train_df["Victim_Sex"].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on

In [36]:
Basic_info(new_train_df)

Unnamed: 0,count,missing,no_unique,dtype,mode,v_mode
Latitude,20000,0,3622,float64,34.2012,90
Longitude,20000,0,3578,float64,-118.2739,168
Time_Occurred,20000,0,996,float64,1200.0,850
Area_ID,20000,0,21,float64,12.0,1345
Reporting_District_no,20000,0,1120,float64,645.0,99
Part 1-2,20000,0,2,float64,1.0,11637
Victim_Age,20000,0,98,float64,0.0,4834
Victim_Sex,20000,0,5,object,M,8538
Victim_Descent,20000,0,18,object,H,6143
Premise_Code,20000,0,217,float64,101.0,5033


In [37]:
new_train_df.shape

(20000, 23)

**Data Pre-Processing**

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler , OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [39]:
X=new_train_df.drop('Crime_Category', axis=1)

In [40]:
y=pd.DataFrame(new_train_df['Crime_Category'])

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [42]:
X_train.shape, y_train.shape,X_test.shape, y_test.shape

((16000, 22), (16000, 1), (4000, 22), (4000, 1))

In [43]:
numeric_features = ["Victim_Age","Latitude","Longitude","Time_Occurred","Area_ID",
                    "Reporting_District_no","Part 1-2","Premise_Code",
                    "Weapon_Used_Code","Month_Reported","Day_Reported","Year_Reported",
                    "Year_Occurred","Month_Occurred","Day_Occurred"
                   ]
categorical_features = [
                         'Victim_Sex','Victim_Descent','Status',
                         'M_d_1','M_d_2','M_d_3','M_d_4'
                        ]

In [44]:
numeric_transformer = StandardScaler()
categorical_transformer =OrdinalEncoder()

In [45]:
preprocessor = ColumnTransformer(
                                  transformers=[
                                                 ('num', numeric_transformer, numeric_features),
                                                 ('cat', categorical_transformer, categorical_features),
                                               ]
                                )

In [47]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)  ])

In [48]:
pipeline

In [49]:
X_train_transformed = pipeline.fit_transform(X_train)
y_train_transformed=categorical_transformer.fit_transform(y_train)

In [50]:
y_train_transformed.shape,X_train_transformed.shape

((16000, 1), (16000, 22))

**Model Fitting**
* 1. MLP Classifier

 2. Bagging Classifier
 
 3. Random Forest Classifier*
 

**MLP Classifier**

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
mlp = MLPClassifier(warm_start=True,random_state=0,n_iter_no_change=5,)

In [None]:
param_grid_mlp = {
    'hidden_layer_sizes':[(22,6)],
    'activation': ['relu', 'tanh', 'logistic','identity'],
    'solver': ['sgd', 'adam','lbfgs'],
    'alpha': [0.00001,0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling','adaptive']
}

In [None]:
 grid_search_mlp = GridSearchCV(mlp, param_grid_mlp, cv=10, scoring='accuracy',n_jobs=-1)

In [None]:
 grid_search_mlp.fit(X_train_transformed,y_train_transformed)

In [None]:
 print("Best parameters:", grid_search_mlp.best_params_)
 print("Best score:", grid_search_mlp.best_score_)

In [None]:
best_mlp = grid_search_mlp.best_estimator_
best_mlp.fit(X_train_transformed,y_train_transformed)

In [None]:
y_mlp_pred = best_mlp.predict(X_train_transformed)

In [None]:
report_mlp = classification_report(y_train_transformed, y_mlp_pred)
conf_matrix_mlp = confusion_matrix(y_train_transformed, y_mlp_pred)

In [None]:
print(report_mlp)

In [None]:
print(conf_matrix_mlp)

**Bagging Classifier**

In [35]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [36]:
bagging = BaggingClassifier(random_state=0,warm_start=True,)

In [38]:
param_grid_bag = {
    'n_estimators': [10,50, 100, 200, 500],
    'bootstrap': [True, False],
}

In [41]:
 grid_search_bag = GridSearchCV(estimator=bagging, param_grid=param_grid_bag, cv=10, scoring='accuracy',n_jobs=-1)

In [42]:
 grid_search_bag.fit(X_train_transformed,y_train_transformed)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [43]:
print("Best parameters:", grid_search_bag.best_params_)
print("Best score:", grid_search_bag.best_score_)

Best parameters: {'bootstrap': True, 'n_estimators': 500}
Best score: 0.9301250000000001


In [44]:
best_bag= grid_search_bag.best_estimator_
best_bag.fit(X_train_transformed,y_train_transformed)

  y = column_or_1d(y, warn=True)
  warn(


In [45]:
y_bag_pred = best_bag.predict(X_train_transformed)

In [46]:
report_bag = classification_report(y_train_transformed, y_bag_pred)
conf_matrix_bag = confusion_matrix(y_train_transformed, y_bag_pred)

In [47]:
 print(report_bag)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       184
         1.0       1.00      1.00      1.00      1464
         2.0       1.00      1.00      1.00      1081
         3.0       1.00      1.00      1.00       141
         4.0       1.00      1.00      1.00      9280
         5.0       1.00      1.00      1.00      3850

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000



In [48]:
 print(conf_matrix_bag)

[[ 184    0    0    0    0    0]
 [   0 1464    0    0    0    0]
 [   0    0 1081    0    0    0]
 [   0    0    0  141    0    0]
 [   0    0    0    0 9280    0]
 [   0    0    0    0    0 3850]]


**Random Forest Classifier**

In [53]:
from sklearn.ensemble import RandomForestClassifier

In [54]:
rf = RandomForestClassifier(random_state=0,warm_start=True,oob_score=True,bootstrap=True)

In [56]:
param_grid_rf = {
    'n_estimators': [ 10,50,100, 200,500],
    'criterion' : ['gini','entropy','log_loss'],
    'ccp_alpha' : [0.01,0.001],
    'class_weight' : ['balanced',None,'balanced_subsample'],
    'max_features' : ["sqrt", "log2", None],
}

In [57]:
 grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=10, n_jobs=-1)

In [None]:
 grid_search_rf.fit(X_train_transformed,y_train_transformed)

  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train,

In [None]:
# print("Best parameters:", grid_search_rf.best_params_)
# print("Best score:", grid_search_rf.best_score_)

In [None]:
#best_rf = grid_search_rf.best_estimator_
#best_rf.fit(X_train_transformed,y_train_transformed)

In [None]:
#y_rf_pred = best_rf.predict(X_train_transformed)

In [None]:
#report_rf = classification_report(y_train_transformed, y_rf_pred)
#conf_matrix_rf = confusion_matrix(y_train_transformed, y_rf_pred)

In [None]:
#print(report_rf)

In [None]:
#print(conf_matrix_rf)

**Preparing Test Data**

In [49]:
new_t_df= test_df.Modus_Operandi

In [50]:
new_t_df.str.len().max(),new_t_df.str.len().median(),new_t_df.str.len().min(),new_t_df.str.len().mean()

(49.0, 14.0, 4.0, 17.05722891566265)

In [51]:
n_t_df=pd.DataFrame()

In [52]:
n_t_df['M_d_1']=new_t_df.str.slice(0, 4)
n_t_df['M_d_2']=new_t_df.str.slice(4, 9)
n_t_df['M_d_3']=new_t_df.str.slice(9, 14)
n_t_df['M_d_4']=new_t_df.str.slice(14, 19)
n_t_df['M_d_5']=new_t_df.str.slice(19,24)
n_t_df['M_d_6']=new_t_df.str.slice(24, 29)
n_t_df['M_d_7']=new_t_df.str.slice(29,34)
n_t_df['M_d_8']=new_t_df.str.slice(34,39)
n_t_df['M_d_9']=new_t_df.str.slice(39,44)
n_t_df['M_d_10']=new_t_df.str.slice(44, 49)

In [53]:
n_t_df

Unnamed: 0,M_d_1,M_d_2,M_d_3,M_d_4,M_d_5,M_d_6,M_d_7,M_d_8,M_d_9,M_d_10
0,0416,1241,1243,1813,1821,2000,,,,
1,0344,0394,,,,,,,,
2,1822,0701,1914,0355,1202,0100,,,,
3,,,,,,,,,,
4,1501,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
4995,,,,,,,,,,
4996,1300,0325,,,,,,,,
4997,0913,1817,0416,,,,,,,
4998,0416,,,,,,,,,


In [54]:
n_t_df.fillna('unknown',inplace=True)
n_t_df.replace('','finish', inplace=True)

In [55]:
n_t_df

Unnamed: 0,M_d_1,M_d_2,M_d_3,M_d_4,M_d_5,M_d_6,M_d_7,M_d_8,M_d_9,M_d_10
0,0416,1241,1243,1813,1821,2000,finish,finish,finish,finish
1,0344,0394,finish,finish,finish,finish,finish,finish,finish,finish
2,1822,0701,1914,0355,1202,0100,finish,finish,finish,finish
3,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown
4,1501,finish,finish,finish,finish,finish,finish,finish,finish,finish
...,...,...,...,...,...,...,...,...,...,...
4995,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown
4996,1300,0325,finish,finish,finish,finish,finish,finish,finish,finish
4997,0913,1817,0416,finish,finish,finish,finish,finish,finish,finish
4998,0416,finish,finish,finish,finish,finish,finish,finish,finish,finish


In [56]:
n_test_df=pd.concat([test_df,n_t_df],axis=1)

In [57]:
n_test_df.shape

(5000, 31)

In [58]:
date_format = '%m/%d/%Y %I:%M:%S %p'
n_test_df['Date_Reported'] = pd.to_datetime(n_test_df['Date_Reported'], format=date_format)
n_test_df['Date_Occurred'] = pd.to_datetime(n_test_df['Date_Occurred'], format=date_format)

In [59]:
n_test_df['Year_Reported'] = n_test_df['Date_Reported'].dt.year
n_test_df['Month_Reported'] = n_test_df['Date_Reported'].dt.month
n_test_df['Day_Reported'] = n_test_df['Date_Reported'].dt.day
n_test_df['Hour_Occurred'] = n_test_df['Time_Occurred']
n_test_df['Year_Occurred'] = n_test_df['Date_Reported'].dt.year
n_test_df['Month_Occurred'] = n_test_df['Date_Reported'].dt.month
n_test_df['Day_Occurred'] = n_test_df['Date_Reported'].dt.day

In [60]:
new_test_df=n_test_df.drop(drop,axis=1)

In [61]:
new_test_df["Victim_Descent"].fillna('unknown', inplace=True)
new_test_df["Victim_Sex"].fillna('unknown', inplace=True)
new_test_df["Weapon_Used_Code"].fillna(new_test_df["Weapon_Used_Code"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_test_df["Victim_Descent"].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_test_df["Victim_Sex"].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [62]:
Basic_info(new_test_df)

Unnamed: 0,count,missing,no_unique,dtype,mode,v_mode
Latitude,5000,0,2141,float64,34.1016,31
Longitude,5000,0,2213,float64,-118.2739,44
Time_Occurred,5000,0,517,float64,1200.0,205
Area_ID,5000,0,21,float64,12.0,301
Reporting_District_no,5000,0,1017,float64,162.0,29
Part 1-2,5000,0,2,float64,1.0,2930
Victim_Age,5000,0,94,float64,0.0,1188
Victim_Sex,5000,0,5,object,M,2121
Victim_Descent,5000,0,17,object,H,1536
Premise_Code,5000,0,146,float64,101.0,1257


In [None]:
new_test_df.shape

**Fitting the model on Test Data**

In [63]:
test_df_transformed = pipeline.fit_transform(new_test_df)

**MLP Classifier**

In [None]:
y_test_mlp=best_mlp.predict(test_df_transformed)

In [None]:
y_test_mlp_df=pd.DataFrame(y_test_mlp)

In [None]:
y_test_mlp_de=categorical_transformer.inverse_transform(y_test_mlp_df)

In [None]:
    output = pd.DataFrame(columns=["ID","Crime_Category"])
    output["ID"] = test_df.index+1
    output['Crime_Category'] =  pd.DataFrame(y_test_mlp_de)

In [None]:
output.to_csv('submission_mlp_last.csv', index = False)

**Bagging Classifier**

In [64]:
y_test_bag=best_bag.predict(test_df_transformed)

In [65]:
y_test_bag_df=pd.DataFrame(y_test_bag)

In [66]:
y_test_bag_de=categorical_transformer.inverse_transform(y_test_bag_df)

In [67]:
    output = pd.DataFrame(columns=["ID","Crime_Category"])
    output["ID"] = test_df.index+1
    output['Crime_Category'] =  pd.DataFrame(y_test_bag_de)

In [68]:
output.to_csv('submission_bag_last.csv', index = False)