In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
# !pip install kmodes
from kmodes.kmodes import KModes
import matplotlib.pyplot as plt
%matplotlib inline
import os
import wrangle
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
#sklearn stuff 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from scipy import stats
import utilities
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Build the base DataFrame through the project-local wrangle helper.
df = wrangle.create_terrorism_df()

In [3]:
# Names of the columns that are removed from the wrangled frame before the
# label frame is built. Order is kept exactly as originally written.
cols_to_drop = [
    'eventid', 'year', 'month', 'day',
    'country', 'region', 'provstate', 'city',
    'latitude', 'longitude',
    'success', 'suicide', 'attack_type',
    'targ_desc', 'targeted_group', 'tg_desc',
    'nationality', 'atk_group', 'claimed',
    'weap_type', 'weap_sub',
    'killed', 'us_killed', 'ter_killed',
    'wounded', 'us_wounded', 'ter_wounded',
    'property',
]

In [4]:
# Keep only the columns of df that are not named in cols_to_drop.
df3 = df.drop(columns=cols_to_drop)

In [18]:
# Collapse every target category outside the four most frequent into 'other'.
#
# This cell overwrites df3.target, so it must be safe to run more than once.
# The 'other' bucket is therefore left out of the ranking: if it were ranked,
# a re-run could let 'other' occupy one of the four slots and push one more
# real category into the bucket each time the cell is executed.
target_ranking = df3.target.value_counts().drop('other', errors='ignore')
bottom_targ_groups = target_ranking.index.to_list()[4:]
if bottom_targ_groups:
    # Nothing to replace once only the four kept categories remain.
    df3.target = df3.target.replace(bottom_targ_groups, 'other')

In [6]:
# Read the modeling table from a CSV file; the path is relative, so it is
# resolved against the current working directory.
data = pd.read_csv('modeling_df.csv')

In [7]:
# Overwrite month/year in `data` with the string-typed values taken from `df`.
#
# The values are assigned by position (.to_numpy()) rather than by index label:
# `data` carries the default RangeIndex produced by read_csv, so a label-aligned
# assignment would shift rows or insert NaN whenever `df` has any other index.
# Item assignment is used so the column is written even if it did not exist yet.
# NOTE(review): assumes `data` and `df` hold the same rows in the same order
# (the column-wise concat with the re-indexed df3 relies on the same thing);
# a length mismatch now raises ValueError instead of silently producing NaN.
data['month'] = df['month'].astype('str').to_numpy()
data['year'] = df['year'].astype('str').to_numpy()

In [8]:
# Remove the 'Unnamed: 0' column (presumably an index that was written into the
# CSV — TODO confirm). errors='ignore' keeps this cell re-runnable: without it
# a second execution raises KeyError because the column is already gone.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [9]:
# Feature columns handed to get_dummies, in the order the encoded columns
# should appear.
categorical_cols = [
    'Cluster',
    'provstate',
    'year',
    'suicide',
    'country',
    'city',
    'property',
    'nationality',
    'month',
    'attack_type',
    'atk_group',
    'weap_type',
    'weap_sub',
]

# One-hot encode the selected features. drop_first takes a single bool; the
# previous list value only worked because a non-empty list is truthy, so True
# states the same behavior explicitly (first level of each encoded column is
# dropped). Missing values get no indicator column (dummy_na=False).
dummy_df = pd.get_dummies(data[categorical_cols], dummy_na=False, drop_first=True)

In [10]:
# Give df3 a fresh 0..n-1 index and discard the old labels in one step, instead
# of materializing them as an 'index' column and dropping that column again.
df3 = df3.reset_index(drop=True)

In [11]:
# Put the encoded features next to df3's columns. Concatenating along the
# column axis matches rows by index label.
df_trial2 = pd.concat([df3, dummy_df], axis='columns')

In [12]:
# Partition the combined frame with the project's split helper, then display
# the shape of each partition.
encoded_train, encoded_validate, encoded_test = wrangle.split_data(df_trial2)
tuple(part.shape for part in (encoded_train, encoded_validate, encoded_test))

((24812, 370), (10635, 370), (8862, 370))

In [13]:
# Separate the label column ('target') from the features in each partition.
y_train = encoded_train['target']
X_train = encoded_train.drop(columns='target')

y_validate = encoded_validate['target']
X_validate = encoded_validate.drop(columns='target')

y_test = encoded_test['target']
X_test = encoded_test.drop(columns='target')

In [14]:
# Sweep max_depth from 1 to 24 for a random forest whose other settings are
# fixed, recording the accuracy on every partition for each depth.
# NOTE(review): the test partition is scored inside the tuning loop; consider
# reserving it for the final model only so it stays an unbiased estimate.
metrics = []

for i in range(1, 25):
    tree = RandomForestClassifier(
        n_estimators=150,
        criterion='gini',
        max_depth=i,
        min_samples_leaf=3,
        bootstrap=True,
        class_weight=None,
        random_state=17,
    )
    tree = tree.fit(X_train, y_train)

    # Mean accuracy on each partition for this depth.
    in_sample_accuracy = tree.score(X_train, y_train)
    out_of_sample_accuracy = tree.score(X_validate, y_validate)
    test_accuracy = tree.score(X_test, y_test)

    output = dict(
        max_depth=i,
        train_accuracy=in_sample_accuracy,
        validate_accuracy=out_of_sample_accuracy,
        test_accuracy=test_accuracy,
    )
    metrics.append(output)

# One row per depth.
rf_df = pd.DataFrame(metrics)
rf_df

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,test_accuracy
0,1,0.279018,0.268453,0.281878
1,2,0.33109,0.318289,0.334349
2,3,0.356481,0.345933,0.358948
3,4,0.372763,0.361918,0.373731
4,5,0.391464,0.378843,0.387723
5,6,0.407263,0.392384,0.401602
6,7,0.419394,0.399906,0.414241
7,8,0.433661,0.407804,0.420221
8,9,0.447606,0.417019,0.428346
9,10,0.460463,0.428303,0.441323


In [17]:
# Fit a single random forest at max_depth=18 (other settings identical to the
# depth sweep) and tabulate its accuracy on all three partitions.
final_metrics = []

tree = RandomForestClassifier(
    n_estimators=150,
    criterion='gini',
    max_depth=18,
    min_samples_leaf=3,
    bootstrap=True,
    class_weight=None,
    random_state=17,
)
tree = tree.fit(X_train, y_train)

in_sample_accuracy = tree.score(X_train, y_train)
out_of_sample_accuracy = tree.score(X_validate, y_validate)
test_accuracy = tree.score(X_test, y_test)

output = dict(
    max_depth=18,
    train_accuracy=in_sample_accuracy,
    validate_accuracy=out_of_sample_accuracy,
    test_accuracy=test_accuracy,
)
final_metrics.append(output)

# Single-row summary frame.
rf_df = pd.DataFrame(final_metrics)
rf_df

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,test_accuracy
0,18,0.524343,0.470334,0.476868


In [19]:
def model_data(csv_path='modeling_df.csv', n_top_targets=4):
    """Build the encoded modeling frame and split it into three partitions.

    Steps:

    1. Load the wrangled frame via ``wrangle.create_terrorism_df`` and keep
       only the columns that are not listed in ``cols_to_drop``.
    2. Keep the ``n_top_targets`` most frequent ``target`` values and replace
       every other value with ``'other'``.
    3. Read the modeling CSV, overwrite its ``month`` and ``year`` columns with
       string-typed values from the wrangled frame, and one-hot encode the
       selected feature columns.
    4. Join the label frame and the encoded features column-wise and pass the
       result to ``wrangle.split_data``.

    Parameters
    ----------
    csv_path : str, default 'modeling_df.csv'
        Location of the modeling CSV.
    n_top_targets : int, default 4
        Number of most frequent target categories that keep their own label.

    Returns
    -------
    tuple
        ``(encoded_train, encoded_validate, encoded_test)`` as produced by
        ``wrangle.split_data``.
    """
    # Re-index once, up front, so every later label-aligned operation (the
    # month/year assignment and the column-wise concat) lines up with the
    # RangeIndex that read_csv gives the CSV frame.
    # NOTE(review): assumes the CSV rows are in the same order as the wrangled
    # frame — confirm against the code that wrote the CSV.
    df = wrangle.create_terrorism_df().reset_index(drop=True)

    cols_to_drop = [
        'eventid', 'year', 'month', 'day',
        'country', 'region', 'provstate', 'city',
        'latitude', 'longitude',
        'success', 'suicide', 'attack_type',
        'targ_desc', 'targeted_group', 'tg_desc',
        'nationality', 'atk_group', 'claimed',
        'weap_type', 'weap_sub',
        'killed', 'us_killed', 'ter_killed',
        'wounded', 'us_wounded', 'ter_wounded',
        'property',
    ]
    df3 = df.drop(columns=cols_to_drop)

    # value_counts sorts by frequency, so everything past the first
    # n_top_targets entries is folded into 'other'.
    ranked_targets = df3['target'].value_counts().index.to_list()
    bottom_targ_groups = ranked_targets[n_top_targets:]
    if bottom_targ_groups:
        df3['target'] = df3['target'].replace(bottom_targ_groups, 'other')

    data = pd.read_csv(csv_path)
    # Strings so that get_dummies encodes month/year as categories.
    data['month'] = df['month'].astype('str')
    data['year'] = df['year'].astype('str')
    # 'Unnamed: 0' is presumably an index column saved into the CSV; tolerate
    # files that do not carry it.
    data = data.drop(columns='Unnamed: 0', errors='ignore')

    feature_cols = [
        'Cluster',
        'provstate',
        'year',
        'suicide',
        'country',
        'city',
        'property',
        'nationality',
        'month',
        'attack_type',
        'atk_group',
        'weap_type',
        'weap_sub',
    ]
    # drop_first takes a bool; the earlier list value only worked by truthiness.
    dummy_df = pd.get_dummies(data[feature_cols], dummy_na=False, drop_first=True)

    df_trial2 = pd.concat([df3, dummy_df], axis=1)

    # Split the data.
    encoded_train, encoded_validate, encoded_test = wrangle.split_data(df_trial2)

    return encoded_train, encoded_validate, encoded_test


In [21]:
# Rebuild the three partitions through model_data() so they can be compared
# with the ones produced cell by cell earlier in the notebook.
encoded_train2, encoded_validate2, encoded_test2 = model_data()

In [23]:
# Shapes of the partitions returned by model_data().
tuple(part.shape for part in (encoded_train2, encoded_validate2, encoded_test2))

((24812, 370), (10635, 370), (8862, 370))

In [24]:
# Shapes of the partitions built cell by cell, for comparison.
tuple(part.shape for part in (encoded_train, encoded_validate, encoded_test))

((24812, 370), (10635, 370), (8862, 370))

In [25]:
# Rebind the feature/label variables to the partitions from model_data().
# This overwrites the X_*/y_* values defined earlier in the notebook.
y_train = encoded_train2['target']
X_train = encoded_train2.drop(columns='target')

y_validate = encoded_validate2['target']
X_validate = encoded_validate2.drop(columns='target')

y_test = encoded_test2['target']
X_test = encoded_test2.drop(columns='target')

In [26]:
# Refit the max_depth=18 forest on the partitions produced by model_data() and
# tabulate its accuracy; the settings match the earlier final-model cell.
final_metrics = []

tree = RandomForestClassifier(
    n_estimators=150,
    criterion='gini',
    max_depth=18,
    min_samples_leaf=3,
    bootstrap=True,
    class_weight=None,
    random_state=17,
)
tree = tree.fit(X_train, y_train)

in_sample_accuracy = tree.score(X_train, y_train)
out_of_sample_accuracy = tree.score(X_validate, y_validate)
test_accuracy = tree.score(X_test, y_test)

output = dict(
    max_depth=18,
    train_accuracy=in_sample_accuracy,
    validate_accuracy=out_of_sample_accuracy,
    test_accuracy=test_accuracy,
)
final_metrics.append(output)

# Single-row summary frame.
rf_df = pd.DataFrame(final_metrics)
rf_df

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,test_accuracy
0,18,0.524343,0.470334,0.476868
