In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.tree import export_graphviz
from sklearn.svm import SVC
import graphviz

In [2]:
%matplotlib notebook
# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

import os

# Directory holding the Graphviz binaries on the original (Windows) machine.
# The `dot` executable must be reachable through PATH to render tree images.
GRAPHVIZ_BIN = 'C:/Program Files (x86)/Graphviz2.38/bin/'

# Extend PATH only when the directory exists and is not already listed, so that
# re-running this cell does not keep growing PATH and machines without this
# install location are left untouched.
current_path = os.environ.get("PATH", "")
if os.path.isdir(GRAPHVIZ_BIN) and GRAPHVIZ_BIN not in current_path.split(os.pathsep):
    os.environ["PATH"] = os.pathsep.join(part for part in (current_path, GRAPHVIZ_BIN) if part)

In [3]:
# --- 2011 survey -------------------------------------------------------------
# The 2011 question codes are re-labelled (presumably so they line up with the
# numbering used by the 2013/2015 files -- confirm against the questionnaires).
# `rename` looks up each original label once, so the chained-looking shifts
# (P32 -> P31, P31 -> P30, ...) do not cascade.
# Cells holding a single blank (' ') are recoded as -2.
df1 = (
    pd.read_csv("Violencia_Domestica_2011.csv", sep=';', keep_default_na=True)
    .rename(
        index=str,
        columns={
            'P07': 'P08',
            # P15x -> P07x
            'P151': 'P071', 'P152': 'P072', 'P153': 'P073', 'P154': 'P074',
            # P23..P32 shift down by one
            'P32': 'P31', 'P31': 'P30', 'P30': 'P29', 'P29': 'P28', 'P28': 'P27',
            'P27': 'P26', 'P26': 'P25', 'P25': 'P24', 'P24': 'P23', 'P23': 'P22',
            # P22x -> P21x
            'P227': 'P217', 'P226': 'P216', 'P225': 'P215', 'P224': 'P214',
            'P223': 'P213', 'P222': 'P212', 'P221': 'P211',
            # P21xx -> P20xx
            'P2111': 'P2011', 'P2110': 'P2010', 'P2109': 'P2009', 'P2108': 'P2008',
            'P2107': 'P2007', 'P2106': 'P2006', 'P2105': 'P2005', 'P2104': 'P2004',
            'P2103': 'P2003', 'P2102': 'P2002', 'P2101': 'P2001',
            # P20 becomes P19, the column later used as the target
            'P20': 'P19',
            # P19x -> P18x
            'P197': 'P187', 'P196': 'P186', 'P195': 'P185', 'P194': 'P184',
            'P193': 'P183', 'P192': 'P182', 'P191': 'P181',
            # P16..P18 shift down by one
            'P18': 'P17', 'P17': 'P16', 'P16': 'P15',
        },
    )
    .replace(' ', -2)
    .drop(columns=['P14', 'P33_1', 'P33_2', 'N_PESSOAS'])
)

# --- 2013 survey -------------------------------------------------------------
# Used with its own column labels; only two columns are discarded.
df2 = (
    pd.read_csv("Violencia_Domestica_2013.csv", sep=';')
    .drop(columns=['P14', 'N_PESSOAS'])
)

# --- 2015 survey -------------------------------------------------------------
# Same blank -> -2 recoding as the 2011 file, applied after dropping columns.
df3 = (
    pd.read_csv("Violencia_Domestica_2015.csv", sep=';', keep_default_na=True)
    .drop(columns=['P14', 'PESO'])
    .replace(' ', -2)
)


In [4]:
# Stack the three survey years row-wise. sort=True orders the resulting columns
# alphabetically; columns missing from a given year are filled with NaN.
merged = pd.concat(objs=[df1, df2, df3], axis='index', sort=True)

In [5]:
# Question columns all start with "P"; every one of them is discarded except
# P19, which is kept as the target.
cols = [name for name in merged.columns if name.startswith('P') and name != 'P19']

# Remove the question columns together with the survey year.
clean_merged = merged.drop(columns=cols + ['ANO'])

# Discard rows where any of these attributes holds 99, which is treated here as
# the "missing value" code.
clean_merged = clean_merged[
    (clean_merged[['RACACOR', 'RENDA', 'ESC', 'IDADE', 'SEXO']] != 99).all(axis=1)
]

# Attribute types:
#   ESC     (education level)  - ordinal
#   IDADE   (age)              - ordinal
#   RENDA   (income)           - ordinal
#   RACACOR (race/colour)      - categorical
#   REGIAO  (region)           - categorical
#   SEXO    (sex)              - categorical
#   UF      (state)            - categorical
#   P19                        - target

# One-hot encode the categorical attributes; ordinal ones keep their codes.
dummy_merged = pd.get_dummies(clean_merged, columns=['RACACOR', 'REGIAO', 'SEXO', 'UF'])

# Split by target: P19 == 1 means the woman has suffered domestic violence
# (positive), P19 == 2 means she has not (negative). Other P19 values are left out.
positive = dummy_merged.loc[dummy_merged['P19'] == 1]
negative = dummy_merged.loc[dummy_merged['P19'] == 2]

In [6]:
# The two classes are unbalanced. They are balanced here by OVERSAMPLING the
# positive class (assumed to be the smaller one): positives are drawn with
# replacement until there are as many of them as negatives.
#
# NOTE(review): oversampling duplicates positive rows. If cross-validation is
# later run on `merged_sampled`, copies of the same row can fall in both the
# training and the test fold, which inflates the reported scores. Resampling
# inside each training fold avoids that.

# Fixed seed so the drawn sample (and everything computed from it) is reproducible.
SAMPLING_SEED = 42

# Oversampling: exactly len(negative) positives, drawn with replacement.
sampled_positive = positive.sample(n=len(negative), replace=True, random_state=SAMPLING_SEED)
# All negatives, in shuffled order.
sampled_negative = negative.sample(frac=1, random_state=SAMPLING_SEED)

# Undersampling alternative: keep every positive (positive.sample(frac=1)) and
# draw only len(positive) negatives without replacement.

merged_sampled = pd.concat([sampled_positive, sampled_negative])
merged_sampled.shape

(5834, 43)

In [7]:
# Function to plot a RandomForest tree 
def plot_tree(estimator, feature_names = [name for name in  merged_sampled.columns if name != 'P19'], class_names = ['Violented','Not Violented']):
    """Render a single decision tree to ``tree.png`` and return it for display.

    Parameters
    ----------
    estimator
        Fitted decision tree accepted by ``export_graphviz`` (for a random
        forest, one element of ``estimators_``).
    feature_names
        Column names in the order of the feature matrix. The default is computed
        once, when this function is defined, from ``merged_sampled`` without the
        ``P19`` target column.
    class_names
        Class labels passed to ``export_graphviz``, which expects them in
        ascending order of the estimator's classes.

    Returns
    -------
    IPython.display.Image
        The rendered PNG. Make the call the last expression of a cell (or wrap
        it in ``display``) to show it in the notebook.

    Raises
    ------
    subprocess.CalledProcessError
        If the Graphviz ``dot`` command exits with a non-zero status.
    FileNotFoundError
        If ``dot`` is not on PATH.

    Side effects: writes ``tree.dot`` and ``tree.png`` in the working directory.
    """
    from subprocess import check_call
    from IPython.display import Image

    # With out_file set, export_graphviz writes the file and returns None, so
    # there is no return value worth keeping.
    export_graphviz(estimator, out_file='tree.dot',
                    feature_names = feature_names,
                    class_names = class_names,
                    rounded = True, proportion = False,
                    precision = 2, filled = True)

    # Convert to png using system command (requires Graphviz). check_call raises
    # on failure instead of silently leaving a missing or stale tree.png behind.
    check_call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

    # Return the image: an Image that is merely constructed inside a function is
    # never rendered by Jupyter.
    return Image(filename = 'tree.png')

In [8]:
# Normalizing values for machine learning: every column (features AND target)
# is min-max scaled to [0, 1].
# NOTE(review): the scaler is fitted on the full data set before the folds are
# split, so test-fold minima/maxima are visible during training.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(merged_sampled.astype(float))

# Locate the target column by name instead of relying on a hard-coded position.
target_idx = merged_sampled.columns.get_loc('P19')

# Y = target, X = all remaining columns (in their original order).
# After scaling, P19 == 1 (suffered violence) becomes 0.0 and P19 == 2 becomes 1.0.
Y = scaled_data[:, target_idx]
X = np.delete(scaled_data, target_idx, axis=1)

# Creating the classifier (seeded so repeated runs give the same scores)
clf = RandomForestClassifier(random_state=42)

# Creating a k-fold iterator that keeps the class ratio in every fold
skf = StratifiedKFold(n_splits=3)

# For each fold: train on the training indices, report on the held-out indices
for train, test in skf.split(X, Y):
    clf.fit(X[train], Y[train])
    print(classification_report(Y[test], clf.predict(X[test])))

              precision    recall  f1-score   support

         0.0       0.73      0.87      0.79       973
         1.0       0.84      0.68      0.75       973

   micro avg       0.77      0.77      0.77      1946
   macro avg       0.78      0.77      0.77      1946
weighted avg       0.78      0.77      0.77      1946

              precision    recall  f1-score   support

         0.0       0.75      0.89      0.81       972
         1.0       0.86      0.70      0.77       972

   micro avg       0.79      0.79      0.79      1944
   macro avg       0.81      0.79      0.79      1944
weighted avg       0.81      0.79      0.79      1944

              precision    recall  f1-score   support

         0.0       0.72      0.87      0.79       972
         1.0       0.84      0.66      0.74       972

   micro avg       0.76      0.76      0.76      1944
   macro avg       0.78      0.76      0.76      1944
weighted avg       0.78      0.76      0.76      1944





In [9]:
from pickle import load, dump

# Refit the classifier on every available row (no hold-out), then persist it.
clf = clf.fit(X, Y)

# Serialize the fitted model to model.pkl in the working directory.
with open('model.pkl', 'wb') as model_file:
    dump(clf, model_file)

In [10]:
# Restore the classifier previously written to model.pkl.
# NOTE: unpickling can execute arbitrary code -- only load a model.pkl that was
# produced by this notebook or comes from a trusted source.
with open('model.pkl', 'rb') as model_file:
    clf = load(model_file)