In [1]:
import numpy as np  
import pandas as pd  
import matplotlib.pyplot as plt
import os
from sklearn import decomposition, preprocessing
from sklearn.model_selection import train_test_split
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
train_path = os.getcwd() + '/criminal_train.csv'
test_path = os.getcwd() + '/criminal_test.csv'

In [3]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [4]:
# Dividing datasets to input and output variables. 
y = train['Criminal']
X = train.drop(labels='Criminal',axis=1)

In [5]:
if 'Criminal' in test.columns:
    X_test = test.drop(labels='Criminal',axis=1)
else:
    X_test = test
X_test.head()

Unnamed: 0,PERID,IFATHER,NRCH17_2,IRHHSIZ2,IIHHSIZ2,IRKI17_2,IIKI17_2,IRHH65_2,IIHH65_2,PRXRETRY,...,POVERTY3,TOOLONG,TROUBUND,PDEN10,COUTYP2,MAIIN102,AIIND102,ANALWT_C,VESTR,VEREP
0,66583679,4,0,4,1,2,1,1,1,99,...,2,2,2,1,1,2,2,16346.7954,40020,1
1,35494679,4,0,4,1,1,1,1,1,99,...,3,2,2,1,1,2,2,3008.863906,40044,2
2,79424679,2,0,3,1,2,1,1,1,99,...,1,2,2,2,2,2,2,266.952503,40040,2
3,11744679,4,0,6,1,2,1,1,1,99,...,3,2,2,1,1,2,2,5386.928199,40017,1
4,31554679,1,0,4,1,3,1,1,1,99,...,3,2,1,1,1,2,2,173.489895,40017,1


In [6]:
if 'NRCH17_2' in X.columns:
    del X['NRCH17_2']
    
if 'PERID' in X.columns:
    del X['PERID']
    
if 'ANALWT_C' in X.columns:
    del X['ANALWT_C']
    
if 'NRCH17_2' in X_test.columns:
    del X_test['NRCH17_2']
    
if 'PERID' in X_test.columns:
    X_test_PERID = X_test['PERID']
    del X_test['PERID']
    
if 'ANALWT_C' in X_test.columns:
    del X_test['ANALWT_C']

In [7]:
column = list(X.columns)
# mark zero values as missing or NaN
X[column] = X[column].replace(0, np.NAN)
# count the number of NaN values in each column
print(X.isnull().sum())

column = list(X_test.columns)
# mark zero values as missing or NaN
X_test[column] = X_test[column].replace(0, np.NAN)
# count the number of NaN values in each column
print(X_test.isnull().sum())

IFATHER      0
IRHHSIZ2     0
IIHHSIZ2     0
IRKI17_2     0
IIKI17_2     0
IRHH65_2     0
IIHH65_2     0
PRXRETRY     0
PRXYDATA     0
MEDICARE     0
CAIDCHIP     0
CHAMPUS      0
PRVHLTIN     0
GRPHLTIN     0
HLTINNOS     0
HLCNOTYR     0
HLCNOTMO     0
HLCLAST      0
HLLOSRSN     0
HLNVCOST     0
HLNVOFFR     0
HLNVREF      0
HLNVNEED     0
HLNVSOR      0
IRMCDCHP     0
IIMCDCHP     0
IRMEDICR     0
IIMEDICR     0
IRCHMPUS     0
IICHMPUS     0
            ..
IIINSUR4     0
OTHINS       0
CELLNOTCL    0
CELLWRKNG    0
IRFAMSOC     0
IIFAMSOC     0
IRFAMSSI     0
IIFAMSSI     0
IRFSTAMP     0
IIFSTAMP     0
IRFAMPMT     0
IIFAMPMT     0
IRFAMSVC     0
IIFAMSVC     0
IRWELMOS     0
IIWELMOS     0
IRPINC3      0
IRFAMIN3     0
IIPINC3      0
IIFAMIN3     0
GOVTPROG     0
POVERTY3     0
TOOLONG      0
TROUBUND     0
PDEN10       0
COUTYP2      0
MAIIN102     0
AIIND102     0
VESTR        0
VEREP        0
Length: 68, dtype: int64
IFATHER      0
IRHHSIZ2     0
IIHHSIZ2     0
IRKI17_2     0


In [9]:
#diving input training dataset into train and valdiation datasets
X_train, X_val = train_test_split(X,test_size=0.1,train_size=0.9)

#diving output training dataset into train and valdiation datasets
y_train, y_val = train_test_split(y,test_size=0.1,train_size=0.9)

X_train = X
y_train = y

In [10]:
model = LogisticRegression()
rfe = RFE(model, 12)
fit = rfe.fit(X_train, y_train)
print("Num Features: ", fit.n_features_) 
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_) 

Num Features:  12
Selected Features:  [ True False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True  True False False False  True  True False  True False False
 False  True  True  True False False  True False False  True False False
 False False False False False False False  True False False False False
 False False False False False False False False]
Feature Ranking:  [ 1 17 50  6 20  9 15 56 41 49 55 28 24 25 26 32 46 31 52 54 44 43 47 53
 21  1  1 27 22  8  1  1 30  1 29 38 45  1  1  1 40 48  1 14 37  1 12  2
 39 42 33  3 51 34  4  1 11 10 13  5 36 35 18 19 23  7 57 16]


In [11]:
columns = X_train.columns
# Removing un important features from training set
for i in range(len(columns)):
    if not fit.support_[i]:
        if columns[i] in X_train.columns:
            del X_train[columns[i]]
        if columns[i] in X_val.columns:
            del X_val[columns[i]]

columns = X_test.columns
# Removing un important features from training set
for i in range(len(columns)):
    if not fit.support_[i]:
        if columns[i] in X_test.columns:
            del X_test[columns[i]]

In [12]:
X_train.columns, X_test.columns

(Index(['IFATHER', 'IIMCDCHP', 'IRMEDICR', 'IRPRVHLT', 'IIPRVHLT', 'IIOTHHLT',
        'IRINSUR4', 'IIINSUR4', 'OTHINS', 'IRFAMSOC', 'IIFAMSSI', 'IRFAMIN3'],
       dtype='object'),
 Index(['IFATHER', 'IIMCDCHP', 'IRMEDICR', 'IRPRVHLT', 'IIPRVHLT', 'IIOTHHLT',
        'IRINSUR4', 'IIINSUR4', 'OTHINS', 'IRFAMSOC', 'IIFAMSSI', 'IRFAMIN3'],
       dtype='object'))

In [13]:
X_val.shape, X_train.shape, y_val.shape, y_train.shape, X_test.shape

((4572, 12), (45718, 12), (4572,), (45718,), (11430, 12))

In [14]:
LogReg = LogisticRegression()
LogReg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
y_val_pred = LogReg.predict(X_val)
y_test_pred = LogReg.predict(X_test)

In [16]:
LogReg.score(X_train, y_train), LogReg.score(X_val, y_val)

(0.9391268209457981, 0.9118547681539807)

In [18]:
confusion_matrix = confusion_matrix(y_val, y_val_pred)
confusion_matrix

TypeError: 'numpy.ndarray' object is not callable

In [30]:
y_test_pred.shape, X_test.shape

((11430,), (11430, 13))

In [20]:
X_test['Criminal'] = y_test_pred

In [51]:
type(X_test_PERID), type(y_test_pred)

(pandas.core.series.Series, numpy.ndarray)

In [53]:
final_dataframe = pd.DataFrame(X_test_PERID)

In [55]:
final_dataframe['Criminal'] = y_test_pred

In [62]:
final_dataframe.head()

Unnamed: 0,PERID,Criminal
0,66583679,0
1,35494679,0
2,79424679,0
3,11744679,0
4,31554679,0


In [63]:
final_dataframe.to_csv('submission_26-03-2018.csv', index=False)