<a href="https://colab.research.google.com/github/devorahst/Test/blob/main/Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Analysis of Factors that Influence the Likelihood a RapeKit Sample will Produce a Profile Eligible for Use in Law Enforcement and Criminal Justice

#Set Up

##Import relevant libraries

In [None]:
import statsmodels.api as sm
import pandas as pd  
import numpy as np
from patsy import dmatrices

In [None]:

from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())

## Part 1. Upload
First, we must download the dataset. Upon running the cell, you will be prompted to login to your Gmail account. You will then be provided with a one-time use code to copy and paste into the slot below. After hitting enter, the dataset will load into this script.

In [None]:
#pulls up our SAK dataset
#@title uploader
file_id = "1lMSXIdMUQTZVOOk2wG-1awvPEIlypfxV" #@param {type:"string"}
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

from google.colab import auth
auth.authenticate_user()

from googleapiclient.discovery import build
drive_service = build('drive', 'v3')

import io
from googleapiclient.http import MediaIoBaseDownload

request = drive_service.files().get_media(fileId=file_id)
downloaded = io.BytesIO()
downloader = MediaIoBaseDownload(downloaded, request)
done = False
while done is False:
  _, done = downloader.next_chunk()

fileId = drive.CreateFile({'id': file_id }) #DRIVE_FILE_ID is file id example: 1iytA1n2z4go3uVCwE_vIKouTKyIDjEq
print(fileId['title'])  
fileId.GetContentFile(fileId['title'])  # Save Drive file as a local file

DataFrame_corrected.csv


## Part 2. Set Up
1. Install and Import - Install SciKit-Learn and import necessary packages
2. Clean the Dataset - Label the columns and replace null values with "No Response" and sort features based on type.
3. Display Cleaned Dataset

###1. Install and Import

In [None]:
pip install -U scikit-learn



In [None]:
from scipy.stats import chi2_contingency
from scipy import stats
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# To change scientific numbers to float
np.set_printoptions(formatter={'float_kind':'{:f}'.format})


###2. Read in File and Clean the Dataset



#####**Label the columns and replace null values with "No Response"**


In [None]:
with open(fileId['title'], encoding="utf8", errors='ignore') as f:
  df = pd.read_csv(f)

df = df.astype('string')
df = df.replace(r'^\s+$', np.nan, regex=True)
df = df.replace({np.nan: "No Response"})

df = df[df['ProfileofSTRDNAloci'] != "No Response"]
df = df[df['CODISNDISeligibleProfile'] != "No Response"]

df = df[df['Site'] != '6'] #filter out other state data
df = df[df['Gender'] == '1']  #dataframe containing information from only female respondents

#Add col showing STRDNA profile eligible/not eligible
def makeContingency(row):
  good_values = ['1','2']  
  eligibility = row["ProfileofSTRDNAloci"]
  if eligibility in good_values:
    return "1"
  else:
    return "0"


df['STRDNAEligible'] = df.apply(makeContingency, axis=1)

predictedVariable = "STRDNAEligible"

  interactivity=interactivity, compiler=compiler, result=result)


#####**Sort Features Based on Type**
Because our features are not all categorical, we cannot calculate each feature's p-value using the same method. We must sort our features into numerical, categorical, and string entries and then apply the correct method to obtain each p-value. For categorical features, we will run chi-square tests while we will use analysis of variance (ANOVA) with T-Testing for numerical features. The meaning of the variables are all described in the [code book](https://drive.google.com/file/d/18PDTuK0lshc193lXA3b7UDgcRzMoEEGA/view?usp=sharing).

In [None]:
predictedFeatures = ['CODISNDISeligibleProfile', 'SDISeligibleprofile']  

numericalFeatures = ['Age', 'Timebetweenassaultandexaminhours', 'PainLevel', 'MulitipleSuspectNumber', 
                     'NumberofUnknownresponses', 'NumberAssaultiveActs', 'Numberofphysicalinjuries', 'Numberofgentialinjuries',
                     'NumberOFitemsTested', 'TimeBetweenCollectAndDNAext', 'TimeBetweenSubmissionANDtesting', 'NumberOfswabsQuantMaleDNA',
                     'NumberOfswabsDNAanalysis', 'NumberofSTRDNAloci', 'NumberOFswabsSTRDNAprofile', 'NumberOfYSTRDNAloci']

categoricalFeatures = ['Site', 'EXAMbySANE', 'YearKitCollected', 'KITbroughtTOcrimelab', 'KITlengthofSubmissionTime',
                       'UnderAge18', 'Gender', 'ExamDeclined', 'Noninterview', 'Race', 'PriorHxofSAover14',
                       'PriorHxofSAunder14', 'Student', 'Military', 'Pain', 'PainLocation1','PainLocation2', 
                       'PainLocation3', 'PainLocation4','PainTreatment', 'PermanentAddress', 'CurrentPhysicalmedprob',
                       'MedProbChronic', 'MedProbInfection', 'MedProbBlood', 'MedProbCardiac', 'MedProbEar', 'MedProbEndocrine',
                       'MedProbEye', 'MedProbGI', 'MedProbGU', 'MedProbGYN', 'MedProbImmune', 'MedProbMusculoskeletal', 'MedProbNeurological',
                       'MedProbOral', 'MedProbRenal', 'MedProbRespiratory', 'MedProbSkin', 'MedProbOther', 'Medication',
                       'PsychotropicMEDuse', 'PsychotropicANTIPSYCHOTICSatypical', 'PsychotropicSTIMULANTuse', 'PsychotropicANTIANXIETY', 
                       'PsychotropicANTIDEPRESSANTS', 'PsychotropicANTISEIZUREbipolar', 'PsychotropicADDICTIONmeds','PsychotropicSLEEPaid', 'PsychotropicOTHER', 
                       'PsychotropicANTIPSYCHOTICStypical', 'PolypharmacyPsychMeds', 'ImmunizationstatusTETANUS', 'ReceivedTetanus',
                       'ImmunizationstatusHEP', 'ReceivedHepB', 'Sexualcontactwithin120hours', 'Selfdisclosurementalillness', 'MIdepression',
                       'MIanxiety', 'MIPTSD', 'MIpsychoticDisorders', 'MIadhd', 'MIpersonalitydisorder', 'MIbipolar', 'MIeatingdisorder', 'MIdrugalcoholdisorders', 
                       'MIother', 'SelfDiscolsureMentalillnessORuseofpsychotropics', 'OnlineMeetingOFsuspect', 'Suspectrelationship',
                       'Locationofassault', 'PatientActionScratch', 'PatientActionBite', 'PatientActionHit', 'PatientActionKick', 'PatientActionOther',
                       'Suspectrace', 'SuspectactionVERBAL', 'SuspectactionsGRABBEDHELD', 'SuspectactionsPHYSICALBLOWS', 'SuspectactionsSTRANGLEDCHOKED',
                       'SuspectactionsWEAPON', 'SuspectactionsRESTRAINTS', 'SuspectactionsBURNED', 'MultipleSuspects', 'SuspectedDrugfacilitated',
                       'Patientdruguse', 'PatientETOHuse', 'Suspectdruguse', 'SuspectETOHuse', 'PatientSuspectETOHordrug', 'LossOFconsciousnessORawareness',
                       'OneORmoreunknownanswer', 'Unknownanswerto4ormorequestions', 'UnknownanswertoALL', 'AsleepANDawakenedtoassault', 'MemoryLoss',
                       'LossOfconsciousness', 'DecreasedAwareness', 'TonicImmobility', 'Detachment', 'NOSApatientsVAGINApenis', 'NOSApatientsVAGINAfingerhand',
                       'NOSApatientsVAGINAmouth', 'NOSApatientsVAGINAobject', 'NOSApatientsANUSpenis', 'NOSApatientsANUSfingerhand', 'NOSApatientsANUSmouth', 
                       'NOSApatientsANUSobject', 'NOSApatientsPENISgenitals', 'NOSApatientsPENISfinger', 'NOSApatientsPENISmouth', 'NOSApatientsPENISobject', 
                       'NOSApatientsMOUTHpenis', 'NOSApatientsMOUTHfinger', 'NOSApatientsMOUTHmouth', 'NOSApatientsMOUTHobject', 'SUSPECTmouthcontactGENITALS', 
                       'SUSPECTmouthcontactMOUTH', 'SUSPECTmouthcontactOTHER', 'SUSPECTmouthcontactOTHERsite', 'HANDSofSuspectBreast', 'HANDSofSuspectExtremities', 
                       'HANDSofSuspectOther', 'Ejaculation', 'CONDOMuse', 'LUBRICATIONuse', 'SuspectWASHEDpatient', 'SuspectINJUREDbypatient', 'PostassaultURINATED', 
                       'PostassaultDEFECATED', 'PostassaultDOUCHED', 'PostassaultVOMITED', 'PostassaultGARGLED', 'PostassaultBRUSHEDTEETH', 'PostassaultATEdrank', 
                       'PostassaultBATHED', 'PostassaultGENITALWIPE', 'PostassaultCHANGEDCLOTHING', 'PostassaultREMOVEDInserted', 'PhysicalORmentalimpairment', 'Physicalinjury', 
                       'LPIhead', 'LPIneck', 'LPIbreasts', 'LPIchestback', 'LPIabdomen', 'LPIextremities', 'TPIlaceration', 'TPIecchymosis', 'TPIabrasion', 'TPIredness', 
                       'TPIswelling', 'TPIbruise', 'TPIpetechiae', 'TPIincision', 'TPIavulsion', 'TPIdiscoloredmark', 'TPIpuncturewound', 'TPIfracture', 
                       'TPIbitemark', 'TPIburn', 'TPImissingorbrokenTEETH', 'TPIconjunctivalhemorrhage', 'Genitalinjury', 'LGIinnerthighs', 'LGIclitoralhoodclitoris', 
                       'LGIlabiamajora', 'LGIlabiaminora', 'LGIperiurethraltissueURETHRA', 'LGIperihymenaltissue', 'LGIhymen', 'LGIvagina', 'LGIcervix', 'LGIfossanavicularis', 
                       'LGIposteriorfourchette', 'LGIperineum', 'LGIperineum', 'LGIanalrectal', 'LGIbuttocks', 'LGImalePerianalperineum', 'LGIglanspenis', 'LGIpenileshaft', 
                       'LGImaleURETHRALmeatus', 'LGIscrotum', 'LGItestes', 'LGImaleanus', 'LGImalerectum', 'TGIlaceration', 'TGIecchymosis', 'TGIabrasion', 'TGIredness', 
                       'TGIswelling', 'TGIbruise', 'TGIpetechiae', 'TGIincision', 'TGIavulsion', 'TGIdiscoloredmark', 'TGIpuncturewound', 'ToludineDYEuptake', 'HIVnPEP', 
                       'UQuikcollected', 'Yscreen', 'NumberItemsWITH3cutoff', 'ItemsAnalyzed1', 'ItemsAnalyzed2', 'ItemsAnalyzed3', 'ItemsAnalyzed4', 'ItemsAnalyzed5', 
                       'ItemsAnalyzed6', 'ItemsAnalyzed7', 'ItemsAnalyzed8', 'ItemsAnalyzed9', 'ItemsAnalyzed10', 'TypesOFitemsTested', 'RandomSample20142015', 
                       'YearofDNAextraction', 'LocationOfTesting','DANYfundedSAK', 'DNAKitUsed', 'SerologyDoneBeforeDNA', 'QuantMaleDNAFound', 'QuantMaleSwabLoc1', 
                       'QuantMaleSwabLoc2', 'QuantMaleSwabLoc3', 'QuantMaleSwabLoc4', 'QuantMaleSwabLoc5', 'ProbableSTRDNAprofileOFsuspect', 'ProfileofSTRDNAloci', 'ProbableYSTRDNAprofile', 'ProfileOfYSTRDNAloci', 
                       'SwabLocationYSTRDNA', 'SecondSwabLocationYSTRDNA', 'SwabFromSuspectwithVictimDNA', 'ExcludeSuspect', 'ConsensualPartnerStandardSubmitted', 
                       'STRDNAProbableprofileTYPE', 'CODISprofileHit', 'STRDNAkitUsed', 'SUSPECTmouthcontactBREASTS', 'Swab1LocationSTRDNAprofile', 'Swab2LocationSTRDNAprofile',
                       'Swab3LocationSTRDNAprofile', 'SuspectStandardSubmitted', 'CODISNDISreasons', 'CODISSDISreasons']
swabToDNAFeatures = ['Swab1ToDNAanalysis', 'Swab2ToDNAanalysis', 'Swab3ToDNAanalysis', 'Swab4ToDNAanalysis']
#unusedFeatures and stringFeatures are columns that contain data that was relevant to medical professionals and for legal purposes, 
#but that aren't useful for our feature association or for predicting eligibility
unusedFeatures = ['filter_$', 'PainTreatmentYesNo', 'GenderMaleFemale', 'DVsuspect', 'RacePrimaryGroups', 'IPSAcombined', 'STRDNAcompleted', 
                  'PhysicalInjuryNOunknown', 'GenitalInjuryNOunknown']

stringFeatures = ['DeIdentifiedCase', 'Raceother', 'SchoolName', 'MilitaryBranchName', 'AddressIfnotPermanent', 'Currentmedprobtext',
                  'MedProbOtherText', 'Medicationtext', 'Sexualcontactwithin120hoursTYPE', 'SelfdisclosureMItype', 'OnlineMeetingName', 'SuspectrelationshipOTHER',
                  'LocationofassaultOTHER', 'Surfaceofassault', 'PatientActionOtherTEXT', 'SuspectraceOTHER', 'SuspectOTHERactions', 'NOSApatientsVAGINAobjectdescription',
                  'NOSApatientsANUSobjectdescription', 'NOSApatientsPENISobjectdescription', 'NOSApatientsMOUTHobjectdescription', 'EjaculationSITE', 'LUBRICATIONtype',
                  'SuspectINJUREDbypatientexplanation', 'Impairmentdescription', 'UBFSnumber', 'ISPnumber', 'DateSubmittedUBFS', 'DateofDNAextractionReport',
                  'BodySwabLocQuant', 'BodySwabDNAanalysis', 'BodySwabLocationSTRDNA', 'BodySwabYSTRDNA', 'ISPnotes2020', 'UBFSnotes2020', 'UBFSnotes2018', 'UBFSnotes2014']

##Name and Clean the Variables for Use

**Examine and Clean variables for use**

In [None]:
df = df.apply(pd.to_numeric, errors='ignore')

In [None]:
df = df.astype({'Site': int, 'EXAMbySANE': int, 'YearKitCollected': int, 'KITbroughtTOcrimelab': int, 'KITlengthofSubmissionTime': int, 'Age': str, 'UnderAge18': int, 'Gender': int, 'ExamDeclined': bool, 'Noninterview': bool, 'Timebetweenassaultandexaminhours': str, 'Race': int, 'Raceother': int, 'PriorHxofSAover14': bool, 'PriorHxofSAunder14': bool, 'Student': bool, 'SchoolName': str, 'Military': bool, 'MilitaryBranchName': str, 'Pain': bool, 'PainLevel': int, 'PainLocation1': int, 'PainLocation2': int, 'PainLocation3': int, 'PainLocation4': int, 'PainTreatment': bool}, errors='ignore') 

In [None]:
df['Timebetweenassaultandexaminhours_int'] = pd.to_numeric(df['Timebetweenassaultandexaminhours'], errors='coerce')

def makeContingency(row):
  value = row["Timebetweenassaultandexaminhours_int"]
  if value < 23:
    return "0"
  elif value < 47:
    return "1"
  elif value < 71:
    return "2"
  elif value < 95:
    return "3"
  elif value < 119:
    return "4"
  elif value < 900:
    return "5"
  else:
    return "6"

# 0 = "Under 24 hours"
# 1 = "1 Day"
# 2 = "2 Days" 
# 3 = "3 Days"
# 4 = "4 Days"
# 5 = "5+ Days"
# 6 = "nan"

df['timeBetweenAssaultAndExam'] = df.apply(makeContingency, axis=1)
print(df['timeBetweenAssaultAndExam'].value_counts())

df['timeBetweenAssaultAndExam'] = pd.to_numeric(df['timeBetweenAssaultAndExam'], errors='coerce')

0    2813
1     773
2     338
3     209
4     115
5      89
6      66
Name: timeBetweenAssaultAndExam, dtype: int64


In [None]:
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
def makeContingency(row):
  value = row["Age"]
  if value < 20:
    return "0"
  elif value < 30:
    return "1"
  elif value < 40:
    return "2"
  elif value < 50:
    return "3"
  elif value < 60:
    return "4"
  else:
    return "6"

df['ageOfVictim'] = df.apply(makeContingency, axis=1)
print(df['ageOfVictim'].value_counts())

df['ageOfVictim'] = pd.to_numeric(df['ageOfVictim'], errors='coerce')

1    1644
0    1263
2     866
3     408
4     155
6      67
Name: ageOfVictim, dtype: int64


In [None]:
df['MemoryLoss'] = pd.to_numeric(df['MemoryLoss'], errors='coerce')
df['MulitipleSuspectNumber'] = pd.to_numeric(df['MulitipleSuspectNumber'], errors='coerce')
df['Sexualcontactwithin120hours'] = pd.to_numeric(df['Sexualcontactwithin120hours'], errors='coerce')
df['Ejaculation'] = pd.to_numeric(df['Ejaculation'], errors='coerce')
df['CONDOMuse'] = pd.to_numeric(df['CONDOMuse'], errors='coerce')
df['PostassaultBATHED'] = pd.to_numeric(df['PostassaultBATHED'], errors='coerce')
df['NOSApatientsVAGINApenis'] = pd.to_numeric(df['NOSApatientsVAGINApenis'], errors='coerce')
df['NOSApatientsANUSpenis'] = pd.to_numeric(df['NOSApatientsANUSpenis'], errors='coerce')
df['NOSApatientsMOUTHpenis'] = pd.to_numeric(df['NOSApatientsMOUTHpenis'], errors='coerce')
df['SUSPECTmouthcontactGENITALS'] = pd.to_numeric(df['SUSPECTmouthcontactGENITALS'], errors='coerce')
df['SUSPECTmouthcontactBREASTS'] = pd.to_numeric(df['SUSPECTmouthcontactBREASTS'], errors='coerce')
df['SUSPECTmouthcontactOTHER'] = pd.to_numeric(df['SUSPECTmouthcontactOTHER'], errors='coerce')
df['NOSApatientsVAGINAfingerhand'] = pd.to_numeric(df['NOSApatientsVAGINAfingerhand'], errors='coerce')
df['SuspectactionsSTRANGLEDCHOKED'] = pd.to_numeric(df['SuspectactionsSTRANGLEDCHOKED'], errors='coerce')
df['SuspectactionsPHYSICALBLOWS'] = pd.to_numeric(df['SuspectactionsPHYSICALBLOWS'], errors='coerce')
df['SuspectactionsWEAPON'] = pd.to_numeric(df['SuspectactionsWEAPON'], errors='coerce')
df['Genitalinjury'] = pd.to_numeric(df['Genitalinjury'], errors='coerce')
df['Physicalinjury'] = pd.to_numeric(df['Physicalinjury'], errors='coerce')
df['NumberAssaultiveActs'] = pd.to_numeric(df['NumberAssaultiveActs'], errors='coerce')

In [None]:
df['Site'] = pd.to_numeric(df['Site'], errors='coerce')

In [None]:
df['YearKitCollected'] = pd.to_numeric(df['YearKitCollected'], errors='coerce')

In [None]:
df['KITbroughtTOcrimelab'] = pd.to_numeric(df['KITbroughtTOcrimelab'], errors='coerce')

In [None]:
df['UnderAge18'] = pd.to_numeric(df['UnderAge18'], errors='coerce')

In [None]:
df['Race'] = pd.to_numeric(df['Race'], errors='coerce')

In [None]:
df['MultipleSuspects'] = pd.to_numeric(df['MultipleSuspects'], errors='coerce')

In [None]:
df['SuspectactionsGRABBEDHELD'] = pd.to_numeric(df['SuspectactionsGRABBEDHELD'], errors='coerce')

#Logistic Model

Predict/Explain y (CODIS eligible profiles) as a function of X (all of the other variables that follow...time between assualt and examination, whether the swab was vaginal, etc. First we set up the equation, then we fit the model (mod.fit), then we print the results.

In [None]:
# first running a simple logistic regression
# y, X = dmatrices('CODISNDISeligibleProfile ~ Gender + Age_cat', data=df, return_type='dataframe')
# x is list of variables, y is predicted variable
# set up log equation
y, X = dmatrices('CODISNDISeligibleProfile ~ timeBetweenAssaultAndExam + ageOfVictim + MemoryLoss + MulitipleSuspectNumber + Sexualcontactwithin120hours + Ejaculation + CONDOMuse + PostassaultBATHED + NOSApatientsVAGINApenis + NOSApatientsANUSpenis + NOSApatientsMOUTHpenis + SUSPECTmouthcontactGENITALS + SUSPECTmouthcontactBREASTS + SUSPECTmouthcontactOTHER + NOSApatientsVAGINAfingerhand + SuspectactionsSTRANGLEDCHOKED + SuspectactionsPHYSICALBLOWS + SuspectactionsWEAPON + Genitalinjury + Physicalinjury + NumberAssaultiveActs', data=df, return_type='dataframe')

mod = sm.Logit(y, X)    # Describe model
res = mod.fit(method='bfgs')       # Fit model using standard method
print(res.summary())


         Current function value: 0.538968
         Iterations: 35
         Function evaluations: 37
         Gradient evaluations: 37
                              Logit Regression Results                              
Dep. Variable:     CODISNDISeligibleProfile   No. Observations:                  368
Model:                                Logit   Df Residuals:                      346
Method:                                 MLE   Df Model:                           21
Date:                      Thu, 11 Nov 2021   Pseudo R-squ.:                  0.1197
Time:                              21:06:02   Log-Likelihood:                -198.34
converged:                            False   LL-Null:                       -225.30
Covariance Type:                  nonrobust   LLR p-value:                 0.0001011
                                    coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------

