<a href="https://colab.research.google.com/github/devorahst/Test/blob/main/DVariable.ProfileOfSTRDNAloci/Section_1_BathedAndSwabs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Upload and Clean Dataset

## Part 1. Upload
First, we must download the dataset. When you run the cell, you will be prompted to log in to your Google account. You will then be given a one-time code to copy and paste into the prompt below. After you press Enter, the dataset will be downloaded into this notebook's environment.

In [None]:
#pulls up our SAK dataset
#@title uploader
file_id = "1lMSXIdMUQTZVOOk2wG-1awvPEIlypfxV" #@param {type:"string"}
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

from google.colab import auth
auth.authenticate_user()

from googleapiclient.discovery import build
drive_service = build('drive', 'v3')

import io
from googleapiclient.http import MediaIoBaseDownload

request = drive_service.files().get_media(fileId=file_id)
downloaded = io.BytesIO()
downloader = MediaIoBaseDownload(downloaded, request)
done = False
while done is False:
  _, done = downloader.next_chunk()

fileId = drive.CreateFile({'id': file_id }) #DRIVE_FILE_ID is file id example: 1iytA1n2z4go3uVCwE_vIKouTKyIDjEq
print(fileId['title'])  
fileId.GetContentFile(fileId['title'])  # Save Drive file as a local file

DataFrame_corrected.csv


## Part 2. Set Up
1. Install and Import - Install SciKit-Learn and import necessary packages
2. Clean the Dataset - Label the columns and replace null values with "No Response" and sort features based on type.
3. Display Cleaned Dataset

### 1. Install and Import

In [None]:
pip install -U scikit-learn



In [None]:
from scipy.stats import chi2_contingency
from scipy import stats
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Render NumPy floats in fixed-point notation (six decimals) instead of scientific notation
np.set_printoptions(formatter=dict(float_kind=lambda number: format(number, 'f')))


### 2. Read in File and Clean the Dataset



##### **Label the columns and replace null values with "No Response"**


In [None]:
# Read the downloaded CSV; bytes that are not valid UTF-8 are dropped instead of raising.
with open(fileId['title'], encoding="utf8", errors='ignore') as f:
  df = pd.read_csv(f)

# Work with every column as text, then turn whitespace-only cells and missing values
# into the explicit label "No Response".
df = (
    df.astype('string')
      .replace(r'^\s+$', np.nan, regex=True)
      .replace({np.nan: "No Response"})
)

# Row filters, combined into a single selection:
hasStrProfile = df['ProfileofSTRDNAloci'] != "No Response"
hasCodisAnswer = df['CODISNDISeligibleProfile'] != "No Response"
notSiteSix = df['Site'] != '6'    #filter out other state data
genderIsOne = df['Gender'] == '1'  #dataframe containing information from only female respondents
df = df[hasStrProfile & hasCodisAnswer & notSiteSix & genderIsOne]

#Add col showing STRDNA profile eligible/not eligible
def makeContingency(row):
  """Flag whether a case's STR DNA profile code counts as eligible.

  Args:
    row: A mapping or pandas Series with a "ProfileofSTRDNAloci" entry.

  Returns:
    "1" when that entry is '1' or '2', otherwise "0".
  """
  return "1" if row["ProfileofSTRDNAloci"] in ('1', '2') else "0"


# Name of the derived outcome column
predictedVariable = "STRDNAEligible"

# Fill it row by row with the "1"/"0" flag produced by makeContingency
df[predictedVariable] = df.apply(makeContingency, axis="columns")

  interactivity=interactivity, compiler=compiler, result=result)


##### **Sort Features Based on Type**
Because our features are not all categorical, we cannot calculate each feature's p-value using the same method. We must sort our features into numerical, categorical, and string entries and then apply the correct method to obtain each p-value. For categorical features, we will run chi-square tests, while for numerical features we will use analysis of variance (ANOVA) with t-tests. The meanings of the variables are described in the [code book](https://drive.google.com/file/d/18PDTuK0lshc193lXA3b7UDgcRzMoEEGA/view?usp=sharing).

In [None]:
# Columns grouped as "predicted" features
predictedFeatures = [
    'CODISNDISeligibleProfile',
    'SDISeligibleprofile',
]

# Columns grouped as numerical features
numericalFeatures = [
    'Age',
    'Timebetweenassaultandexaminhours',
    'PainLevel',
    'MulitipleSuspectNumber',
    'NumberofUnknownresponses',
    'NumberAssaultiveActs',
    'Numberofphysicalinjuries',
    'Numberofgentialinjuries',
    'NumberOFitemsTested',
    'TimeBetweenCollectAndDNAext',
    'TimeBetweenSubmissionANDtesting',
    'NumberOfswabsQuantMaleDNA',
    'NumberOfswabsDNAanalysis',
    'NumberofSTRDNAloci',
    'NumberOFswabsSTRDNAprofile',
    'NumberOfYSTRDNAloci',
]

# Columns grouped as categorical features.
# Each name appears once: 'LGIperineum' used to be listed twice, which would make any
# loop over this list process that column twice.
categoricalFeatures = ['Site', 'EXAMbySANE', 'YearKitCollected', 'KITbroughtTOcrimelab', 'KITlengthofSubmissionTime',
                       'UnderAge18', 'Gender', 'ExamDeclined', 'Noninterview', 'Race', 'PriorHxofSAover14',
                       'PriorHxofSAunder14', 'Student', 'Military', 'Pain', 'PainLocation1','PainLocation2', 
                       'PainLocation3', 'PainLocation4','PainTreatment', 'PermanentAddress', 'CurrentPhysicalmedprob',
                       'MedProbChronic', 'MedProbInfection', 'MedProbBlood', 'MedProbCardiac', 'MedProbEar', 'MedProbEndocrine',
                       'MedProbEye', 'MedProbGI', 'MedProbGU', 'MedProbGYN', 'MedProbImmune', 'MedProbMusculoskeletal', 'MedProbNeurological',
                       'MedProbOral', 'MedProbRenal', 'MedProbRespiratory', 'MedProbSkin', 'MedProbOther', 'Medication',
                       'PsychotropicMEDuse', 'PsychotropicANTIPSYCHOTICSatypical', 'PsychotropicSTIMULANTuse', 'PsychotropicANTIANXIETY', 
                       'PsychotropicANTIDEPRESSANTS', 'PsychotropicANTISEIZUREbipolar', 'PsychotropicADDICTIONmeds','PsychotropicSLEEPaid', 'PsychotropicOTHER', 
                       'PsychotropicANTIPSYCHOTICStypical', 'PolypharmacyPsychMeds', 'ImmunizationstatusTETANUS', 'ReceivedTetanus',
                       'ImmunizationstatusHEP', 'ReceivedHepB', 'Sexualcontactwithin120hours', 'Selfdisclosurementalillness', 'MIdepression',
                       'MIanxiety', 'MIPTSD', 'MIpsychoticDisorders', 'MIadhd', 'MIpersonalitydisorder', 'MIbipolar', 'MIeatingdisorder', 'MIdrugalcoholdisorders', 
                       'MIother', 'SelfDiscolsureMentalillnessORuseofpsychotropics', 'OnlineMeetingOFsuspect', 'Suspectrelationship',
                       'Locationofassault', 'PatientActionScratch', 'PatientActionBite', 'PatientActionHit', 'PatientActionKick', 'PatientActionOther',
                       'Suspectrace', 'SuspectactionVERBAL', 'SuspectactionsGRABBEDHELD', 'SuspectactionsPHYSICALBLOWS', 'SuspectactionsSTRANGLEDCHOKED',
                       'SuspectactionsWEAPON', 'SuspectactionsRESTRAINTS', 'SuspectactionsBURNED', 'MultipleSuspects', 'SuspectedDrugfacilitated',
                       'Patientdruguse', 'PatientETOHuse', 'Suspectdruguse', 'SuspectETOHuse', 'PatientSuspectETOHordrug', 'LossOFconsciousnessORawareness',
                       'OneORmoreunknownanswer', 'Unknownanswerto4ormorequestions', 'UnknownanswertoALL', 'AsleepANDawakenedtoassault', 'MemoryLoss',
                       'LossOfconsciousness', 'DecreasedAwareness', 'TonicImmobility', 'Detachment', 'NOSApatientsVAGINApenis', 'NOSApatientsVAGINAfingerhand',
                       'NOSApatientsVAGINAmouth', 'NOSApatientsVAGINAobject', 'NOSApatientsANUSpenis', 'NOSApatientsANUSfingerhand', 'NOSApatientsANUSmouth', 
                       'NOSApatientsANUSobject', 'NOSApatientsPENISgenitals', 'NOSApatientsPENISfinger', 'NOSApatientsPENISmouth', 'NOSApatientsPENISobject', 
                       'NOSApatientsMOUTHpenis', 'NOSApatientsMOUTHfinger', 'NOSApatientsMOUTHmouth', 'NOSApatientsMOUTHobject', 'SUSPECTmouthcontactGENITALS', 
                       'SUSPECTmouthcontactMOUTH', 'SUSPECTmouthcontactOTHER', 'SUSPECTmouthcontactOTHERsite', 'HANDSofSuspectBreast', 'HANDSofSuspectExtremities', 
                       'HANDSofSuspectOther', 'Ejaculation', 'CONDOMuse', 'LUBRICATIONuse', 'SuspectWASHEDpatient', 'SuspectINJUREDbypatient', 'PostassaultURINATED', 
                       'PostassaultDEFECATED', 'PostassaultDOUCHED', 'PostassaultVOMITED', 'PostassaultGARGLED', 'PostassaultBRUSHEDTEETH', 'PostassaultATEdrank', 
                       'PostassaultBATHED', 'PostassaultGENITALWIPE', 'PostassaultCHANGEDCLOTHING', 'PostassaultREMOVEDInserted', 'PhysicalORmentalimpairment', 'Physicalinjury', 
                       'LPIhead', 'LPIneck', 'LPIbreasts', 'LPIchestback', 'LPIabdomen', 'LPIextremities', 'TPIlaceration', 'TPIecchymosis', 'TPIabrasion', 'TPIredness', 
                       'TPIswelling', 'TPIbruise', 'TPIpetechiae', 'TPIincision', 'TPIavulsion', 'TPIdiscoloredmark', 'TPIpuncturewound', 'TPIfracture', 
                       'TPIbitemark', 'TPIburn', 'TPImissingorbrokenTEETH', 'TPIconjunctivalhemorrhage', 'Genitalinjury', 'LGIinnerthighs', 'LGIclitoralhoodclitoris', 
                       'LGIlabiamajora', 'LGIlabiaminora', 'LGIperiurethraltissueURETHRA', 'LGIperihymenaltissue', 'LGIhymen', 'LGIvagina', 'LGIcervix', 'LGIfossanavicularis', 
                       'LGIposteriorfourchette', 'LGIperineum', 'LGIanalrectal', 'LGIbuttocks', 'LGImalePerianalperineum', 'LGIglanspenis', 'LGIpenileshaft', 
                       'LGImaleURETHRALmeatus', 'LGIscrotum', 'LGItestes', 'LGImaleanus', 'LGImalerectum', 'TGIlaceration', 'TGIecchymosis', 'TGIabrasion', 'TGIredness', 
                       'TGIswelling', 'TGIbruise', 'TGIpetechiae', 'TGIincision', 'TGIavulsion', 'TGIdiscoloredmark', 'TGIpuncturewound', 'ToludineDYEuptake', 'HIVnPEP', 
                       'UQuikcollected', 'Yscreen', 'NumberItemsWITH3cutoff', 'ItemsAnalyzed1', 'ItemsAnalyzed2', 'ItemsAnalyzed3', 'ItemsAnalyzed4', 'ItemsAnalyzed5', 
                       'ItemsAnalyzed6', 'ItemsAnalyzed7', 'ItemsAnalyzed8', 'ItemsAnalyzed9', 'ItemsAnalyzed10', 'TypesOFitemsTested', 'RandomSample20142015', 
                       'YearofDNAextraction', 'LocationOfTesting','DANYfundedSAK', 'DNAKitUsed', 'SerologyDoneBeforeDNA', 'QuantMaleDNAFound', 'QuantMaleSwabLoc1', 
                       'QuantMaleSwabLoc2', 'QuantMaleSwabLoc3', 'QuantMaleSwabLoc4', 'QuantMaleSwabLoc5', 'ProbableSTRDNAprofileOFsuspect', 'ProfileofSTRDNAloci', 'ProbableYSTRDNAprofile', 'ProfileOfYSTRDNAloci', 
                       'SwabLocationYSTRDNA', 'SecondSwabLocationYSTRDNA', 'SwabFromSuspectwithVictimDNA', 'ExcludeSuspect', 'ConsensualPartnerStandardSubmitted', 
                       'STRDNAProbableprofileTYPE', 'CODISprofileHit', 'STRDNAkitUsed', 'SUSPECTmouthcontactBREASTS', 'Swab1LocationSTRDNAprofile', 'Swab2LocationSTRDNAprofile',
                       'Swab3LocationSTRDNAprofile', 'SuspectStandardSubmitted', 'CODISNDISreasons', 'CODISSDISreasons']
# Columns recording the swabs sent to DNA analysis
swabToDNAFeatures = [
    'Swab1ToDNAanalysis',
    'Swab2ToDNAanalysis',
    'Swab3ToDNAanalysis',
    'Swab4ToDNAanalysis',
]

# unusedFeatures and stringFeatures hold data that was relevant to medical professionals
# and for legal purposes, but that is not useful for our feature association or for
# predicting eligibility.
unusedFeatures = [
    'filter_$', 'PainTreatmentYesNo', 'GenderMaleFemale',
    'DVsuspect', 'RacePrimaryGroups', 'IPSAcombined',
    'STRDNAcompleted', 'PhysicalInjuryNOunknown', 'GenitalInjuryNOunknown',
]

stringFeatures = [
    'DeIdentifiedCase', 'Raceother', 'SchoolName', 'MilitaryBranchName',
    'AddressIfnotPermanent', 'Currentmedprobtext', 'MedProbOtherText', 'Medicationtext',
    'Sexualcontactwithin120hoursTYPE', 'SelfdisclosureMItype', 'OnlineMeetingName',
    'SuspectrelationshipOTHER', 'LocationofassaultOTHER', 'Surfaceofassault',
    'PatientActionOtherTEXT', 'SuspectraceOTHER', 'SuspectOTHERactions',
    'NOSApatientsVAGINAobjectdescription', 'NOSApatientsANUSobjectdescription',
    'NOSApatientsPENISobjectdescription', 'NOSApatientsMOUTHobjectdescription',
    'EjaculationSITE', 'LUBRICATIONtype', 'SuspectINJUREDbypatientexplanation',
    'Impairmentdescription', 'UBFSnumber', 'ISPnumber', 'DateSubmittedUBFS',
    'DateofDNAextractionReport', 'BodySwabLocQuant', 'BodySwabDNAanalysis',
    'BodySwabLocationSTRDNA', 'BodySwabYSTRDNA', 'ISPnotes2020', 'UBFSnotes2020',
    'UBFSnotes2018', 'UBFSnotes2014',
]

### 3. Display Cleaned Dataset

In [None]:
# Display the cleaned and filtered DataFrame as the cell's output
df

Unnamed: 0,DeIdentifiedCase,Site,EXAMbySANE,YearKitCollected,KITbroughtTOcrimelab,KITlengthofSubmissionTime,Age,UnderAge18,Gender,ExamDeclined,Noninterview,Timebetweenassaultandexaminhours,Race,Raceother,PriorHxofSAover14,PriorHxofSAunder14,Student,SchoolName,Military,MilitaryBranchName,Pain,PainLevel,PainLocation1,PainLocation2,PainLocation3,PainLocation4,PainTreatment,PermanentAddress,AddressIfnotPermanent,CurrentPhysicalmedprob,Currentmedprobtext,MedProbChronic,MedProbInfection,MedProbBlood,MedProbCardiac,MedProbEar,MedProbEndocrine,MedProbEye,MedProbGI,MedProbGU,...,SecondSwabLocationYSTRDNA,BodySwabYSTRDNA,SwabFromSuspectwithVictimDNA,ExcludeSuspect,SuspectStandardSubmitted,ConsensualPartnerStandardSubmitted,CODISNDISeligibleProfile,SDISeligibleprofile,STRDNAProbableprofileTYPE,CODISprofileHit,ISPnotes2020,UBFSnotes2020,UBFSnotes2018,UBFSnotes2014,filter_$,PainTreatmentYesNo,GenderMaleFemale,DVsuspect,RacePrimaryGroups,IPSAcombined,STRDNAcompleted,PhysicalInjuryNOunknown,GenitalInjuryNOunknown,CODISNDISreasons,CODISSDISreasons,STRDNAkitUsed,ProfileofSTRDNAloci,SwabToDNAanalysisNoquantmaleDNAfound,SwabToDNAanalysisVaginal,SwabToDNAanalysisCervical,SwabToDNAanalysisPerianal,SwabToDNAanalysisRectal,SwabToDNAanalysisOral,SwabToDNAanalysisBody,SwabToDNAanalysisUnderwear,SwabToDNAanalysisOtherClothing,SwabToDNAanalysisBedding,SwabToDNAanalysisCondom,SwabToDNAanalysisTampon,STRDNAEligible
0,FT988320,1,1,1,1,1,17,1,1,0,No Response,48,3,No Response,4,4,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,0,No Response,0,0,0,0,0,0,0,0,0,...,No Response,No Response,0,0,0,0,1,1,1,0,No Response,No Response,DNA report found 5/16/19.,Left message for detective requesting standard...,0,No Response,1,No Response,3,2,1,1,1,1,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,1
1,LZ866552,1,1,1,1,2,27,0,1,0,No Response,3,2,No Response,4,4,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,0,No Response,0,0,0,0,0,0,0,0,0,...,No Response,No Response,0,0,0,0,0,0,3,4,No Response,No Response,DNA report found 5/16/19.,Detective said no DNA analysis is needed at th...,0,No Response,1,No Response,2,1,1,1,0,4,4,2,4,0,0,0,0,0,0,1,0,0,0,0,0,0
2,FT988772,1,1,1,1,1,23,0,1,0,No Response,6,1,No Response,4,4,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,0,No Response,0,0,0,0,0,0,0,0,0,...,No Response,No Response,0,0,0,0,1,1,1,0,No Response,No Response,DNA report found 5/16/19.,"Detective was called with results, no call bac...",0,No Response,1,No Response,1,2,1,1,1,1,1,2,1,0,0,0,0,1,0,0,0,0,0,0,0,1
3,UM632222,1,1,1,1,1,19,0,1,0,No Response,14,1,No Response,4,4,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,1,"Asthma, mono",1,1,1,0,0,0,0,0,0,...,No Response,No Response,0,0,0,0,1,1,1,0,No Response,No Response,DNA report found 5/16/19.,"Vaginal, perianal, breast, neck and chest swab...",0,No Response,1,No Response,1,2,1,1,1,1,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,1
4,TT976221,1,1,1,1,1,16,1,1,0,No Response,23,1,No Response,4,4,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,1,Strep throat,0,1,0,0,0,0,0,0,0,...,No Response,No Response,0,0,0,0,0,0,No Response,4,No Response,No Response,DNA report found 5/16/19.,"No male DNA found, no further testing done",0,No Response,1,No Response,1,1,No Response,1,0,5,5,No Response,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5635,JH741110,1,1,7,1,1,66,0,1,0,0,49,1,No Response,1,0,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,1,"pacemaker, diverticulitis, anxiety",1,0,0,1,0,0,0,1,0,...,No Response,No Response,0,0,0,0,0,0,3,4,No Response,No Response,No Response,No Response,0,No Response,1,No Response,1,2,1,1,1,4,4,1,3,0,1,0,1,0,0,1,0,0,0,0,0,0
5638,SW954310,1,1,10,1,1,31,0,1,0,0,14,1,No Response,1,1,0,No Response,0,No Response,1,No Response,8,1,No Response,No Response,0,0,listed mom's address,0,No Response,0,0,0,0,0,0,0,0,0,...,No Response,No Response,0,0,0,0,0,0,No Response,4,No Response,DNA report found Aug 2020.,No Response,No Response,0,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,5,5,2,3,0,0,0,0,0,0,1,0,0,0,0,0,0
5639,PW973000,4,1,6,1,3,17,1,1,0,0,5,1,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,0,No Response,0,0,0,0,0,0,0,0,0,...,No Response,No Response,0,0,0,0,0,0,3,4,No Response,No Response,No Response,No Response,0,No Response,1,No Response,1,2,1,0,0,4,4,2,3,0,1,1,0,1,0,0,0,0,0,0,0,0
5640,FR444330,1,1,7,1,1,22,0,1,0,0,22,2,No Response,1,2,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,No Response,0,anxiety,0,0,0,0,0,0,0,0,0,...,No Response,No Response,0,0,0,0,0,0,3,4,No Response,No Response,Swabs held for future YSTR testing,No Response,0,No Response,1,No Response,2,No Response,1,0,1,4,4,2,3,0,0,0,1,0,0,1,0,0,0,0,0,0


# Swabs and Bathing
These cells split the dataset by whether the victim bathed after the assault and by the type of swab (internal or external) sent for analysis. The resulting groups are then used in the chi-square tests in the sections that follow.

## Set-up

In [None]:
# Sort swab variables into whether they are internal/external
internalSwabs = ['SwabToDNAanalysisVaginal', 'SwabToDNAanalysisCervical', 'SwabToDNAanalysisRectal',]

externalSwabs = ['SwabToDNAanalysisPerianal', 'SwabToDNAanalysisOral','SwabToDNAanalysisBody', 
                 'SwabToDNAanalysisUnderwear', 'SwabToDNAanalysisOtherClothing', 'SwabToDNAanalysisBedding', 
                      'SwabToDNAanalysisCondom','SwabToDNAanalysisTampon']

# Start from the cases that have a recorded STR DNA profile result.
# `eligible` has to be created here: it was previously first assigned in a later cell,
# so running the notebook top to bottom on a fresh kernel raised NameError on the
# filter below.
eligible = df[df['ProfileofSTRDNAloci'] != 'No Response']

# Because we want to know if a swab is useful, we will look at cases with only 1 swab sent to analysis
# This way, we don't have to account for other swabs that may have contributed DNA
eligible = eligible[eligible['NumberOfswabsDNAanalysis'] == '1']

4403
4403
1001


### Bathed

In [None]:
# Only keep cases where the patient bathed.
# .copy() makes `bathed` an independent DataFrame, so adding a column to it later does
# not write into a slice of `eligible` (the cause of pandas' SettingWithCopyWarning).
bathed = eligible[eligible['PostassaultBATHED'] == '1'].copy()

print(len(bathed))

325


In [None]:
def makeContingency(row):
  """Label one case by swab location and STR DNA eligibility.

  Used with DataFrame.apply(axis=1) to build a new label column.

  Args:
    row: A mapping or pandas Series with "Swab1ToDNAanalysis" and "STRDNAEligible".

  Returns:
    One of "internal and eligible", "external and eligible",
    "internal and not eligible" or "external and not eligible". The swab counts as
    internal when "Swab1ToDNAanalysis" is '1', '2' or '4' (these are locations 1,2,4);
    the case counts as eligible when "STRDNAEligible" is '1'.
  """
  location = "internal" if row["Swab1ToDNAanalysis"] in ('1', '2', '4') else "external"
  status = "eligible" if row["STRDNAEligible"] == '1' else "not eligible"
  return location + " and " + status


# Add the label column with assign(), which rebinds `bathed` to a new DataFrame rather
# than setting a column on a slice of `eligible` (that triggered SettingWithCopyWarning).
bathed = bathed.assign(new_col=bathed.apply(makeContingency, axis=1))
table = bathed['new_col'].value_counts()
print(f"Counts for bathed: ({len(bathed)} total cases)")

# One line per label: the label, its count, and its share of all bathed cases
for row, count in table.items():
  perc = (count / len(bathed)) * 100
  print(f"{row} {count} {perc:.2f}%")

Counts for bathed: (325 total cases)
external and not eligible 135 41.54%
external and eligible 73 22.46%
internal and eligible 61 18.77%
internal and not eligible 56 17.23%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Did Not Bathe

In [None]:
# Only keep cases where the patient did not bathe.
# .copy() makes `notBathed` an independent DataFrame, so adding a column to it later does
# not write into a slice of `eligible` (the cause of pandas' SettingWithCopyWarning).
notBathed = eligible[eligible['PostassaultBATHED'] == '0'].copy()

print(len(notBathed))

668


In [None]:
def makeContingency(row):
  """Classify a case as internal/external swab and eligible/not eligible.

  Args:
    row: A mapping or pandas Series with "Swab1ToDNAanalysis" and "STRDNAEligible".

  Returns:
    The label for the (internal, eligible) combination, e.g. "internal and eligible".
  """
  # these are locations 1,2,4
  is_internal = row["Swab1ToDNAanalysis"] in ['1', '2', '4']
  is_eligible = bool(row["STRDNAEligible"] == '1')

  labels = {
      (True, True): "internal and eligible",
      (False, True): "external and eligible",
      (True, False): "internal and not eligible",
      (False, False): "external and not eligible",
  }
  return labels[(is_internal, is_eligible)]


# Add the label column with assign(), which rebinds `notBathed` to a new DataFrame rather
# than setting a column on a slice of `eligible` (that triggered SettingWithCopyWarning).
notBathed = notBathed.assign(new_col=notBathed.apply(makeContingency, axis=1))
table = notBathed['new_col'].value_counts()

print(f"Counts for not bathed: ({len(notBathed)} total cases)")

# One line per label: the label, its count, and its share of all not-bathed cases
for row, count in table.items():
  perc = (count / len(notBathed)) * 100
  print(f"{row} {count} {perc:.2f}%")

Counts for not bathed: (668 total cases)
external and eligible 283 42.37%
external and not eligible 228 34.13%
internal and eligible 93 13.92%
internal and not eligible 64 9.58%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Chi Square Swabs and Bathing

For patients who bathed after the assault, this chi-square test checks whether swab location (internal vs. external) is associated with CODIS eligibility, measured here by the `STRDNAEligible` column.

In [None]:
# Set-up
# Cases that have a recorded STR DNA profile result
eligible = df[df['ProfileofSTRDNAloci'] != 'No Response']

internalSwabs = ['SwabToDNAanalysisVaginal', 'SwabToDNAanalysisCervical', 'SwabToDNAanalysisRectal',]

externalSwabs = ['SwabToDNAanalysisPerianal', 'SwabToDNAanalysisOral','SwabToDNAanalysisBody', 
                 'SwabToDNAanalysisUnderwear', 'SwabToDNAanalysisOtherClothing', 'SwabToDNAanalysisBedding', 
                      'SwabToDNAanalysisCondom','SwabToDNAanalysisTampon']

# Restrict to cases where exactly one swab was sent to DNA analysis
eligible = eligible[eligible['NumberOfswabsDNAanalysis'] == '1']

# Only keep cases where the patient bathed. .copy() makes `bathed` an independent
# DataFrame so that adding a column to it later does not raise SettingWithCopyWarning.
bathed = eligible[eligible['PostassaultBATHED'] == '1'].copy()

In [None]:
# Make new column of yes/no: was the swab taken an internal swab

def makeContingency(row):
  """Return "1" when the case's swab is internal, otherwise "0".

  Args:
    row: A mapping or pandas Series with a "Swab1ToDNAanalysis" entry.

  Returns:
    "1" if that entry is '1', '2' or '4' (these are locations 1,2,4), else "0".
  """
  return "1" if row["Swab1ToDNAanalysis"] in ('1', '2', '4') else "0"

# Add the flag column with assign(), which rebinds `bathed` to a new DataFrame rather
# than setting a column on a slice of `eligible` (that triggered SettingWithCopyWarning).
bathed = bathed.assign(internalSwabCollected=bathed.apply(makeContingency, axis=1))
table = bathed['internalSwabCollected'].value_counts()
print(f"Counts for bathed: ({len(bathed)} total cases)")

# One line per flag value ("1" = internal swab, "0" = not): value, count, share of cases
for row, count in table.items():
  perc = (count / len(bathed)) * 100
  print(f"{row} {count} {perc:.2f}%")


Counts for bathed: (325 total cases)
0 208 64.00%
1 117 36.00%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Contingency table: internal swab collected (rows) against STR DNA eligibility (columns)
newContingency = pd.crosstab(index=bathed['internalSwabCollected'], columns=bathed['STRDNAEligible'])

# chi2_contingency gives back the statistic, the p-value, the degrees of freedom and the
# table of expected frequencies, in that order
c, p, dof, expected = chi2_contingency(newContingency)

for value, label in ((p, "p-value"), (dof, "dof"), (c, "chi-square value")):
  print(value, label)
print(newContingency)

0.003999442462707307 p-value
1 dof
8.284068111124139 chi-square value
STRDNAEligible           0   1
internalSwabCollected         
0                      135  73
1                       56  61


# Chi Square Swabs and No Bathing

For patients who did not bathe after the assault, this chi-square test checks whether swab location (internal vs. external) is associated with CODIS eligibility, measured here by the `STRDNAEligible` column.

In [None]:
# Set-up
# Cases that have a recorded STR DNA profile result
eligible = df[df['ProfileofSTRDNAloci'] != 'No Response']

internalSwabs = ['SwabToDNAanalysisVaginal', 'SwabToDNAanalysisCervical', 'SwabToDNAanalysisRectal',]

externalSwabs = ['SwabToDNAanalysisPerianal', 'SwabToDNAanalysisOral','SwabToDNAanalysisBody', 
                 'SwabToDNAanalysisUnderwear', 'SwabToDNAanalysisOtherClothing', 'SwabToDNAanalysisBedding', 
                      'SwabToDNAanalysisCondom','SwabToDNAanalysisTampon']

# Restrict to cases where exactly one swab was sent to DNA analysis
eligible = eligible[eligible['NumberOfswabsDNAanalysis'] == '1']

# Only keep cases where the patient did NOT bathe (PostassaultBATHED == '0').
# The variable is still called `bathed` because the following cells refer to it by
# that name; the old comment wrongly described this as the bathed group.
# .copy() makes it an independent DataFrame so that adding a column to it later does
# not raise SettingWithCopyWarning.
bathed = eligible[eligible['PostassaultBATHED'] == '0'].copy()

In [None]:
# Make new column of yes/no: was the swab taken an internal swab

def makeContingency(row):
  """Flag an internal swab as "1" and anything else as "0".

  Args:
    row: A mapping or pandas Series with a "Swab1ToDNAanalysis" entry.

  Returns:
    "1" when the entry is one of the internal location codes, otherwise "0".
  """
  internal_codes = ('1', '2', '4')  #these are locations 1,2,4
  swab_code = row["Swab1ToDNAanalysis"]
  return str(int(swab_code in internal_codes))



# In this section `bathed` holds the cases filtered with PostassaultBATHED == '0', i.e.
# patients who did not bathe, so the printed label says "not bathed" (it used to say
# "bathed", mislabelling the output).
# assign() rebinds `bathed` to a new DataFrame instead of setting a column on a slice,
# which avoids SettingWithCopyWarning.
bathed = bathed.assign(internalSwabCollected=bathed.apply(makeContingency, axis=1))
table = bathed['internalSwabCollected'].value_counts()
print(f"Counts for not bathed: ({len(bathed)} total cases)")

# One line per flag value ("1" = internal swab, "0" = not): value, count, share of cases
for row, count in table.items():
  perc = (count / len(bathed)) * 100
  print(f"{row} {count} {perc:.2f}%")


In [None]:
# Cross-tabulate the internal-swab flag against STR DNA eligibility and run the
# chi-square test of independence on the resulting table
rowFlags = bathed['internalSwabCollected']
columnFlags = bathed['STRDNAEligible']
newContingency = pd.crosstab(rowFlags, columnFlags)

# Unpacks as: chi-square statistic, p-value, degrees of freedom, expected frequencies
c, p, dof, expected = chi2_contingency(newContingency)

print(p, "p-value")
print(dof, "dof")
print(c, "chi-square value")
print(newContingency)

0.44754400779302306 p-value
1 dof
0.5768643987259156 chi-square value
STRDNAEligible           0    1
internalSwabCollected          
0                      228  283
1                       64   93
