In [1]:
# Importing Libraries
import datetime
from random import Random
from random import randint

import pandas as pd

In [2]:
# NaN treatment: discard patients whose status_change_date is missing.
patients_ds = pd.read_csv(r'patients_data.csv').dropna(subset=['status_change_date'])

# Persist the filtered patients dataset (without the index column).
patients_ds.to_csv('patients_ds_mod.csv', index=False)

In [3]:
# Date handling for the patients dataset.
# The date columns are stored as day-month-year strings.
DATE_FORMAT = "%d-%m-%Y"
DATE_COLUMNS = ['date_announced', 'status_change_date']


def mydateparser(x):
    """Parse a ``DD-MM-YYYY`` string, or an array-like of them, into datetimes.

    Uses ``pd.to_datetime`` rather than ``pd.datetime.strptime``: the
    ``pd.datetime`` alias no longer exists in pandas 2.x. Missing values are
    returned as ``NaT`` instead of raising.
    """
    return pd.to_datetime(x, format=DATE_FORMAT)


# Read the modified patients dataset, then convert the date columns explicitly.
# (The ``date_parser`` argument of ``read_csv`` is deprecated in pandas 2.x, so
# the conversion is done after loading.)
patients_ds = pd.read_csv("patients_ds_mod.csv")
for date_col in DATE_COLUMNS:
    patients_ds[date_col] = mydateparser(patients_ds[date_col])

In [4]:
# Preview the first two rows of patients_ds.
patients_ds.head(2)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,state_code,current_status,notes,suspected_contacted_patient,nationality,type_of_transmission,status_change_date,source_1,source_2,source_3,backup_notes
0,1,KL-TS-P1,2020-01-30,20.0,F,Thrissur,Thrissur,Kerala,KL,Recovered,Travelled from Wuhan,,India,Imported,2020-02-14,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,
1,2,KL-AL-P1,2020-02-02,,,Alappuzha,Alappuzha,Kerala,KL,Recovered,Travelled from Wuhan,,India,Imported,2020-02-14,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,,Student from Wuhan


In [5]:
# Same preview as the previous cell: first two rows of patients_ds.
patients_ds.head(2)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,state_code,current_status,notes,suspected_contacted_patient,nationality,type_of_transmission,status_change_date,source_1,source_2,source_3,backup_notes
0,1,KL-TS-P1,2020-01-30,20.0,F,Thrissur,Thrissur,Kerala,KL,Recovered,Travelled from Wuhan,,India,Imported,2020-02-14,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,
1,2,KL-AL-P1,2020-02-02,,,Alappuzha,Alappuzha,Kerala,KL,Recovered,Travelled from Wuhan,,India,Imported,2020-02-14,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,,Student from Wuhan


In [6]:
# Load the districts dataset.
# Note: districts were mapped manually in Excel on the basis of population
# density; see my_covid_district_dataset.xlsx for details of the mapping.
district_mapping_csv = r'pop_density_mapped_district.csv'
dense_district_ds = pd.read_csv(district_mapping_csv)

In [7]:
# Preview the first two rows of the district mapping.
dense_district_ds.head(2)

Unnamed: 0,District,Person,Male,Female,Sex Ratio,Density,mapped_district
0,Papum Pare,121750,64122,57628,899,35,North Cachar Hills
1,Uttarkashi,294179,151599,142580,941,37,North Cachar Hills


In [8]:
# Restrict the patients frame to the columns that are needed later on.
required_columns = [
    'date_announced',
    'status_change_date',
    'age_bracket',
    'gender',
    'detected_district',
    'detected_city',
    'detected_state',
    'state_code',
]
patients_ds = patients_ds[required_columns]

In [9]:
# Join patients to the mapped-district table on the district name.
# This is an inner join, so patients whose detected_district has no matching
# 'District' entry are not carried into den_final_df.
den_final_df = patients_ds.merge(
    dense_district_ds,
    how='inner',
    left_on='detected_district',
    right_on='District',
)

In [10]:
# (rows, columns) of the merged frame.
den_final_df.shape

(1952, 15)

In [11]:
# Checking for NaN values in the gender column (dropna=False keeps NaN in the counts)
den_final_df['gender'].value_counts(dropna=False)

NaN    1279
M       480
F       193
Name: gender, dtype: int64

In [12]:
# Replace NaN values in gender with 'M'.
# The result is assigned back to the column on purpose: calling
# fillna(inplace=True) on den_final_df['gender'] acts on an intermediate Series,
# which pandas warns about and which leaves den_final_df unchanged when
# Copy-on-Write is enabled.
den_final_df['gender'] = den_final_df['gender'].fillna('M')

In [13]:
# Expand the gender codes to full labels.
# replace() leaves values that are not keys of the mapping untouched, so
# re-running this cell keeps the already expanded 'Male'/'Female' labels
# (map() would turn them into NaN on a second run).
den_final_df['gender'] = den_final_df['gender'].replace({'M': 'Male', 'F': 'Female'})

In [14]:
# The NobBS R code requires the announcement date column to be named 'report_week'.
nobbs_column_names = {'date_announced': 'report_week'}
den_final_df = den_final_df.rename(columns=nobbs_column_names)

In [15]:
# Synthesise an onset date for every case: the report date minus a random
# delay of ONSET_DELAY_MIN_DAYS..ONSET_DELAY_MAX_DAYS days (both inclusive).
#
# - A dedicated, seeded generator makes the synthetic dates reproducible, so
#   re-running this cell always yields the same onset_week column.
# - pd.to_timedelta is used for the date arithmetic, so the cell works no matter
#   whether the name `datetime` currently refers to the module or to the class.
ONSET_DELAY_MIN_DAYS = 5
ONSET_DELAY_MAX_DAYS = 9
ONSET_SEED = 42

onset_rng = Random(ONSET_SEED)
onset_delay_days = pd.Series(
    [
        onset_rng.randint(ONSET_DELAY_MIN_DAYS, ONSET_DELAY_MAX_DAYS)
        for _ in range(len(den_final_df))
    ],
    index=den_final_df.index,  # align the delays with the frame's rows
    dtype='int64',
)
den_final_df['onset_week'] = (
    den_final_df['report_week'] - pd.to_timedelta(onset_delay_days, unit='D')
)

In [16]:
# Preview the first two rows, now including the onset_week column.
den_final_df.head(2)

Unnamed: 0,report_week,status_change_date,age_bracket,gender,detected_district,detected_city,detected_state,state_code,District,Person,Male,Female,Sex Ratio,Density,mapped_district,onset_week
0,2020-02-03,2020-02-14,,Male,Kasaragod,Kasaragod,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-01-25
1,2020-03-16,2020-03-16,,Male,Kasaragod,Kalanadu,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-03-07


In [17]:
# Sanity check of the onset-date adjustment: the gap between report and onset
# dates is expected to be 5-9 days.
# No `from datetime import ...` is done in this cell: the subtraction below
# needs no extra names, and such an import would rebind the notebook-level
# `datetime` module name to the class.
den_final_df['day_diff'] = den_final_df['report_week'] - den_final_df['onset_week']
den_final_df['day_diff'].value_counts()

8 days    402
9 days    393
7 days    387
5 days    386
6 days    384
Name: day_diff, dtype: int64

In [18]:
# Preview the first two rows, now including the day_diff column.
den_final_df.head(2)

Unnamed: 0,report_week,status_change_date,age_bracket,gender,detected_district,detected_city,detected_state,state_code,District,Person,Male,Female,Sex Ratio,Density,mapped_district,onset_week,day_diff
0,2020-02-03,2020-02-14,,Male,Kasaragod,Kasaragod,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-01-25,9 days
1,2020-03-16,2020-03-16,,Male,Kasaragod,Kalanadu,Kerala,KL,Kasaragod,1203342,587763,615579,1047,604,Nagaon,2020-03-07,9 days


In [19]:
# Getting the Assam (mapped) district list, most frequent district first.
# The index of value_counts() already holds the district names. Going through
# reset_index()['index'] raises KeyError on pandas >= 2.0, where that column is
# named after the Series ('mapped_district') instead of 'index'.
district_list = den_final_df['mapped_district'].value_counts().index.values

In [20]:
# Creating one '<district>_data_input.csv' file per mapped district, holding
# only the columns listed in export_columns.
export_columns = ['onset_week', 'report_week', 'gender']

for dist in district_list:
    # Select this district's rows and the export columns in a single .loc call.
    dist_rows = den_final_df.loc[den_final_df['mapped_district'] == dist, export_columns]
    dist_rows.to_csv('%s_data_input.csv' % dist, index=False)