In [1]:
import numpy as np 
import pandas as pd 
import re
import warnings

# NOTE(review): with no category/module argument this suppresses every warning
# for the rest of the session, including pandas warnings about chained assignment.
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

/kaggle/input/covid19-in-india/StatewiseTestingDetails.csv
/kaggle/input/covid19-in-india/covid_19_india.csv
/kaggle/input/covid19-in-india/HospitalBedsIndia.csv
/kaggle/input/covid19-in-india/IndividualDetails.csv
/kaggle/input/covid19-in-india/AgeGroupDetails.csv
/kaggle/input/covid19-in-india/ICMRTestingLabs.csv
/kaggle/input/covid19-in-india/population_india_census2011.csv


In [2]:
# Load the census table and the per-state testing table; in both files the
# first CSV column becomes the DataFrame index.
population_df = pd.read_csv(
    "/kaggle/input/covid19-in-india/population_india_census2011.csv",
    index_col=0,
)
statewise_testing_df = pd.read_csv(
    "/kaggle/input/covid19-in-india/StatewiseTestingDetails.csv",
    index_col=0,
)

In [3]:
# Preview the first rows of the census table.
population_df.head()

Unnamed: 0_level_0,State / Union Territory,Population,Rural population,Urban population,Area,Density,Gender Ratio
Sno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Uttar Pradesh,199812341,155317278,44495063,"240,928 km2 (93,023 sq mi)","828/km2 (2,140/sq mi)",912
2,Maharashtra,112374333,61556074,50818259,"307,713 km2 (118,809 sq mi)",365/km2 (950/sq mi),929
3,Bihar,104099452,92341436,11758016,"94,163 km2 (36,357 sq mi)","1,102/km2 (2,850/sq mi)",918
4,West Bengal,91276115,62183113,29093002,"88,752 km2 (34,267 sq mi)","1,029/km2 (2,670/sq mi)",953
5,Madhya Pradesh,72626809,52557404,20069405,"308,245 km2 (119,014 sq mi)",236/km2 (610/sq mi),931


In [4]:
# Preview the first rows of the testing table (indexed by Date at this point).
statewise_testing_df.head()

Unnamed: 0_level_0,State,TotalSamples,Negative,Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-04-17,Andaman and Nicobar Islands,1403.0,1210.0,12.0
2020-04-24,Andaman and Nicobar Islands,2679.0,,27.0
2020-04-27,Andaman and Nicobar Islands,2848.0,,33.0
2020-05-01,Andaman and Nicobar Islands,3754.0,,33.0
2020-05-16,Andaman and Nicobar Islands,6677.0,,33.0


In [5]:
# Turn the index back into an ordinary column so 'Date' can be parsed.
statewise_testing_df = statewise_testing_df.reset_index()

# Parse the ISO-formatted date strings into real timestamps.
statewise_testing_df['Date'] = pd.to_datetime(
    statewise_testing_df['Date'],
    format="%Y-%m-%d",
)

# Display the covered date range as (earliest, latest).
report_dates = statewise_testing_df['Date']
(report_dates.min(), report_dates.max())

(Timestamp('2020-04-01 00:00:00'), Timestamp('2020-06-04 00:00:00'))

In [6]:
# Keep only the columns used by the rest of the notebook.
# .copy() makes each result an independent frame, so the later in-place
# assignment into population_df is unambiguous and does not rely on
# SettingWithCopyWarning being silenced.
population_df = population_df[['State / Union Territory', 'Density']].copy()
statewise_testing_df = statewise_testing_df[['Date', 'TotalSamples', 'State', 'Positive']].copy()

In [7]:
# Collect the distinct state / union-territory names from the census table
# and report how many there are.
pop_states = set(population_df['State / Union Territory'].unique())
print(population_df['State / Union Territory'].nunique())

36


In [8]:
# Collect the distinct state names from the testing table and report how
# many there are.
statewise_testing_states = set(statewise_testing_df['State'].unique())
print(statewise_testing_df['State'].nunique())

35


In [9]:
# Names that appear in the census table but not in the testing table.
pop_states - statewise_testing_states

{'Lakshadweep', 'Telengana'}

In [10]:
# Names that appear in the testing table but not in the census table.
statewise_testing_states - pop_states

{'Telangana'}

In [11]:
# Show the census row(s) whose name contains 'ngana' to inspect the spelling mismatch.
population_df.loc[population_df['State / Union Territory'].str.contains('ngana')]

Unnamed: 0_level_0,State / Union Territory,Density
Sno,Unnamed: 1_level_1,Unnamed: 2_level_1
12,Telengana,312/km2 (810/sq mi)


In [12]:
# Align the census spelling with the testing table so the two can be joined
# on state name. An exact-value replacement is used instead of a substring
# mask: it cannot touch any other name, tolerates missing names, and is a
# no-op when re-run.
population_df['State / Union Territory'] = (
    population_df['State / Union Territory'].replace('Telengana', 'Telangana')
)

### Joining statewise testing details with population information

In [13]:
# Attach each state's census density to its testing rows.
# how='inner' keeps only states present in both tables. The duplicate join
# key from the census side is dropped afterwards. No reset_index() is needed:
# merge already returns a fresh positional index.
statewise_features = statewise_testing_df.merge(
    population_df,
    how='inner',
    left_on='State',
    right_on='State / Union Territory',
).drop(columns='State / Union Territory')

In [14]:
# Spot-check the merged rows for one state.
statewise_features[statewise_features['State']=='West Bengal'].head()

Unnamed: 0,Date,TotalSamples,State,Positive,Density
1663,2020-04-01,659.0,West Bengal,37.0,"1,029/km2 (2,670/sq mi)"
1664,2020-04-04,1042.0,West Bengal,,"1,029/km2 (2,670/sq mi)"
1665,2020-04-06,1301.0,West Bengal,,"1,029/km2 (2,670/sq mi)"
1666,2020-04-07,1487.0,West Bengal,,"1,029/km2 (2,670/sq mi)"
1667,2020-04-09,1889.0,West Bengal,,"1,029/km2 (2,670/sq mi)"


In [15]:
# Density is text such as "1,029/km2 (2,670/sq mi)". Remove the thousands
# separators, then take the first run of digits (the per-km2 figure) as a float.
# - regex=False: the comma is a literal, and .str.replace passes NaN through
#   instead of raising like a per-row re.sub would.
# - raw string for the pattern: "\d" in a normal string is an invalid escape.
# - expand=False: return a Series rather than a one-column DataFrame.
statewise_features = statewise_features.assign(
    pop_density=lambda frame: (
        frame['Density']
        .str.replace(",", "", regex=False)
        .str.extract(r"(\d+)", expand=False)
        .astype(float)
    )
).drop(columns="Density")

In [16]:
# Preview the merged features with the numeric pop_density column.
statewise_features.head()

Unnamed: 0,Date,TotalSamples,State,Positive,pop_density
0,2020-04-17,1403.0,Andaman and Nicobar Islands,12.0,46.0
1,2020-04-24,2679.0,Andaman and Nicobar Islands,27.0,46.0
2,2020-04-27,2848.0,Andaman and Nicobar Islands,33.0,46.0
3,2020-05-01,3754.0,Andaman and Nicobar Islands,33.0,46.0
4,2020-05-16,6677.0,Andaman and Nicobar Islands,33.0,46.0


In [17]:
# Check column dtypes and non-null counts of the merged features.
statewise_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1724 entries, 0 to 1723
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          1724 non-null   datetime64[ns]
 1   TotalSamples  1724 non-null   float64       
 2   State         1724 non-null   object        
 3   Positive      1716 non-null   float64       
 4   pop_density   1724 non-null   float64       
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 80.8+ KB


In [18]:
# Convert the running totals into per-report increments, separately per state.
# new_cases      = change in 'Positive' since the state's previous row
# samples_tested = change in 'TotalSamples' since the state's previous row
# The first row of every state has no predecessor, so both values are NaN there.
# Reports for a state are not always on consecutive days (e.g. 2020-04-17 is
# followed by 2020-04-24), so an increment may cover several days.
# Assumes rows are already in chronological order within each state -- TODO confirm.
# sort=False keeps groups in order of first appearance; diff() returns values
# aligned to the original row index, so no per-state loop or repeated concat is needed.
rows_by_state = statewise_features.groupby('State', sort=False)
statewise_daily_df = statewise_features.assign(
    new_cases=rows_by_state['Positive'].diff(),
    samples_tested=rows_by_state['TotalSamples'].diff(),
).set_index('Date')


In [19]:
# Preview the per-report increments alongside the cumulative columns.
statewise_daily_df.head()

Unnamed: 0_level_0,TotalSamples,State,Positive,pop_density,new_cases,samples_tested
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-17,1403.0,Andaman and Nicobar Islands,12.0,46.0,,
2020-04-24,2679.0,Andaman and Nicobar Islands,27.0,46.0,15.0,1276.0
2020-04-27,2848.0,Andaman and Nicobar Islands,33.0,46.0,6.0,169.0
2020-05-01,3754.0,Andaman and Nicobar Islands,33.0,46.0,0.0,906.0
2020-05-16,6677.0,Andaman and Nicobar Islands,33.0,46.0,0.0,2923.0


In [20]:
# Pairwise correlation of the engineered numeric features.
# The columns are selected explicitly so the string 'State' column never
# reaches .corr(); pandas >= 2.0 raises on non-numeric columns instead of
# silently skipping them.
statewise_daily_df[['pop_density', 'new_cases', 'samples_tested']].corr()

Unnamed: 0,pop_density,new_cases,samples_tested
pop_density,1.0,0.085343,-0.042294
new_cases,0.085343,1.0,0.571229
samples_tested,-0.042294,0.571229,1.0


In [21]:
# Discard every row with a missing value in any column (this includes the
# cumulative 'TotalSamples' and 'Positive' columns), then write the remaining
# features to CSV without those two cumulative columns.
statewise_daily_df.dropna(inplace=True)
statewise_daily_df.drop(columns=['TotalSamples', 'Positive']).to_csv("./statewise_features.csv")