# <span style="color: MediumPurple;">Maternal Mental Health During Pregnancy in the COVID-19 Pandemic</span>

### <span style="color: LightPink;">Sophia Dalal </span>

### <span style="color: Lightblue;">Health Data Science Project </span>

#### <span style="color: MediumPurple;">(Below are ways for me to double check my data and verify that I am looking at the correct set!) </span>

##### Importing Packages

In [1]:
import pandas as pd 
from scipy import stats

#### Data Import

In [11]:
# Load the project's CSV file into the DataFrame 'df'
df = pd.read_csv(
    filepath_or_buffer='Original Pregnancy During the COVID-19 Pandemic 2.csv',
)

In [12]:
# Verify the size of 'df': the result is a (number of rows, number of columns) tuple
df.shape

(10772, 16)

In [13]:
# Show the column headers (these are the labels used to select columns in the cells below)
df.columns

Index(['OSF_ID', 'Maternal_Age', 'Household_Income', 'Maternal_Education',
       'Edinburgh_Postnatal_Depression_Scale', 'PROMIS_Anxiety',
       'Gestational_Age_At_Birth',
       'Delivery_Date(converted to month and year)', 'Birth_Length',
       'Birth_Weight', 'Delivery_Mode', 'NICU_Stay', 'Language',
       'Threaten_Life', 'Threaten_Baby_Danger', 'Threaten_Baby_Harm'],
      dtype='object')

In [14]:
# Preview the top of the DataFrame: the first 5 rows
df.head(n=5)


Unnamed: 0,OSF_ID,Maternal_Age,Household_Income,Maternal_Education,Edinburgh_Postnatal_Depression_Scale,PROMIS_Anxiety,Gestational_Age_At_Birth,Delivery_Date(converted to month and year),Birth_Length,Birth_Weight,Delivery_Mode,NICU_Stay,Language,Threaten_Life,Threaten_Baby_Danger,Threaten_Baby_Harm
0,1,38.3,"$200,000+",Masters degree,9.0,13.0,39.71,Dec2020,49.2,3431.0,Vaginally,No,English,2.0,3.0,27.0
1,2,34.6,"$200,000+",Undergraduate degree,4.0,17.0,,,,,,,English,2.0,33.0,92.0
2,3,34.3,"$100,000 -$124,999",Undergraduate degree,,,,,,,,,French,,,
3,4,28.8,"$100,000 -$124,999",Masters degree,9.0,20.0,38.57,Dec2020,41.0,2534.0,Vaginally,No,French,53.0,67.0,54.0
4,5,36.5,"$40,000-$69,999",Undergraduate degree,14.0,20.0,39.86,Oct2020,53.34,3714.0,Caesarean-section (c-section),No,English,23.0,32.0,71.0


In [15]:
# Show info about 'df': for each column, the non-null count and the dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10772 entries, 0 to 10771
Data columns (total 16 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   OSF_ID                                      10772 non-null  int64  
 1   Maternal_Age                                10661 non-null  float64
 2   Household_Income                            10521 non-null  object 
 3   Maternal_Education                          10595 non-null  object 
 4   Edinburgh_Postnatal_Depression_Scale        9598 non-null   float64
 5   PROMIS_Anxiety                              9566 non-null   float64
 6   Gestational_Age_At_Birth                    6734 non-null   float64
 7   Delivery_Date(converted to month and year)  6733 non-null   object 
 8   Birth_Length                                5480 non-null   float64
 9   Birth_Weight                                6078 non-null   float64
 10  Delivery_M

#### <span style="color: MediumPurple;">Categorical Data </span>

#### <span style="color: LightBlue;">Below are ways to play with categorical variables such as Maternal Education </span>

In [20]:
# Preview the first 10 entries of the "Maternal_Education" column
df['Maternal_Education'].head(n=10)


0          Masters degree
1    Undergraduate degree
2    Undergraduate degree
3          Masters degree
4    Undergraduate degree
5    Undergraduate degree
6     High school diploma
7                     NaN
8    College/trade school
9    College/trade school
Name: Maternal_Education, dtype: object

In [21]:
# List the distinct values found in the "Maternal_Education" column (NaN included)
pd.unique(df['Maternal_Education'])

array(['Masters degree', 'Undergraduate degree', 'High school diploma',
       nan, 'College/trade school', 'Less than high school diploma',
       'Doctoral Degree'], dtype=object)

In [22]:
# Count how many rows fall into each "Maternal_Education" category
# (missing values are left out of the counts)
df['Maternal_Education'].value_counts(dropna=True)


Maternal_Education
Undergraduate degree             4117
College/trade school             2760
Masters degree                   1889
High school diploma               901
Doctoral Degree                   786
Less than high school diploma     142
Name: count, dtype: int64

In [23]:
# Share of each "Maternal_Education" category, as a fraction of the non-missing rows
(
    df['Maternal_Education']
    .value_counts(normalize=True)
)

Maternal_Education
Undergraduate degree             0.388580
College/trade school             0.260500
Masters degree                   0.178292
High school diploma              0.085040
Doctoral Degree                  0.074186
Less than high school diploma    0.013403
Name: proportion, dtype: float64

In [24]:
# Calculate the percentage of each category in the "Maternal_Education" column
(
    df['Maternal_Education']
    .value_counts(normalize=True)
    .mul(100)
)


Maternal_Education
Undergraduate degree             38.857952
College/trade school             26.050024
Masters degree                   17.829165
High school diploma               8.504011
Doctoral Degree                   7.418594
Less than high school diploma     1.340255
Name: proportion, dtype: float64

In [25]:
# List the distinct values found in the "Language" column
pd.unique(df['Language'])

array(['English', 'French'], dtype=object)

In [26]:
# Count the rows per "Language" category, most frequent first
df['Language'].value_counts(sort=True)


Language
English    8155
French     2617
Name: count, dtype: int64

In [27]:
# Cross-tabulate "Language" (rows) against "Maternal_Education" (columns)
# to see how the education categories are distributed within each language
pd.crosstab(
    index=df['Language'],
    columns=df['Maternal_Education'],
)

Maternal_Education,College/trade school,Doctoral Degree,High school diploma,Less than high school diploma,Masters degree,Undergraduate degree
Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
English,1869,636,680,66,1464,3290
French,891,150,221,76,425,827


#### <span style="color: MediumPurple;">Comparative Numerical Analysis </span>

#### <span style="color: LightBlue;">Below are ways to play with numerical variables such as Maternal Age. This includes comparing a numerical variable to a categorical one: Age versus Education. </span>

In [28]:
# Mean "Maternal_Age" (numerical) within each "Maternal_Education" category (categorical)
(
    df
    .groupby(by='Maternal_Education')['Maternal_Age']
    .mean()
)

Maternal_Education
College/trade school             30.984065
Doctoral Degree                  34.202545
High school diploma              29.086459
Less than high school diploma    27.012676
Masters degree                   33.246186
Undergraduate degree             32.081657
Name: Maternal_Age, dtype: float64

#### <span style="color: MediumPurple;">Numerical Data Analysis</span>
#### <span style="color: LightBlue;">Below are ways to play with numerical variables such as Maternal Age </span>

In [29]:
# Calculate the mean of the "Maternal_Age" column (missing values are skipped)
df['Maternal_Age'].mean(skipna=True)


np.float64(31.844601819716722)

In [30]:
# Calculate the sample standard deviation of the "Maternal_Age" column.
# Called with no arguments on purpose: this checks that the default uses the
# sample (n-1) denominator and not the population (n) denominator.
df['Maternal_Age'].std()

np.float64(4.422367904548697)

In [31]:
# Same statistic with the n-1 denominator requested explicitly (ddof=1)
df['Maternal_Age'].std(ddof=1)

np.float64(4.422367904548697)

In [32]:
# Summary statistics for the "Maternal_Age" column:
# count, mean, std, min, the 25%/50%/75% quartiles, and max
df['Maternal_Age'].describe(percentiles=[0.25, 0.5, 0.75])


count    10661.000000
mean        31.844602
std          4.422368
min         17.000000
25%         28.900000
50%         31.800000
75%         34.800000
max         49.700000
Name: Maternal_Age, dtype: float64

In [33]:
# Summary statistics of "Maternal_Age" computed separately for each
# "Maternal_Education" category
(
    df
    .groupby('Maternal_Education')['Maternal_Age']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Maternal_Education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
College/trade school,2755.0,30.984065,4.602244,19.4,27.7,30.8,34.1,45.8
Doctoral Degree,786.0,34.202545,3.570761,24.8,31.8,34.1,36.5,49.0
High school diploma,901.0,29.086459,5.296504,17.5,25.1,28.8,32.8,47.2
Less than high school diploma,142.0,27.012676,6.027747,17.0,22.0,26.8,31.55,41.3
Masters degree,1888.0,33.246186,3.597122,23.8,30.7,33.0,35.425,47.1
Undergraduate degree,4116.0,32.081657,3.928138,20.9,29.3,31.8,34.7,49.7


#### <span style="color: LightBlue;">Below is a numerical variable versus a numerical variable</span>

In [34]:
# Numerical x numerical relationship: Pearson correlation between the
# "Maternal_Age" and "Edinburgh_Postnatal_Depression_Scale" columns
df['Maternal_Age'].corr(
    df['Edinburgh_Postnatal_Depression_Scale'],
    method='pearson',
)

np.float64(-0.09331094682635495)