# More Data Wrangling 7

In this notebook I will demonstrate an alternative way to drop the rows of participants/observations that have missing values, and also how to drop columns (variables) in which the proportion of missing values is above a chosen threshold.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the CSV file into a DataFrame and preview its first five rows.
data_path = 'fearofcrime.csv'
crime = pd.read_csv(data_path)
crime.head()

Unnamed: 0,sex,anxlevel,stress,totalworry,construct
0,2,2,1.3,3.0375,3.04878048780488
1,2,2,2.1,3.21875,2.95121951219512
2,1,3,1.95,2.025,3.29268292682927
3,2,2,2.1,1.80625,2.19512195121951
4,2,2,2.05,2.5625,2.80487804878049


In [3]:
# Inspect the column dtypes. The number variables in the crime dataset are
# saved as strings (dtype object), so they need converting to floats:

crime.dtypes

sex           object
anxlevel      object
stress        object
totalworry    object
construct     object
dtype: object

In [4]:
# Convert each column to a numeric dtype. Entries that cannot be parsed as
# numbers are coerced to NaN instead of raising an error.
crime_2 = crime.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [5]:
# Check the dtypes again to confirm the conversion to floats worked.
crime_2.dtypes

sex           float64
anxlevel      float64
stress        float64
totalworry    float64
construct     float64
dtype: object

In [6]:
# List the distinct values of the sex variable; NaN marks a missing value.
crime_2['sex'].unique()

array([ 2.,  1., nan])

In [7]:
# List the distinct values of the anxlevel variable; NaN marks a missing value.
crime_2['anxlevel'].unique()

array([ 2.,  3.,  1., nan])

In [8]:
# isna() flags every missing datapoint and sum() adds the flags up column by
# column, giving a count of how many datapoints are missing per variable.
crime_2.isna().sum()

sex            1
anxlevel       7
stress         1
totalworry     7
construct     13
dtype: int64

In [9]:
# Taking the mean of the isna() flags instead gives the proportion of missing
# values per variable (multiply by 100 for a percentage):

crime_2.isna().mean()

sex           0.004255
anxlevel      0.029787
stress        0.004255
totalworry    0.029787
construct     0.055319
dtype: float64

In [10]:
# dropna() removes missing values. With these settings (which are also the
# defaults) it drops every row that has a missing value in any column.

crime_2 = crime_2.dropna(axis='index', how='any')

In [11]:
# Summarise the frame after dropping rows: number of entries, index range,
# and the non-null count and dtype of each column.
crime_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 209 entries, 0 to 234
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sex         209 non-null    float64
 1   anxlevel    209 non-null    float64
 2   stress      209 non-null    float64
 3   totalworry  209 non-null    float64
 4   construct   209 non-null    float64
dtypes: float64(5)
memory usage: 9.8 KB


In [12]:
# Dropping rows leaves gaps in the row labels, so renumber the rows from 0.
# reset_index() keeps the old labels in a new 'index' column, which records
# the label each remaining row had before the reset.
# The guard makes this cell safe to re-run: calling reset_index() again on the
# already-reset frame would add a stray 'level_0' column, and a further call
# would raise a ValueError.
if 'index' not in crime_2.columns:
    crime_2 = crime_2.reset_index()

In [13]:
# Check the result of the reset: the row labels should now form an unbroken
# range, and the old labels should appear as an extra 'index' column.
crime_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   index       209 non-null    int64  
 1   sex         209 non-null    float64
 2   anxlevel    209 non-null    float64
 3   stress      209 non-null    float64
 4   totalworry  209 non-null    float64
 5   construct   209 non-null    float64
dtypes: float64(5), int64(1)
memory usage: 9.9 KB


In [14]:
# Display the cleaned frame (pandas shows a truncated view of long frames).
crime_2

Unnamed: 0,index,sex,anxlevel,stress,totalworry,construct
0,0,2.0,2.0,1.30,3.03750,3.048780
1,1,2.0,2.0,2.10,3.21875,2.951220
2,2,1.0,3.0,1.95,2.02500,3.292683
3,3,2.0,2.0,2.10,1.80625,2.195122
4,4,2.0,2.0,2.05,2.56250,2.804878
...,...,...,...,...,...,...
204,230,2.0,2.0,1.90,2.50000,3.317073
205,231,2.0,2.0,2.05,3.58750,2.975610
206,232,2.0,1.0,1.15,2.39375,4.097561
207,233,1.0,2.0,1.65,2.00000,3.317073


In [15]:
# Confirm that no missing values remain in any column.
crime_2.isna().sum()

index         0
sex           0
anxlevel      0
stress        0
totalworry    0
construct     0
dtype: int64

In [16]:
# Build a fresh numeric copy of the raw data for the column-dropping example.
# Entries that cannot be parsed as numbers are coerced to NaN.
crime_3 = crime.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [17]:
# Proportion of missing values in each column of the fresh copy.
crime_3.isna().mean()

sex           0.004255
anxlevel      0.029787
stress        0.004255
totalworry    0.029787
construct     0.055319
dtype: float64

In [18]:
# Preview the first five rows of the numeric copy.
crime_3.head()

Unnamed: 0,sex,anxlevel,stress,totalworry,construct
0,2.0,2.0,1.3,3.0375,3.04878
1,2.0,2.0,2.1,3.21875,2.95122
2,1.0,3.0,1.95,2.025,3.292683
3,2.0,2.0,2.1,1.80625,2.195122
4,2.0,2.0,2.05,2.5625,2.804878


In [19]:
# To drop columns that have too many missing values, use dropna's thresh
# argument. thresh is the minimum number of NON-missing values a column must
# have in order to be kept, so it is worked out here from the proportion of
# rows that must be present, rounded up to a whole number of rows.

min_non_missing_fraction = 0.97
min_non_missing = int(np.ceil(len(crime_3) * min_non_missing_fraction))

crime_3.dropna(thresh = min_non_missing, axis = 'columns').head()

# Here a column must be at least 97% complete (no more than 3% missing) to be
# kept. This only excludes the construct variable, which had about 5.5%
# missing values.

Unnamed: 0,sex,anxlevel,stress,totalworry
0,2.0,2.0,1.3,3.0375
1,2.0,2.0,2.1,3.21875
2,1.0,3.0,1.95,2.025
3,2.0,2.0,2.1,1.80625
4,2.0,2.0,2.05,2.5625
