# More Data Wrangling 

In this notebook I demonstrate how to generate simulated datasets using numpy's `random.rand` function. I also import a dataset and show ways to manipulate it by reversing the order of its rows and columns, and how to convert the data types of its variables.

In [1]:
import pandas as pd

import numpy as np

In [2]:
# Load the fear-of-crime dataset from a CSV file into a DataFrame for analysis.
# The path is relative, so 'fearofcrime.csv' must be in the notebook's working directory.

crime = pd.read_csv('fearofcrime.csv')


In [3]:
# Check which version of pandas is installed.

pd.__version__

# __version__ is a "dunder" attribute: its name starts and ends with a double underscore.

'1.4.2'

In [4]:
# Create a 4-row by 8-column data frame of random numbers with numpy's random.rand function.
# Seeding the global generator first means a fresh "Restart & Run All" reproduces
# exactly the same values instead of a new set on every run.

np.random.seed(42)
random_frame = pd.DataFrame(np.random.rand(4, 8))

# Leaving the frame as the last expression displays it below the cell.
random_frame

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.016279,0.516233,0.460377,0.148602,0.766921,0.879118,0.349683,0.157601
1,0.646693,0.715658,0.022841,0.506087,0.800738,0.189129,0.415593,0.803969
2,0.199266,0.14374,0.42451,0.807728,0.040724,0.105372,0.866681,0.580053
3,0.3207,0.086457,0.617641,0.690454,0.126939,0.40133,0.571028,0.377512


In [5]:
# Column names can be supplied through the `columns` argument:

column_labels = list('ABCDEFGH')
pd.DataFrame(np.random.rand(4, 8), columns=column_labels)

# list() splits the single, undelimited string into one label per character,
# so no separators are needed between the column names.

Unnamed: 0,A,B,C,D,E,F,G,H
0,0.519893,0.816257,0.321045,0.245766,0.974654,0.805161,0.232,0.187791
1,0.434057,0.28508,0.115166,0.684706,0.30103,0.52855,0.928429,0.360535
2,0.983604,0.925952,0.950246,0.849881,0.093573,0.283324,0.243453,0.60386
3,0.108982,0.65775,0.996438,0.150209,0.8778,0.125709,0.614282,0.323559


In [6]:
# Reverse row order

# First, look at the fear of crime dataset in its original row order
# (head() shows the first five rows) so the reversal can be compared against it.

crime.head()

Unnamed: 0,sex,anxlevel,stress,totalworry,construct
0,2,2,1.3,3.0375,3.04878048780488
1,2,2,2.1,3.21875,2.95121951219512
2,1,3,1.95,2.025,3.29268292682927
3,2,2,2.1,1.80625,2.19512195121951
4,2,2,2.05,2.5625,2.80487804878049


In [7]:
# Summarise the data frame: index range, column names, non-null counts and dtypes.
crime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sex         235 non-null    object
 1   anxlevel    235 non-null    object
 2   stress      235 non-null    object
 3   totalworry  235 non-null    object
 4   construct   235 non-null    object
dtypes: object(5)
memory usage: 9.3+ KB


In [8]:
# Slicing with a step of -1 inside .loc walks the rows from last to first.
# This is most useful with alphabetically organised data, or to list
# examples/participants from the last one back to the first.

rows_reversed = crime.loc[::-1]
rows_reversed.head()

Unnamed: 0,sex,anxlevel,stress,totalworry,construct
234,1,2,1.75,2.09375,3.39024390243902
233,1,2,1.65,2.0,3.31707317073171
232,2,1,1.15,2.39375,4.09756097560976
231,2,2,2.05,3.5875,2.97560975609756
230,2,2,1.9,2.5,3.31707317073171


In [9]:
# Reversing the rows keeps their original index labels, so the index now counts down.
# reset_index(drop=True) discards those labels and renumbers from zero
# (only needed if you want a fresh index).

rows_reversed_renumbered = crime.loc[::-1].reset_index(drop=True)
rows_reversed_renumbered.head()

# The index runs from zero upwards in ascending order again.

Unnamed: 0,sex,anxlevel,stress,totalworry,construct
0,1,2,1.75,2.09375,3.39024390243902
1,1,2,1.65,2.0,3.31707317073171
2,2,1,1.15,2.39375,4.09756097560976
3,2,2,2.05,3.5875,2.97560975609756
4,2,2,1.9,2.5,3.31707317073171


In [10]:
# Reverse column order

columns_reversed = crime.loc[:, ::-1]
columns_reversed.head()

# The lone colon selects every row; the ::-1 slice tells pandas to take the columns in reverse order.
# head() then limits the display to the first five rows.

Unnamed: 0,construct,totalworry,stress,anxlevel,sex
0,3.04878048780488,3.0375,1.3,2,2
1,2.95121951219512,3.21875,2.1,2,2
2,3.29268292682927,2.025,1.95,3,1
3,2.19512195121951,1.80625,2.1,2,2
4,2.80487804878049,2.5625,2.05,2,2


In [11]:
# Select columns by data type
# Every column is reported as object rather than as a numeric dtype.
# NOTE(review): presumably the CSV contains some non-numeric entries - confirm against the raw file.
# Asking for the data types of the crime dataframe:

crime.dtypes

sex           object
anxlevel      object
stress        object
totalworry    object
construct     object
dtype: object

In [12]:
# Alternative approach: run pd.to_numeric over every column of the original crime dataframe.
# errors='coerce' replaces any value that cannot be parsed as a number with NaN.
crime_2 = crime.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [13]:
# Checking the data types shows that every variable has been converted to a floating point number.
# Two of the variables (sex and anxlevel) are categorical rather than measurements,
# so they still need converting from floats back to objects.
crime_2.dtypes

sex           float64
anxlevel      float64
stress        float64
totalworry    float64
construct     float64
dtype: object

In [14]:
# Changing sex back to an object
# A plain astype(str) on the float column would produce labels such as '2.0' and would
# turn missing values (NaN) into the literal string 'nan', so they stop counting as missing.
# Going through the nullable Int64 and string dtypes gives clean labels such as '2' and
# keeps missing values missing; the final astype(object) restores the object dtype.
# Note: the Int64 cast raises an error if a value is not a whole number.

crime_2['sex'] = crime_2['sex'].astype('Int64').astype('string').astype(object)

In [15]:
# Now changing the anxlevel variable back to an object
# A plain astype(str) on the float column would produce labels such as '2.0' and would
# turn missing values (NaN) into the literal string 'nan', so they stop counting as missing.
# Going through the nullable Int64 and string dtypes gives clean labels such as '2' and
# keeps missing values missing; the final astype(object) restores the object dtype.
# Note: the Int64 cast raises an error if a value is not a whole number.

crime_2['anxlevel'] = crime_2['anxlevel'].astype('Int64').astype('string').astype(object)

In [16]:
# Confirm the result: sex and anxlevel are now shown as objects and the other three variables as floats.

crime_2.dtypes

sex            object
anxlevel       object
stress        float64
totalworry    float64
construct     float64
dtype: object

In [17]:
# sex and anxlevel are categorical with a finite number of values, so both can be stored as categories.
# A category is a form of dynamic enumeration and is most useful when the range of
# possible values is fixed and finite.
# Storing such a variable as a category can also use less memory than storing it as strings.

# The conversion works like the earlier astype calls, except that the target
# dtype is 'category'.

crime_2['sex'] = crime_2['sex'].astype(dtype='category')

In [18]:
# Convert anxlevel to a category as well, then write it back into the data frame.

anxlevel_as_category = crime_2['anxlevel'].astype('category')
crime_2['anxlevel'] = anxlevel_as_category

In [19]:
# Final check: the dtypes now look appropriate, with two categorical variables and three numeric variables.
crime_2.dtypes

sex           category
anxlevel      category
stress         float64
totalworry     float64
construct      float64
dtype: object