# Creating a data map and data plan

 A data map is an overview of the dataset. You use it to spot potential problems in your data, such as:
 - Redundant variables
 - Possible errors
 - Missing values
 - Variable transformations

The checks for these problems go into a data plan, which is a list of the tasks you have to perform to ensure the integrity of your data.

In [1]:
import pandas as pd

In [2]:
# Build a small demo DataFrame from a dictionary mapping column name -> values.
sample_columns = {
    'A': [0, 0, 0, 0, 0, 1, 1],
    'B': [1, 2, 3, 5, 4, 2, 5],
    'C': [5, 3, 4, 1, 1, 2, 3],
}
df = pd.DataFrame(data=sample_columns)
df

Unnamed: 0,A,B,C
0,0,1,5
1,0,2,3
2,0,3,4
3,0,5,1
4,0,4,1
5,1,2,2
6,1,5,3


In [3]:
# Group the rows by column A, then compute the summary statistics
# (count, mean, std, min, quartiles, max) of B and C within each group.
grouped_by_a = df.groupby('A')
a_group_desc = grouped_by_a.describe()
a_group_desc

Unnamed: 0_level_0,B,B,B,B,B,B,B,B,C,C,C,C,C,C,C,C
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,5.0,3.0,1.581139,1.0,2.0,3.0,4.0,5.0,5.0,2.8,1.788854,1.0,1.0,3.0,4.0,5.0
1,2.0,3.5,2.12132,2.0,2.75,3.5,4.25,5.0,2.0,2.5,0.707107,2.0,2.25,2.5,2.75,3.0


In [4]:
# Another way to see the result: unstack() moves everything into the index,
# giving one long Series with a single value per (column, statistic, A) combination.
unstacked = a_group_desc.unstack()
unstacked

          A
B  count  0    5.000000
          1    2.000000
   mean   0    3.000000
          1    3.500000
   std    0    1.581139
          1    2.121320
   min    0    1.000000
          1    2.000000
   25%    0    2.000000
          1    2.750000
   50%    0    3.000000
          1    3.500000
   75%    0    4.000000
          1    4.250000
   max    0    5.000000
          1    5.000000
C  count  0    5.000000
          1    2.000000
   mean   0    2.800000
          1    2.500000
   std    0    1.788854
          1    0.707107
   min    0    1.000000
          1    2.000000
   25%    0    1.000000
          1    2.250000
   50%    0    3.000000
          1    2.500000
   75%    0    4.000000
          1    2.750000
   max    0    5.000000
          1    3.000000
dtype: float64

In [5]:
# Raw column values for a 12-person sample.
names = ['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack',
         'Lee', 'David', 'Gasper', 'Betina', 'Andres']
ages = [25, 26, 25, 23, 30, 29, 23, 34, 40, 30, 51, 46]
ratings = [4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8, 3.78, 2.98, 4.80, 4.10, 30.65]

# Dictionary of Series: one Series per column.
d = {
    'Name': pd.Series(names),
    'Age': pd.Series(ages),
    'Rating': pd.Series(ratings),
}

# Build the DataFrame from the dictionary and display it.
df = pd.DataFrame(d)
df

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8
7,Lee,34,3.78
8,David,40,2.98
9,Gasper,30,4.8


Sum

In [6]:
# Column-wise sum: the numeric columns are added up, while the string column
# 'Name' is concatenated into one long string.
df.sum()

Name      TomJamesRickyVinSteveSmithJackLeeDavidGasperBe...
Age                                                     382
Rating                                                71.92
dtype: object

Mean

In [7]:
# Column-wise mean. 'Name' holds strings, so restrict the reduction to the
# numeric columns explicitly: pandas >= 2.0 no longer drops non-numeric columns
# silently and raises TypeError when numeric_only is left at its default.
df.mean(numeric_only=True)

Age       31.833333
Rating     5.993333
dtype: float64

Standard deviation 

In [8]:
# Column-wise sample standard deviation of the numeric columns. numeric_only=True
# skips the string column 'Name', which would otherwise raise TypeError on pandas >= 2.0.
df.std(numeric_only=True)

Age       9.232682
Rating    7.792918
dtype: float64

Count

In [9]:
# Number of non-missing entries in each column.
df.count()

Name      12
Age       12
Rating    12
dtype: int64

Median

In [10]:
# Column-wise median of the numeric columns. numeric_only=True skips the string
# column 'Name', which would otherwise raise TypeError on pandas >= 2.0.
df.median(numeric_only=True)

Age       29.50
Rating     3.89
dtype: float64

Mode

In [11]:
# Most frequent value(s) of each column. Every name and every rating occurs exactly
# once, so all of them are returned (sorted); Age has three tied modes (23, 25, 30)
# and is padded with NaN to match the longer columns.
df.mode()

Unnamed: 0,Name,Age,Rating
0,Andres,23.0,2.56
1,Betina,25.0,2.98
2,David,30.0,3.2
3,Gasper,,3.24
4,Jack,,3.78
5,James,,3.8
6,Lee,,3.98
7,Ricky,,4.1
8,Smith,,4.23
9,Steve,,4.6


Minimum value record

In [12]:
# Smallest value in each column; strings are compared alphabetically, hence 'Andres' for Name.
df.min()

Name      Andres
Age           23
Rating      2.56
dtype: object

Maximum value record

In [13]:
# Largest value in each column; strings are compared alphabetically, hence 'Vin' for Name.
df.max()

Name        Vin
Age          51
Rating    30.65
dtype: object

Product

In [14]:
# Column-wise product of the numeric columns. Strings cannot be multiplied together,
# so numeric_only=True is required to skip 'Name' (pandas >= 2.0 raises TypeError otherwise).
df.prod(numeric_only=True)

Age       7.158408e+17
Rating    5.307176e+07
dtype: float64

Cumulative Sum

In [15]:
# Running total down each column; for the string column 'Name' this is a running concatenation.
df.cumsum()

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,TomJames,51,7.47
2,TomJamesRicky,76,11.45
3,TomJamesRickyVin,99,14.01
4,TomJamesRickyVinSteve,129,17.21
5,TomJamesRickyVinSteveSmith,158,21.81
6,TomJamesRickyVinSteveSmithJack,181,25.61
7,TomJamesRickyVinSteveSmithJackLee,215,29.39
8,TomJamesRickyVinSteveSmithJackLeeDavid,255,32.37
9,TomJamesRickyVinSteveSmithJackLeeDavidGasper,285,37.17


Describe Function

In [16]:
# Summary statistics in one call; only the numeric columns (Age, Rating) are summarised.
df.describe()

Unnamed: 0,Age,Rating
count,12.0,12.0
mean,31.833333,5.993333
std,9.232682,7.792918
min,23.0,2.56
25%,25.0,3.23
50%,29.5,3.89
75%,35.5,4.3225
max,51.0,30.65


# Manipulating Categorical Variables

 A categorical variable is one that takes its value from a limited selection of possible values. The number of possible values is usually fixed. The pandas tools used in this section are:
 - Categorical()
 - cat.categories
 - cat.codes

## Creating categorical variables

In [17]:
# Three colour labels stored as a Series with the 'category' dtype.
color_labels = ['Blue', 'Red', 'Green']
car_colors = pd.Series(data=color_labels, dtype='category')

In [18]:
car_colors

0     Blue
1      Red
2    Green
dtype: category
Categories (3, object): [Blue, Green, Red]

Example 1

In [19]:
# Using the Categorical constructor directly: the distinct values (1, 2, 3)
# become the categories.
pd.Categorical([1, 2, 3, 1, 2, 3])

[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1, 2, 3]

Example 2

In [20]:
# The same with string values: the distinct letters a, b, c become the categories.
pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])

[a, b, c, a, b, c]
Categories (3, object): [a, b, c]

Example 3 

In [21]:
# Ordered categorical with an explicit level order: c < b < a.
letters = ['a', 'b', 'c', 'a', 'b', 'c']
c = pd.Categorical(letters, categories=['c', 'b', 'a'], ordered=True)

In [22]:
c

[a, b, c, a, b, c]
Categories (3, object): [c < b < a]

In [23]:
# min() follows the declared category order (c < b < a), not alphabetical order.
c.min()

'c'

In [24]:
# max() follows the declared category order (c < b < a), so 'a' is the largest level.
c.max()

'a'

Example 4

In [25]:
# Three student records, each a Series keyed by field name.
studentRec1 = pd.Series(dict(Name='TRisha', Gender='Female', City='Anand', cpi=8))
studentRec2 = pd.Series(dict(Name='Krisha', Gender='Female', City='Vadodara', cpi=8.5))
studentRec3 = pd.Series(dict(Name='krish', Gender='Male', City='Anand', cpi=9))


In [26]:
# Stack the three student records into a DataFrame: one row per Series,
# with the Series keys becoming the columns.
df2=pd.DataFrame([studentRec1,studentRec2,studentRec3])
df2

Unnamed: 0,Name,Gender,City,cpi
0,TRisha,Female,Anand,8.0
1,Krisha,Female,Vadodara,8.5
2,krish,Male,Anand,9.0


In [27]:
# Convert Gender to a categorical column with an explicit list of levels.
# (A bare `df2` in the middle of a cell is never displayed -- only the last
# expression of a cell is -- so only the converted column is shown here.)
df2['Gender'] = pd.Categorical(df2['Gender'], categories=['Female', 'Male'])
df2['Gender']


0    Female
1    Female
2      Male
Name: Gender, dtype: category
Categories (2, object): [Female, Male]

In [28]:
# The levels of the categorical Gender column.
df2.Gender.cat.categories

Index(['Female', 'Male'], dtype='object')

In [29]:
df2.Gender

0    Female
1    Female
2      Male
Name: Gender, dtype: category
Categories (2, object): [Female, Male]

In [30]:
# One-hot encode Gender: one indicator column per category.
# dtype=int keeps the 0/1 encoding on every pandas version; pandas >= 2.0
# would otherwise return True/False columns.
pd.get_dummies(df2.Gender, dtype=int)

Unnamed: 0,Female,Male
0,1,0
1,1,0
2,0,1


Example 5

In [31]:
# Four employee records, each a Series keyed by field name.
emp1 = pd.Series(dict(Name='TRisha', Gender='Female', designation='Programmer', Sal=30000))
emp2 = pd.Series(dict(Name='Rudra', Gender='Male', designation='Senior Programmer', Sal=50000))
emp3 = pd.Series(dict(Name='Ayan', Gender='Male', designation='Team Lead', Sal=55000))
emp4 = pd.Series(dict(Name='Krishna', Gender='Female', designation='Project Manager', Sal=100000))

# One row per employee.
team_records = [emp1, emp2, emp3, emp4]
ProjectTeam = pd.DataFrame(team_records)
ProjectTeam

Unnamed: 0,Name,Gender,designation,Sal
0,TRisha,Female,Programmer,30000
1,Rudra,Male,Senior Programmer,50000
2,Ayan,Male,Team Lead,55000
3,Krishna,Female,Project Manager,100000


In [32]:
# Turn designation into an ordered categorical, from most junior to most senior.
# The column's own values are converted (rather than a second hard-coded copy of
# them), so the result stays correct if the rows are reordered or extended.
designation_levels = ['Programmer', 'Senior Programmer', 'Team Lead', 'Project Manager']
ProjectTeam['designation'] = pd.Categorical(
    ProjectTeam['designation'],
    categories=designation_levels,
    ordered=True,
)

In [33]:
ProjectTeam.designation

0           Programmer
1    Senior Programmer
2            Team Lead
3      Project Manager
Name: designation, dtype: category
Categories (4, object): [Programmer < Senior Programmer < Team Lead < Project Manager]

In [34]:
ProjectTeam

Unnamed: 0,Name,Gender,designation,Sal
0,TRisha,Female,Programmer,30000
1,Rudra,Male,Senior Programmer,50000
2,Ayan,Male,Team Lead,55000
3,Krishna,Female,Project Manager,100000


In [35]:
# Replace each designation with its integer category code (its position in the
# ordered list of levels).
# NOTE(review): this overwrites the categorical column in place, so the cell cannot
# be run twice -- after the first run the column holds plain integers and no longer
# has a `.cat` accessor.
ProjectTeam.designation = ProjectTeam.designation.cat.codes

In [36]:
ProjectTeam

Unnamed: 0,Name,Gender,designation,Sal
0,TRisha,Female,0,30000
1,Rudra,Male,1,50000
2,Ayan,Male,2,55000
3,Krishna,Female,3,100000


## Renaming levels

 cat.categories

In [37]:
# Start again from three colour labels stored with the 'category' dtype.
color_labels = ['Blue', 'Red', 'Green']
car_colors = pd.Series(data=color_labels, dtype='category')
car_colors

0     Blue
1      Red
2    Green
dtype: category
Categories (3, object): [Blue, Green, Red]

In [38]:
# Observed colours, stored as an unordered categorical whose levels are taken from car_colors.
observed_colors = ['Blue', 'Green', 'Red', 'Blue', 'Red']
car_data = pd.Series(pd.Categorical(observed_colors, categories=car_colors, ordered=False))


In [39]:
car_data

0     Blue
1    Green
2      Red
3     Blue
4      Red
dtype: category
Categories (3, object): [Blue, Green, Red]

In [40]:
# Rename the levels positionally: the first level becomes 'Purple', the second
# 'Yellow' and the third 'Mauve'.
# Assigning to `.cat.categories` is no longer supported (the setter was removed in
# pandas 2.0); rename_categories() is the supported API and returns a new Series,
# so the result is assigned back.
new_levels = ["Purple", "Yellow", "Mauve"]
car_colors = car_colors.cat.rename_categories(new_levels)
# Give car_data the same renamed levels. A plain list is passed because a Series
# would be interpreted as an old-name -> new-name mapping.
car_data = car_data.cat.rename_categories(car_colors.cat.categories.tolist())

In [41]:
car_data

0    Purple
1    Yellow
2     Mauve
3    Purple
4     Mauve
dtype: category
Categories (3, object): [Purple, Yellow, Mauve]

In [42]:
car_colors

0    Purple
1     Mauve
2    Yellow
dtype: category
Categories (3, object): [Purple, Yellow, Mauve]

## Combining levels

In [43]:
# Fresh set of colour labels stored with the 'category' dtype.
color_labels = ['Blue', 'Red', 'Green']
car_colors = pd.Series(data=color_labels, dtype='category')


In [44]:
# Observed colours, stored as an unordered categorical whose levels are taken from car_colors.
observed_colors = ['Blue', 'Green', 'Red', 'Green', 'Red', 'Green']
car_data = pd.Series(pd.Categorical(observed_colors, categories=car_colors, ordered=False))


In [45]:
# Rename the levels positionally: the first level becomes 'Blue_Red', the second
# 'Red' and the third 'Green'.
# Assigning to `.cat.categories` is no longer supported (the setter was removed in
# pandas 2.0); rename_categories() is the supported API and returns a new Series,
# so the result is assigned back.
car_data = car_data.cat.rename_categories(["Blue_Red", "Red", "Green"])

In [46]:
car_data

0    Blue_Red
1         Red
2       Green
3         Red
4       Green
5         Red
dtype: category
Categories (3, object): [Blue_Red, Red, Green]

In [47]:
# Boolean mask marking the rows whose level is 'Red'.
car_data.isin(['Red'])

0    False
1     True
2    False
3     True
4    False
5     True
dtype: bool

In [48]:
# Merge the 'Red' rows into 'Blue_Red'. The 'Red' level itself stays in the
# category list; it simply has no rows left.
red_rows = car_data.isin(['Red'])
car_data.loc[red_rows] = 'Blue_Red'

In [49]:
car_data

0    Blue_Red
1    Blue_Red
2       Green
3    Blue_Red
4       Green
5    Blue_Red
dtype: category
Categories (3, object): [Blue_Red, Red, Green]

## References

- *Python for Data Science For Dummies* by John Paul Mueller and Luca Massaron