# Combining Data on Grades

In [1]:
import numpy as np
import pandas as pd

# Limit how much of any Series/DataFrame is echoed: at most 10 columns and
# 10 rows before pandas truncates the display.
pd.options.display.max_columns = 10
pd.options.display.max_rows = 10

### Given: Three NumPy Arrays of Grades

In [2]:
# One seeded generator feeds all three draws, so the samples are the same on
# every fresh run. The draws must stay in this order to keep that true.
rng = np.random.default_rng(seed=42)

# Two samples of uppercase letter grades; `p` gives the probability of each
# letter. The second sample gives 'F' a probability of zero.
ar1 = rng.choice(list('ABCDF'), size=100, p=[0.2, 0.4, 0.29, 0.08, 0.03])
ar2 = rng.choice(list('ABCDF'), size=50, p=[0.3, 0.4, 0.2, 0.1, 0.0])

# The third sample uses lowercase letters.
ar3 = rng.choice(list('abcdf'), size=200, p=[0.15, 0.45, 0.25, 0.12, 0.03])

### Create pandas Series from these Arrays
Use the default index for each. Save the series as `s1`, `s2`, and `s3`.

In [3]:
# Wrap each array in a Series. No index is passed, so pandas assigns the
# default integer index to each one.
s1, s2, s3 = (pd.Series(grade_array) for grade_array in (ar1, ar2, ar3))

### Get the Value Counts of each of the Series
Save the resulting series as `grades1`, `grades2`, and `grades3`

In [4]:
# Tally how often each letter occurs in each Series; a letter that never
# occurs gets no entry in the result.
grades1, grades2, grades3 = (
    letters.value_counts() for letters in (s1, s2, s3)
)
grades1, grades2, grades3

(B    38
 C    34
 A    22
 D     4
 F     2
 dtype: int64,
 B    24
 A    14
 D     6
 C     6
 dtype: int64,
 b    81
 c    50
 a    38
 d    28
 f     3
 dtype: int64)

#### The following code creates the table shown in the manual. You'll learn about `DataFrame`s soon.

In [5]:
# Put the three tallies side by side, one column per Series. Rows are the
# union of the three indexes; a label missing from a Series shows as NaN.
pd.DataFrame(dict(grades1=grades1, grades2=grades2, grades3=grades3))

Unnamed: 0,grades1,grades2,grades3
A,22.0,14.0,
B,38.0,24.0,
C,34.0,6.0,
D,4.0,6.0,
F,2.0,,
a,,,38.0
b,,,81.0
c,,,50.0
d,,,28.0
f,,,3.0


### Compare the indexes of the three `grades` variables 
You should see that the index for `grades3` uses lowercase letters, while the other two use uppercase letters.

In [6]:
# Show the index of each tally, in the same order as the variables.
tuple(counts.index for counts in (grades1, grades2, grades3))

(Index(['B', 'C', 'A', 'D', 'F'], dtype='object'),
 Index(['B', 'A', 'D', 'C'], dtype='object'),
 Index(['b', 'c', 'a', 'd', 'f'], dtype='object'))

### Reindex `grades3` to use uppercase letters
This is a little tricky because the indexes are not in alphabetical order. You will need to sort them first, and then set the index for `grades3` to use capital letters.

* We only really need to sort `grades1` and `grades2` if we're going to compare them, but we may as well.

In [7]:
# Put all three tallies in alphabetical label order. The sorted result is
# reassigned rather than sorted in place, so each name is bound explicitly.
grades1 = grades1.sort_index()
grades2 = grades2.sort_index()
grades3 = grades3.sort_index()

# Derive the uppercase labels from the labels grades3 already has instead of
# assigning a fixed ['A', 'B', 'C', 'D', 'F'] list. A fixed list raises a
# length-mismatch error when a grade never occurs in the sample, and would
# mislabel counts if the sorted labels were not exactly a, b, c, d, f.
grades3.index = grades3.index.str.upper()
grades3.index

Index(['A', 'B', 'C', 'D', 'F'], dtype='object')

In [8]:
# Echo grades3 to confirm the counts now sit under uppercase labels.
grades3

A    38
B    81
C    50
D    28
F     3
dtype: int64

### Add the three `grades` Series together
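Don't forget to set the fill value to 0. Without it, any grade missing from one of the Series (`grades2` has no `F`) would come out as `NaN` instead of a count.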

In [9]:
# Label-aligned addition of the three tallies. fill_value=0 counts a grade
# that is absent from one operand as zero instead of producing NaN.
grades_all = (
    grades1
    .add(grades2, fill_value=0)
    .add(grades3, fill_value=0)
)
grades_all

A     74.0
B    143.0
C     90.0
D     38.0
F      5.0
dtype: float64

### From `grades_all`, create a `grades_breakout` Series that holds the share of each grade.
`grades_breakout.sum()` should equal 1 (allowing for floating-point rounding error).

In [10]:
# Divide each grade's count by the grand total to get its share, then show
# the shares together with their sum as a check.
grades_breakout = grades_all.div(grades_all.sum())
grades_breakout, grades_breakout.sum()

(A    0.211429
 B    0.408571
 C    0.257143
 D    0.108571
 F    0.014286
 dtype: float64,
 0.9999999999999999)

## A different approach: First, change the case of the values of `s3`.

In [11]:
# Rebind s3 to an uppercased copy of itself so its letters match the
# uppercase letters used in s1 and s2. From here on s3 no longer holds the
# original lowercase values.
s3 = s3.str.upper()
s3

0      C
1      D
2      A
3      B
4      A
      ..
195    C
196    D
197    B
198    B
199    A
Length: 200, dtype: object

### Then combine the Series of grades.

In [12]:
# Stack the three Series end to end. Each piece keeps its own integer labels
# (ignore_index=False), so labels repeat across the combined Series.
s_all = pd.concat([s1, s2, s3], axis=0, ignore_index=False)
s_all

0      C
1      B
2      C
3      C
4      A
      ..
195    C
196    D
197    B
198    B
199    A
Length: 350, dtype: object

### Then get the value counts of the combined series.

In [13]:
# Count each letter across the combined Series (displayed only, not saved).
s_all.value_counts()

B    143
C     90
A     74
D     38
F      5
dtype: int64

In [14]:
# normalize=True returns each grade's share of the total instead of a raw
# count. sort_index() then orders the shares by letter. Chaining it (rather
# than sorting in place) binds grades_breakout once, to its final value.
grades_breakout = s_all.value_counts(normalize=True).sort_index()
grades_breakout

A    0.211429
B    0.408571
C    0.257143
D    0.108571
F    0.014286
dtype: float64