In [76]:
import pandas as pd
surveys_df = pd.read_csv('../surveys.csv')

### Exercise 1
- Swap the order of column names in `surveys_df[['plot_id', 'species_id']]`
- Repeat one of the column names like `surveys_df[['plot_id', 'plot_id', 'species_id']]`.
How does the results look like and why?
- Which error occurrs in `surveys_df['plot_id', 'species_id']` and why?
- Which error occurrs in `surveys_df['speciess']`?

In [77]:
print(surveys_df[['species_id', 'plot_id']])

      species_id  plot_id
0             NL        2
1             NL        3
2             DM        2
3             DM        7
4             DM        3
...          ...      ...
35544         AH       15
35545         AH       15
35546         RM       10
35547         DO        7
35548        NaN        5

[35549 rows x 2 columns]


In [78]:
surveys_df[['plot_id', 'plot_id', 'species_id']] # repeating column plot_id

Unnamed: 0,plot_id,plot_id.1,species_id
0,2,2,NL
1,3,3,NL
2,2,2,DM
3,7,7,DM
4,3,3,DM
...,...,...,...
35544,15,15,AH
35545,15,15,AH
35546,10,10,RM
35547,7,7,DO


In [79]:
surveys_df['plot_id', 'species_id'] 
# The tuple, or combination ('plot_id', 'species_id') is not a 
# column name (key) in the dataframe --> KeyError: ('plot_id', 'species_id')

KeyError: ('plot_id', 'species_id')

In [80]:
surveys_df['speciess']
# 'spiciess' is not a column name (key) in the dataframe

KeyError: 'speciess'

### Exercise 2
What happens when you execute:
- `surveys_df[0:1]`
- `surveys_df[:4]`
- `surveys_df[:-1]`

In [81]:
surveys_df[0:1] # shows the first row of the dataframe
surveys_df[:4] # shows the first four rows from index 0 to index 3
surveys_df[:-1] # shows all rows of the dataframe

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,
...,...,...,...,...,...,...,...,...,...
35543,35544,12,31,2002,15,US,,,
35544,35545,12,31,2002,15,AH,,,
35545,35546,12,31,2002,15,AH,,,
35546,35547,12,31,2002,10,RM,F,15.0,14.0


### Exercise 3
What happens in the following two examples?

- ```surveys_df.iloc[0:4, 1:4]```;
- ```surveys_df.loc[0:4, 1:4]```.


In [84]:
print(surveys_df.iloc[0:4, 1:4])
surveys_df.loc[0:4, 1:4] # the function loc works with indices for rows (0:4), 
# but not with indices for columns (1:4). COlumns do have names in our dataframe

   month  day  year
0      7   16  1977
1      7   16  1977
2      7   16  1977
3      7   16  1977


TypeError: cannot do slice indexing on Index with these indexers [1] of type int

### Exercise 4
- Create a new DataFrame that only contains observations with sex values that are not female or male. Print the number of rows in this new DataFrame. Verify the result by comparing the number of rows in the new DataFrame with the number of rows in the surveys DataFrame where sex is NaN (hint: there is a function `isnull`).
- Create a new DataFrame that contains only observations that are of sex male or female and where weight values are greater than 0.

In [86]:
df = surveys_df[(surveys_df['sex'].isnull())]
print("Number of rows:", len(df))
print("Unique values in column 'sex':", df['sex'].unique())

Number of rows: 2511
Unique values in column 'sex': [nan]


### Exercise 5: Putting it all together 
1. Clean the column *sex* (leave out samples we do not know whether they are male or female) and save the result as a new dataframe `clean_df`.
2. Fill undefined *weight* values with the mean of all valid weights in `surveys_df`.
3. Calculate the average weight of that new DataFrame `clean_df`

In [88]:
# Step 1
# sex is 'F' or 'M'. The `|` means or.
clean_df = surveys_df[(surveys_df['sex']=='F') | (surveys_df['sex']=='M')]
# or not sex is null. The `~` means not.
clean_df = surveys_df[~(surveys_df['sex'].isnull())]

# Step 2
clean_df.weight.fillna(surveys_df.weight.mean())

# Step 3
print("Average weight of surveys_df:", surveys_df.weight.mean())
print("Average weight of clean_df:", clean_df.weight.mean())

Average weight of surveys_df: 42.672428212991356
Average weight of clean_df: 42.60316325896464


## Exercise 6
Let's see in which plots animals get more food. Calculate the average weight per plot! Complete the code below.

In [91]:
grouped_data = surveys_df.groupby("plot_id")
grouped_data['weight'].mean()

plot_id
1     51.822911
2     52.251688
3     32.654386
4     47.928189
5     40.947802
6     36.738893
7     20.663009
8     47.758001
9     51.432358
10    18.541219
11    43.451757
12    49.496169
13    40.445660
14    46.277199
15    27.042578
16    24.585417
17    47.889593
18    40.005922
19    21.105166
20    48.665303
21    24.627794
22    54.146379
23    19.634146
24    43.679167
Name: weight, dtype: float64

## Exercise 7
Investigate the group keys and row indexes for this more complex grouping example. 
Why are there more than 48 groups?
What happened to the third group and why dos it not turn up in our statistics?

In [94]:
grouped_data = surveys_df.groupby(['sex', 'plot_id'])
print(len(grouped_data.groups))
grouped_data.groups.keys() # we also have a categorial value 'nan'.

72


dict_keys([('F', 1), ('F', 2), ('F', 3), ('F', 4), ('F', 5), ('F', 6), ('F', 7), ('F', 8), ('F', 9), ('F', 10), ('F', 11), ('F', 12), ('F', 13), ('F', 14), ('F', 15), ('F', 16), ('F', 17), ('F', 18), ('F', 19), ('F', 20), ('F', 21), ('F', 22), ('F', 23), ('F', 24), ('M', 1), ('M', 2), ('M', 3), ('M', 4), ('M', 5), ('M', 6), ('M', 7), ('M', 8), ('M', 9), ('M', 10), ('M', 11), ('M', 12), ('M', 13), ('M', 14), ('M', 15), ('M', 16), ('M', 17), ('M', 18), ('M', 19), ('M', 20), ('M', 21), ('M', 22), ('M', 23), ('M', 24), (nan, 1), (nan, 2), (nan, 3), (nan, 4), (nan, 5), (nan, 6), (nan, 7), (nan, 8), (nan, 9), (nan, 10), (nan, 11), (nan, 12), (nan, 13), (nan, 14), (nan, 15), (nan, 16), (nan, 17), (nan, 18), (nan, 19), (nan, 20), (nan, 21), (nan, 22), (nan, 23), (nan, 24)])

### Exercise 8
Would it make sense to group our data frame by the column *weight*? Why or why not?

In [96]:
# In real life nearly every sample has a unique value. So nearly every sample would 
# be placed in an own group.
# In our training data you can see that there are quite some values for weight. So
# usually it is not a good idea to categorise (group) data on such values.
print("Number of rows:", len(surveys_df))
print(len(surveys_df['weight'].unique())) #includes nan
print(len(surveys_df.groupby(['weight']).groups)) #does not include nan

Number of rows: 35549
256
255
