# Fictional Army - Filtering and Sorting

### Import libraries

In [1]:
import numpy as np
import pandas as pd

### Step 1. This is the data given as a dictionary. Create a dataframe and assign it to a variable called army. 

In [2]:
# Create an example dataframe about a fictional army
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
            'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
            'deaths': [523, 52, 25, 616, 43, 234, 523, 62, None, 73, 37, 35],
            'battles': [5, 42, 2, 2, 4, 7, 8, 3, 4, 7, 8, 9],
            'size': [1045, 957, 1099, 1400, 1592, 1006, 987, 849, 973, 1005, 1099, 1523],
            'veterans': [1, 5, 62, 26, 73, 37, 949, 48, 48, 435, 63, 345],
            'readiness': [1, None, 3, None, 2, None, 2, None, None, 1, 2, None],
            'armored': [1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
            'deserters': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
            'origin': ['Arizona', 'California', 'Texas', 'Florida', 'Maine', 'Iowa', 'Alaska', 'Washington', 'Oregon', 'Wyoming', 'Louisana', 'Georgia']}

### Step 2. Check missing values

In [3]:
df = pd.DataFrame(raw_data)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   regiment   12 non-null     object 
 1   company    12 non-null     object 
 2   deaths     11 non-null     float64
 3   battles    12 non-null     int64  
 4   size       12 non-null     int64  
 5   veterans   12 non-null     int64  
 6   readiness  6 non-null      float64
 7   armored    12 non-null     int64  
 8   deserters  12 non-null     int64  
 9   origin     12 non-null     object 
dtypes: float64(2), int64(5), object(3)
memory usage: 1.1+ KB


In [5]:
df.describe(include="all")

Unnamed: 0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters,origin
count,12,12,11.0,12.0,12.0,12.0,6.0,12.0,12.0,12
unique,3,2,,,,,,,,12
top,Nighthawks,1st,,,,,,,,Arizona
freq,4,6,,,,,,,,1
mean,,,202.090909,8.416667,1127.916667,174.333333,1.833333,0.583333,11.083333,
std,,,234.300429,10.849871,240.241719,280.254214,0.752773,0.514929,12.324833,
min,,,25.0,2.0,849.0,1.0,1.0,0.0,2.0,
25%,,,40.0,3.75,983.5,34.25,1.25,0.0,2.75,
50%,,,62.0,6.0,1025.5,55.0,2.0,1.0,3.5,
75%,,,378.5,8.0,1174.25,141.0,2.0,1.0,24.0,


In [6]:
df.head()

Unnamed: 0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters,origin
0,Nighthawks,1st,523.0,5,1045,1,1.0,1,4,Arizona
1,Nighthawks,1st,52.0,42,957,5,,0,24,California
2,Nighthawks,2nd,25.0,2,1099,62,3.0,1,31,Texas
3,Nighthawks,2nd,616.0,2,1400,26,,1,2,Florida
4,Dragoons,1st,43.0,4,1592,73,2.0,0,3,Maine


In [7]:
df[df["readiness"].isnull()]

Unnamed: 0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters,origin
1,Nighthawks,1st,52.0,42,957,5,,0,24,California
3,Nighthawks,2nd,616.0,2,1400,26,,1,2,Florida
5,Dragoons,1st,234.0,7,1006,37,,1,4,Iowa
7,Dragoons,2nd,62.0,3,849,48,,1,31,Washington
8,Scouts,1st,,4,973,48,,0,2,Oregon
11,Scouts,2nd,35.0,9,1523,345,,1,3,Georgia


### Step 3. Drop features, if there are more than 30% missing values.

Al ser 12 entradas, debemos establecer el punto de corte en 4 entradas nulas (3 entradas nulas supondría un 25% de valores nulos)

In [8]:
df.dropna(axis = 1, thresh=len(df.index)*0.7)

Unnamed: 0,regiment,company,deaths,battles,size,veterans,armored,deserters,origin
0,Nighthawks,1st,523.0,5,1045,1,1,4,Arizona
1,Nighthawks,1st,52.0,42,957,5,0,24,California
2,Nighthawks,2nd,25.0,2,1099,62,1,31,Texas
3,Nighthawks,2nd,616.0,2,1400,26,1,2,Florida
4,Dragoons,1st,43.0,4,1592,73,0,3,Maine
5,Dragoons,1st,234.0,7,1006,37,1,4,Iowa
6,Dragoons,2nd,523.0,8,987,949,0,24,Alaska
7,Dragoons,2nd,62.0,3,849,48,1,31,Washington
8,Scouts,1st,,4,973,48,0,2,Oregon
9,Scouts,1st,73.0,7,1005,435,0,3,Wyoming


### Step 4. Fill missing values with the mean of their regiment.

In [9]:
df["deaths"] = df.groupby("regiment")["deaths"].transform(lambda val: val.fillna(val.mean()))
df.info()
df["deaths"]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   regiment   12 non-null     object 
 1   company    12 non-null     object 
 2   deaths     12 non-null     float64
 3   battles    12 non-null     int64  
 4   size       12 non-null     int64  
 5   veterans   12 non-null     int64  
 6   readiness  6 non-null      float64
 7   armored    12 non-null     int64  
 8   deserters  12 non-null     int64  
 9   origin     12 non-null     object 
dtypes: float64(2), int64(5), object(3)
memory usage: 1.1+ KB


0     523.000000
1      52.000000
2      25.000000
3     616.000000
4      43.000000
5     234.000000
6     523.000000
7      62.000000
8      48.333333
9      73.000000
10     37.000000
11     35.000000
Name: deaths, dtype: float64

In [30]:
medias = dict(df.groupby("regiment")["readiness"].mean())
medias


{'Dragoons': np.float64(2.0),
 'Nighthawks': np.float64(2.0),
 'Scouts': np.float64(1.5)}

In [11]:
df["readiness"] = df.groupby("regiment")["readiness"].transform(lambda val: val.fillna(val.mean()))
df

Unnamed: 0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters,origin
0,Nighthawks,1st,523.0,5,1045,1,1.0,1,4,Arizona
1,Nighthawks,1st,52.0,42,957,5,2.0,0,24,California
2,Nighthawks,2nd,25.0,2,1099,62,3.0,1,31,Texas
3,Nighthawks,2nd,616.0,2,1400,26,2.0,1,2,Florida
4,Dragoons,1st,43.0,4,1592,73,2.0,0,3,Maine
5,Dragoons,1st,234.0,7,1006,37,2.0,1,4,Iowa
6,Dragoons,2nd,523.0,8,987,949,2.0,0,24,Alaska
7,Dragoons,2nd,62.0,3,849,48,2.0,1,31,Washington
8,Scouts,1st,48.333333,4,973,48,1.5,0,2,Oregon
9,Scouts,1st,73.0,7,1005,435,1.0,0,3,Wyoming


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   regiment   12 non-null     object 
 1   company    12 non-null     object 
 2   deaths     12 non-null     float64
 3   battles    12 non-null     int64  
 4   size       12 non-null     int64  
 5   veterans   12 non-null     int64  
 6   readiness  12 non-null     float64
 7   armored    12 non-null     int64  
 8   deserters  12 non-null     int64  
 9   origin     12 non-null     object 
dtypes: float64(2), int64(5), object(3)
memory usage: 1.1+ KB


### Step 5. Set the 'origin' colum as the index of the dataframe

In [13]:
df.set_index("origin",inplace=True)

### Step 6. Select the 'deaths', 'size' and 'deserters' columns from Maine and Alaska

In [14]:
df.head()

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1.0,1,4
California,Nighthawks,1st,52.0,42,957,5,2.0,0,24
Texas,Nighthawks,2nd,25.0,2,1099,62,3.0,1,31
Florida,Nighthawks,2nd,616.0,2,1400,26,2.0,1,2
Maine,Dragoons,1st,43.0,4,1592,73,2.0,0,3


In [15]:
df.loc[["Maine", "Alaska"],["deaths", "size", "deserters"]]

Unnamed: 0_level_0,deaths,size,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Maine,43.0,1592,3
Alaska,523.0,987,24


### Step 7. Select the rows 3 to 7 and the columns 3 to 6

In [31]:
df.iloc[2:7,2:6]

Unnamed: 0_level_0,deaths,battles,size,veterans
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Texas,25.0,2,1099,62
Florida,616.0,2,1400,26
Maine,43.0,4,1592,73
Iowa,234.0,7,1006,37
Alaska,523.0,8,987,949


### Step 8. Select every row after the fourth row and all columns

In [32]:
df.iloc[3:]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Florida,Nighthawks,2nd,616.0,2,1400,26,2.0,1,2
Maine,Dragoons,1st,43.0,4,1592,73,2.0,0,3
Iowa,Dragoons,1st,234.0,7,1006,37,2.0,1,4
Alaska,Dragoons,2nd,523.0,8,987,949,2.0,0,24
Washington,Dragoons,2nd,62.0,3,849,48,2.0,1,31
Oregon,Scouts,1st,48.333333,4,973,48,1.5,0,2
Wyoming,Scouts,1st,73.0,7,1005,435,1.0,0,3
Louisana,Scouts,2nd,37.0,8,1099,63,2.0,1,2
Georgia,Scouts,2nd,35.0,9,1523,345,1.5,1,3


### Step 9. Select every row up to the 4th row and all columns

In [18]:
df.iloc[:4]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1.0,1,4
California,Nighthawks,1st,52.0,42,957,5,2.0,0,24
Texas,Nighthawks,2nd,25.0,2,1099,62,3.0,1,31
Florida,Nighthawks,2nd,616.0,2,1400,26,2.0,1,2


### Step 10. Select the 3rd column up to the 7th column

In [19]:
df[df.columns[3:7]]

Unnamed: 0_level_0,battles,size,veterans,readiness
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Arizona,5,1045,1,1.0
California,42,957,5,2.0
Texas,2,1099,62,3.0
Florida,2,1400,26,2.0
Maine,4,1592,73,2.0
Iowa,7,1006,37,2.0
Alaska,8,987,949,2.0
Washington,3,849,48,2.0
Oregon,4,973,48,1.5
Wyoming,7,1005,435,1.0


### Step 11. Select rows where df.deaths is greater than 50

In [35]:
df[df["deaths"]>50]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1.0,1,4
California,Nighthawks,1st,52.0,42,957,5,2.0,0,24
Florida,Nighthawks,2nd,616.0,2,1400,26,2.0,1,2
Iowa,Dragoons,1st,234.0,7,1006,37,2.0,1,4
Alaska,Dragoons,2nd,523.0,8,987,949,2.0,0,24
Washington,Dragoons,2nd,62.0,3,849,48,2.0,1,31
Wyoming,Scouts,1st,73.0,7,1005,435,1.0,0,3


In [34]:
df[df["deaths"]>50].index

Index(['Arizona', 'California', 'Florida', 'Iowa', 'Alaska', 'Washington',
       'Wyoming'],
      dtype='object', name='origin')

### Step 12. Select rows where df.deaths is greater than 500 or less than 50

In [37]:
df[(df["deaths"]<50) | (df["deaths"] > 500)]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1.0,1,4
Texas,Nighthawks,2nd,25.0,2,1099,62,3.0,1,31
Florida,Nighthawks,2nd,616.0,2,1400,26,2.0,1,2
Maine,Dragoons,1st,43.0,4,1592,73,2.0,0,3
Alaska,Dragoons,2nd,523.0,8,987,949,2.0,0,24
Oregon,Scouts,1st,48.333333,4,973,48,1.5,0,2
Louisana,Scouts,2nd,37.0,8,1099,63,2.0,1,2
Georgia,Scouts,2nd,35.0,9,1523,345,1.5,1,3


In [38]:
df[(df["deaths"]<50) | (df["deaths"] > 500)].index

Index(['Arizona', 'Texas', 'Florida', 'Maine', 'Alaska', 'Oregon', 'Louisana',
       'Georgia'],
      dtype='object', name='origin')

### Step 13. Select all the regiments not named "Dragoons"

In [39]:
df.regiment[df["regiment"] != "Dragoons"]

origin
Arizona       Nighthawks
California    Nighthawks
Texas         Nighthawks
Florida       Nighthawks
Oregon            Scouts
Wyoming           Scouts
Louisana          Scouts
Georgia           Scouts
Name: regiment, dtype: object

### Step 14. Select the rows called Texas and Arizona

In [40]:
df.loc[["Texas","Arizona"]]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Texas,Nighthawks,2nd,25.0,2,1099,62,3.0,1,31
Arizona,Nighthawks,1st,523.0,5,1045,1,1.0,1,4


### Step 15. Select the third cell in the row named Arizona

In [41]:
df.loc["Arizona"].iloc[2]

np.float64(523.0)

### Step 16. Select the third cell down in the column named deaths

In [42]:
df["deaths"].iloc[2]

np.float64(25.0)