In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Cleaning outliers

The cars dudes have this problem, maybe you do too?

In [None]:
cars = pd.read_csv('https://corgis-edu.github.io/corgis/datasets/csv/cars/cars.csv')
cars

In [None]:
cars.max()

Max highway mpg 223 !?!?!? That's gotta be a mistake. Here's how to find it...

In [None]:
cars[ cars['Fuel Information.Highway mpg'] > 100 ]

...and fix it. Using that index right there and the column name, we can reach in and replace it with a reasonable value:

In [None]:
# 3686 is the row label (index) that the filter in the previous cell displayed.
# .at[row_label, column_name] reads or writes exactly one cell of the DataFrame.
cars.at[3686, 'Fuel Information.Highway mpg'] = 23
# describe() on the repaired column lets us check that the max is now sensible
cars['Fuel Information.Highway mpg'].describe()

But what if there's still a problem in another column? The standard way to deal with everything at once is using 1.5xIQR (Inter-Quartile-Range -- remember the placement of the boxplot whiskers?). As [this stackoverflow answer](https://stackoverflow.com/questions/35827863/remove-outliers-in-pandas-dataframe-using-percentiles) says, 'Use this code and don't waste your time':

In [None]:
# Start over with the outlier in again
cars = pd.read_csv('https://corgis-edu.github.io/corgis/datasets/csv/cars/cars.csv')

# Quartiles only make sense for numeric columns. pandas 2.x no longer skips
# text columns silently in quantile() (it raises a TypeError), and text cannot
# be compared against a numeric fence with < or >, so run the outlier test on
# the numeric columns only.
cars_numeric = cars.select_dtypes(include='number')
Q1 = cars_numeric.quantile(0.25)
Q3 = cars_numeric.quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier if ANY of its numeric values lies beyond the boxplot
# whiskers, i.e. more than 1.5 x IQR below Q1 or above Q3.
is_outlier = ((cars_numeric < (Q1 - 1.5 * IQR)) | (cars_numeric > (Q3 + 1.5 * IQR))).any(axis=1)
outliers = cars[is_outlier]
outliers

In [None]:
outliers.max()

In [None]:
# Here's where we strip cars of outliers. Same 1.5 x IQR mask as before, except
# add a tilde ~ so we keep the rows that are NOT outliers.
# Only the numeric columns can be compared against the quartile fences.
cars_numeric = cars.select_dtypes(include='number')
is_outlier = ((cars_numeric < (Q1 - 1.5 * IQR)) | (cars_numeric > (Q3 + 1.5 * IQR))).any(axis=1)
# .copy() makes the result an independent DataFrame, so columns can be added to
# it later without pandas raising a SettingWithCopyWarning.
cars = cars[~is_outlier].copy()
cars.max()

This kind of thing might remove more stuff than you want, like all the Bentleys and Lamborghinis etc that have outlier horsepower. You can try increasing 1.5 to maybe 2-3, or maybe just stick with the repair of isolated cells as shown above

# Setting column names

It could be your data has no names on top of the columns, it could be you want to use your own column names which are easier to type:

In [None]:
abalone=pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data')
abalone

In [None]:
colnames=['Sex', 'Length', 'Diameter', 'Height', 'Whole Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight', 'Rings']
abalone=pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data', names=colnames)
abalone

# Selecting DataFrame rows based on a condition

Certainly everybody will have to do this fundamental pandas operation 

In [None]:
abalone['Sex'].value_counts()

In [None]:
Mab = abalone[ abalone['Sex'] == 'M' ]
Mab

In [None]:
Fab = abalone[ abalone['Sex'] == 'F' ]
Fab

In [None]:
Iab = abalone[ abalone['Sex'] == 'I' ]
Iab

# Control the size of Seaborn catplots

This is cramped, and seaborn prints a lot of warnings about points that it cannot fit into the plot

In [None]:
sns.catplot(data=abalone, x='Sex', y='Whole Weight', hue='Rings', kind='swarm')

sns.catplot() accepts height and aspect arguments. height=8 makes it taller, and aspect=2 makes it twice as wide as tall

In [None]:
sns.catplot(data=abalone, x='Sex', y='Whole Weight', hue='Rings', kind='swarm', height=8, aspect=2)

While we're here, let's try some text and annotation

In [None]:
sns.catplot(data=abalone, x='Sex', y='Whole Weight', hue='Rings', kind='swarm', height=8, aspect=2)
a = plt.gca()
# Note that all the x values are relative to category ticks being at 0,1,2
a.text(0.5, 2.5, 'Howdy', color='purple')
a.annotate('Doody', xy=(1, 2.65), xytext=(1.5,2.4),
          color='r', arrowprops=dict(arrowstyle="->", color='r'))


# Linear Regression

Numpy can do that!

In [None]:
elec = pd.read_csv('https://corgis-edu.github.io/corgis/datasets/csv/electricity/electricity.csv')
elec

In [None]:
# These points look pretty linearly related
plt.scatter(elec['Demand.Summer Peak'], elec['Demand.Winter Peak'])

The x range goes out past 30000, but the y range only goes up to about 25000. If we were to graph a line with slope of 1, it would not fit this data

In [None]:
xrange = np.arange(0, 32000, 1000) # this is (0, 1000, 2000, ...31000) not 32k, because np.arange (like Python's range) leaves out the stop value
ax = plt.gca()
ax.scatter(elec['Demand.Summer Peak'], elec['Demand.Winter Peak'])
# dashed red reference line y = x (slope 1), drawn to compare against the data
ax.plot(xrange, xrange, c='r', ls='--')

In [None]:
slope, inter = np.polyfit(elec['Demand.Summer Peak'], elec['Demand.Winter Peak'], 1) # 1 is for linear
slope, inter

In [None]:
slope*xrange + inter

In [None]:
# Note that, rather than telling matplotlib to graph two pandas Series for you
# you can tell pandas to tell matplotlib to graph two Series for you
# Note ax= saves using an extra line for ax=plt.gca()
ax=elec.plot('Demand.Summer Peak', 'Demand.Winter Peak', # x and y column names, without elec[]
             kind='scatter')                             # kind of like seaborn
ax.plot(xrange, slope*xrange + inter, c='green')

# Grouping rows with common values

The electricity dataset has multiple power companies per state. It can be useful to aggregate into a per-state DataFrame

In [None]:
# Sometimes the reasonable way to groupby() is .sum()
elecStateSum = elec.groupby('Utility.State').sum()
elecStateSum.head()

In [None]:
# Sometimes (Demand.Summer.Peak ? ) max() may be more appropriate, or maybe sometimes mean()
elecStateMax = elec.groupby('Utility.State').max()
elecStateMax.head()

# More complicated grouping

There are multiple car makers from the same country, so a new column can be put together to create a bulkier grouping than individual manufacturers

In [None]:
cars['Identification.Make'].value_counts()

In [None]:
# Set this first, EVERYBODY gets other, so after we set a bunch, we can see who we missed
cars['Country'] = 'Other'
cars['Country'].value_counts()

In [None]:
# I put this on multiple lines just so you can see the important parts more clearly
cars.loc[ 
          cars['Identification.Make'].isin( ['Saab', 'Volvo'] ), 
          'Country' 
        ] = 'Sweden'

In [None]:
cars.loc[ cars['Identification.Make'].isin( ['Audi', 'BMW', 'Mercedes-Benz', 'Porsche', 'Volkswagen' ] ), 'Country' ] = 'Germany'

In [None]:
cars['Country'].value_counts()

In [None]:
# Check what's left over
leftovers = cars[ cars['Country'] == 'Other' ]
leftovers['Identification.Make'].value_counts()

# Making a function to graph lots of stuff

In [None]:
bball = pd.read_csv('https://raw.githubusercontent.com/RubeRad/tcscs/master/Kenpom_cbb_dataALL.csv')
bball.head()

I expect this kind of code would get a bit repetitive, and cut&paste is never a good idea

In [None]:
# One line per team on a shared set of axes. The three stanzas below are
# deliberately written out by hand to show how repetitive this gets.
plt.figure()
axes=plt.gca()

kentacky = bball[ bball['Team'] == 'Kentucky']
kx = kentacky['Year']
ky = kentacky['AdjO']
axes.plot(kx, ky)

alababama = bball[ bball['Team'] == 'Alabama']
# Named alx/aly rather than ax/ay: an earlier cell stores a matplotlib Axes in
# the notebook-wide variable `ax`, and reusing that name here would silently
# replace it with a pandas Series.
alx = alababama['Year']
aly = alababama['AdjO']
axes.plot(alx, aly)

tennesaw  = bball[ bball['Team'] == 'Tennessee']
tx = tennesaw['Year']
ty = tennesaw['AdjO']
axes.plot(tx, ty)

# legend entries are matched to the lines in the order they were plotted
axes.legend(['Kentucky', 'Alabama', 'Tennessee'])

plt.show()


Better to make a function to do this kind of thing generically

In [None]:
def multi_plot(adf,        # a DataFrame
               scol_name,  # the name of the column we are selecting rows from
               scol_vals,  # the list of values we are selecting; one plot for each
               xcol_name,  # the name of the column to use for X in each plot
               ycol_name,  # the name of the column to use for Y in each plot
               color=None, # if you let this stay None, matplotlib will choose colors
               alpha=None, # if you let this stay None, alpha=opacity=1
               scatter=False, # default plot (line)
               axes=None   # if you don't pass axes in, axes for a new figure will be returned
              ):
    """Draw one plot per selected value of a column, all on the same axes.

    For every value in ``scol_vals``, the rows of ``adf`` whose ``scol_name``
    column equals that value are selected, and their ``ycol_name`` column is
    plotted against their ``xcol_name`` column.

    Parameters
    ----------
    adf : DataFrame holding all the data.
    scol_name : name of the column used to select rows.
    scol_vals : iterable of values to select; one plot is drawn for each.
    xcol_name, ycol_name : names of the columns used for X and Y.
    color, alpha : passed straight through to matplotlib for every plot.
    scatter : if True use ``axes.scatter``, otherwise ``axes.plot`` (lines).
    axes : existing axes to draw on; if None a new figure is created.

    Returns
    -------
    The axes that were drawn on, so the caller can graph more stuff, add
    titles, mess with the range, etc.
    """
    # `is None` checks identity. `== None` would instead call the object's own
    # __eq__, which is not guaranteed to answer with a plain True/False.
    if axes is None:
        plt.figure()
        axes = plt.gca()

    for val in scol_vals:
        # Grab a DataFrame of just the rows we want from the selection column
        sub_df = adf[adf[scol_name] == val]

        # Grab the Series (columns) for X and Y
        xs = sub_df[xcol_name]
        ys = sub_df[ycol_name]

        # Plot these xs vs ys
        if scatter:
            axes.scatter(xs, ys, color=color, alpha=alpha)
        else:
            axes.plot(xs, ys, color=color, alpha=alpha)

    return axes
    

In [None]:
#good_teams = ['Duke', 'Kentucky','Baylor','Virginia','Gonzaga']
axes = multi_plot(bball, 'Team', bball['Team'].unique(), 'Year', 'AdjO', color='gray',   alpha=0.1)
axes = multi_plot(bball, 'Team', ['Duke'],               'Year', 'AdjO', color='blue',   axes=axes)
axes = multi_plot(bball, 'Team', ['Baylor'],             'Year', 'AdjO', color='yellow', axes=axes)
axes = multi_plot(bball, 'Team', ['Virginia'],           'Year', 'AdjO', color='orange', axes=axes)
#axes.legend( good_teams )

# 2023 tips

In [None]:
# Recommend clear this cell's output when it's done
!pip install geopandas
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import geopandas as gpd

In [None]:
# these are how to load the world, U.S. states and U.S. counties geopandas datasets
# gpd.datasets (and its bundled 'naturalearth_lowres' file) was removed in
# geopandas 1.0, so read the Natural Earth 1:110m countries layer directly
# from its download URL instead.
# NOTE: column names in this file may differ from the old bundled dataset --
# check world.columns before using it.
world = gpd.read_file('https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip')
states = gpd.read_file('https://raw.githubusercontent.com/RubeRad/camcom/master/us-albers.json')
counties = gpd.read_file('https://raw.githubusercontent.com/RubeRad/tcscs/master/us_counties.json')

## Merging pandas data with geopandas data
### One data row per map row
Very often, your interesting data will be in a pandas dataframe, but your map will be in a separate geopandas dataframe. This example shows how to merge them together in the simple case that your pieces of data are already just 1-for-1 with your map objects (map is states, data is one row per state)

In [None]:
# Slurp a csv off the web
ev2016 = pd.read_csv('https://raw.githubusercontent.com/RubeRad/camcom/master/2016ev.csv')
ev2016.head() # note one state, one row, state name is in column 'State'

In [None]:
states.head(2) # also one state, one row, state name is in column 'name'

Pandas has its own merge command, but to make sure your merged DataFrame retains its geopandas mapping goodness, you want to use geopandas' own merge, like this:

* `ev2016map = `: choose a variable name to hold the merged data
* `states`: this is the map of the states
* `.merge( ... )`: we want to merge another DataFrame together with this mapping one
* `ev2016`: this is the other DataFrame we want to merge
* -- but how do we know which rows should merge together? They need matching state names 'Alabama', 'Alaska', ...
* `left_on='name'`: this is the column in `states` that has those state names
* `right_on='State'`: this is the column in `ev2016` that has matching state names

In [None]:
ev2016map = states.merge(ev2016, left_on='name', right_on='State')

### Checking the merged DataFrames
`shape` helps to show if anything might have fallen out of the merge

In [None]:
ev2016.shape

In [None]:
states.shape

In [None]:
ev2016map.shape

If you do a `head()` on the merged dataframe (and maybe scroll to the right) you can check that it has columns from both dataframes

In [None]:
ev2016map.head(3)

You can also check the list of columns

In [None]:
ev2016map.columns

You can also `info()`, look especially at the "Non-Null Count"

In [None]:
ev2016map.info()

### Using the merged frame for plotting

What's the difference between these?

In [None]:
states.plot()

In [None]:
ev2016map.plot()

No difference, really! Both times they are making the map by plotting the Geometry column, and ignoring any data that is in any other columns

Here, when you specify a column, geopandas can see that it's a categorical column with two values, and it chooses its own colors.

In [None]:
ev2016map.plot(column='Winning Party')

If you want to choose your own colors, you can make a new column filled with colors like this:

In [None]:
# 'red' and 'blue' is a little intense
# .map() looks up each 'Winning Party' value in the dict and gives back the matching color name
ev2016map['winning_party_color'] = ev2016map['Winning Party'].map({'Republicans':'pink', 'Democrats':'lightblue'})
ev2016map.head()
# scroll right to see new column 'winning_party_color'

In [None]:
ev2016map.winning_party_color.value_counts()

In [None]:
ev2016map.plot(color=ev2016map.winning_party_color) #, edgecolor='gray')

## Merging with multiple data rows per map row

This CORGIS dataset is the Washington Post database on deaths by police shootings (from 2015 to late 2021). 

Note there are multiple shootings per state -- because there are only 51 'states' (including DC) in the map, and over 6000 shootings in the WaPo data!

In [None]:
wapo = pd.read_csv('https://corgis-edu.github.io/corgis/datasets/csv/police_shootings/police_shootings.csv')
wapo

In [None]:
wapo.columns

In [None]:
states.columns

### Naive merging -- too many rows!
Let's first try merging as before.
* `wapo` has the state in column `Incident.Location.State`, with 2-letter values like `WA, OR, KS`, etc
* to merge with the `states` mapping dataframe, we need to use column `iso_3166_2`

In [None]:
naive = states.merge(wapo, left_on='iso_3166_2', right_on='Incident.Location.State')

OK that has no errors, so it merged. What do we actually have? How big is it?

In [None]:
naive

In [None]:
# This works but is slow!
naive.plot()

In [None]:
# This is also slow!
# When it's done, what is this 'age' that is plotted for each state?
naive.plot(column='Person.Age', legend=True)

What's happening there is that geopandas is going through all 6000+ rows of `naive`, and drawing each state. So each of the 50 states is overdrawn as many times as that state shows up in the dataframe. And the color that is shown for `'Person.Age'` is the age of whatever is the last person that gets (over)plotted for that state!

Not useful!

What we need is a data frame size 51, so it can line up individual ready-to-plot numerical values against the states.

### Plotting *Counts*
Let's say we want to just plot the *number* of shootings in each state (and without all that redundant overplotting). We need a 51-row dataframe with those counts in it, so when we merge, we have exactly the data we want. 
1. *First* work on the data to get it to 51 rows
1. *Then* merge it with the map so it plots right (and quick)

In [None]:
# groupby(...).count() makes one row per state; each remaining column holds the
# number of non-missing entries that column had for that state
# Note in the head(), Incident.Location.State is displayed lower, because it is the new 'index'
# if you want that to go away, then add...                         THIS  
wapo_counts = wapo.groupby('Incident.Location.State').count() # .reset_index()
wapo_counts.head(3)

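***NOTE:*** All the columns have the same values! It's just the count of the number of rows that got combined by the `groupby()` (strictly, `count()` counts the non-missing values in each column, so a column with blanks could show a smaller number). We can choose any of them when we end up plotting!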

In [None]:
wapo_counts.shape

In [None]:
states.shape

In [None]:
count_merge = states.merge(wapo_counts, left_on='iso_3166_2', right_on='Incident.Location.State')
count_merge.shape

In [None]:
count_merge.head(3)

In [None]:
count_merge.plot(column='Person.Name', legend=True)

So this is basically just saying "there are more people in CA and TX to get shot by the police". It would be better to make this per-capita, or per-million. Pandas can do that

In [None]:
# Make a new column that does arithmetic on other columns
# 'Person.Name' here is the per-state count produced by groupby().count(), not a name
# NOTE(review): assumes the `census` column from the states map is each state's population -- confirm
count_merge['count_per_million'] = count_merge['Person.Name'] / count_merge.census * 1000000
count_merge.count_per_million.describe()

In [None]:
count_merge.plot(column='count_per_million', legend=True)

### Plotting *Averages*
In that last example we used `groupby().count()`. There are other options, most notably `groupby().mean()` but also `.sum()`, `.max()`, `.min()`, etc.

In [None]:
# numeric_only=True limits the average to the numeric columns. There is no way
# to take an average of a bunch of names, and pandas 2.x raises a TypeError
# (older versions only warned and left those columns out) if asked to try.
wapo_avg = wapo.groupby('Incident.Location.State').mean(numeric_only=True)

In [None]:
wapo_avg.head()

Of those remaining numerical columns that `groupby().mean()` knew how to compute a statewide average for, most of them are uninteresting (average month of the year is about 6, average day of the month is about 15, yeah years have 12 months and months have about 30 days)

* What meaning might be present in per-state average Person.Age?
* What meaning might be present in per-state average Incident.Date.Year?

As before, we have to merge with the map before plotting

In [None]:
avg_merge = states.merge(wapo_avg, left_on='iso_3166_2', right_on='Incident.Location.State')
avg_merge.shape

In [None]:
avg_merge.plot(column='Person.Age', legend=True)

What does this say? Why is NH (VT?) bright yellow? Why are CA/IL dark blue? Why are ND/SD/AK dark blue?

In [None]:
avg_merge.plot(column='Incident.Date.Year', legend=True)

What does this say? What does it mean that WY is so dark? What are the brightest states and what does that mean?

## Adding coarse categories
50 states is a lot, that would be too many colors to distinguish, or too many bars in a barplot to see what's going on. You can add a new column with a coarser categorization.

This example shows how to add a categorical column to a dataframe and visualize it with geopandas or seaborn.
I'm going to take a guess at labeling states as to whether they have a dominant party, or are a 'battleground' state.

In [None]:
# First create a new column, and fill it entirely with a nonsense value
states['dominant'] = 'NOPE'     # creating a new column needs [' '] syntax
states.dominant.value_counts()  # once the column exists, can switch to the simpler . syntax

In [None]:
# Step-by-step, choose a bunch of states and set their value in the new column
# Note all these 2-letter abbreviations match what is found in column 
#            states.iso_3166_2
Rstates = ['WY','KY','AL','MS','TX','ND','SD']  # not a complete list yet

# now we select those rows
# (isin() gives a True/False value per row: True where the abbreviation is in the list)
whichrows = states.iso_3166_2.isin( Rstates )

# the new column full of NOPE that we need to fill in is called 'dominant'
whichcol = 'dominant'

# now we can use the pandas loc[] command to set the party label for those states
states.loc[whichrows, whichcol] = 'Republican'

In [None]:
# How does it look now?
states.dominant.value_counts()

In [None]:
# Now let's do the same in two lines, for some blue states
Dstates = ['CA', 'OR', 'WA', 'NY', 'MA', 'DC']
states.loc[ states.iso_3166_2.isin(Dstates),  'dominant' ] = 'Democrat'
# could actually do it in one line, putting that whole [] list inside the isin()
# but that would be a very long line, hard to read

In [None]:
# How does it look now?
states.dominant.value_counts()

In [None]:
Bstates = ['PA','OH','FL','GA','WI','MN','MI']
states.loc[ states.iso_3166_2.isin(Bstates), 'dominant'] = 'Battleground'

In [None]:
# How does it look now?
states.dominant.value_counts()

In [None]:
# what's missing? make a slice to see what states still need assignments
nopestates = states[ states.dominant == 'NOPE' ]
nopestates.head()
#nopestates.iso_3166_2.value_counts()

In [None]:
# when it's all done it will look more like this
Rstates = ['WY','KY','AL','MS','TX','ND','SD','AR','LA','MT','OK','TN','WV','ID','KS','SC','UT','IA','NE','NH']
Dstates = ['CA', 'OR', 'WA', 'NY', 'MA', 'DC','CT','IL','VA','DE','MD','NJ','VT','ME','RI']
Bstates = ['PA','OH','FL','GA','WI','MN','MI','AZ','CO','IN','NM','NC','MO','NV','AK','HI']

# Pair each label with its list of abbreviations and fill in the 'dominant'
# column one category at a time (same order as writing the three lines out).
category_lists = (
    ('Republican',   Rstates),
    ('Democrat',     Dstates),
    ('Battleground', Bstates),
)
for party_label, abbrevs in category_lists:
    states.loc[states.iso_3166_2.isin(abbrevs), 'dominant'] = party_label

In [None]:
nopestates = states[ states.dominant == 'NOPE' ]
nopestates.head()

### Show the categorization: Way 1
Make a separate slice per category and use a separate plot to put them all on the same axes

In [None]:
dfRep = states[ states.dominant=='Republican']
dfDem = states[ states.dominant=='Democrat']
dfBtl = states[ states.dominant=='Battleground']

In [None]:
# one figure, one set of axes shared by all three category slices
fig, axes = plt.subplots(figsize=(12,10))

# draw each slice onto the shared axes in its own solid color
for party_df, party_color in ((dfRep, 'r'), (dfDem, 'b'), (dfBtl, 'purple')):
    party_df.plot(ax=axes, color=party_color)

plt.show()



### Show the categorization: Way 2
Make a new column full of color codes/names and tell geopandas to use it

In [None]:
states['dominant_party_color'] = 'red' # everybody's red, EXCEPT...
# use the .loc command to change colors for the other categories
#           < first part is which rows   >
#                                            < second part is col >
#                                                                      < last is new value>
states.loc[ states.dominant=='Democrat',     'dominant_party_color' ] = 'blue'
states.loc[ states.dominant=='Battleground', 'dominant_party_color' ] = 'purple'

In [None]:
states.head()
# scroll right to check dominant_party_color

In [None]:
states.dominant_party_color.value_counts()

In [None]:
states.plot(color = states.dominant_party_color)

### Other things you can do with categorization: Use seaborn for plotting

In [None]:
# re-merge with washington post counts, now that states has the categorization columns
count_merge = states.merge(wapo_counts, left_on='iso_3166_2', right_on='Incident.Location.State')
count_merge.shape

In [None]:
sns.catplot(data=count_merge, x='dominant', y='Person.Name')

### Other things you can do with categorization: Use geopandas for mapping

In [None]:
avg_merge = states.merge(wapo_avg, left_on='iso_3166_2', right_on='Incident.Location.State')
avg_merge.shape

In [None]:
dfRep = avg_merge[ avg_merge.dominant=='Republican']
dfDem = avg_merge[ avg_merge.dominant=='Democrat']
dfBtl = avg_merge[ avg_merge.dominant=='Battleground']

In [None]:
# one figure, one set of axes shared by all three category slices
fig, axes = plt.subplots(figsize=(12,10))

# shade each category by average age, using a colormap in that category's hue
for party_df, party_cmap in ((dfRep, 'Reds'), (dfDem, 'Blues'), (dfBtl, 'Purples')):
    party_df.plot(ax=axes, column='Person.Age', cmap=party_cmap, legend=True)

plt.show()