In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Cleaning outliers

The cars dudes have this problem, maybe you do too?

In [None]:
# Load the CORGIS cars data set straight from its public CSV and display it
cars_url = 'https://corgis-edu.github.io/corgis/datasets/csv/cars/cars.csv'
cars = pd.read_csv(cars_url)
cars

In [None]:
# Column-wise maximum of every column: a quick scan for implausibly large values
cars.max()

Max highway mpg 223 !?!?!? That's gotta be a mistake. Here's how to find it...

In [None]:
# Boolean-mask filter: show only the rows whose highway mpg exceeds 100
cars[ cars['Fuel Information.Highway mpg'] > 100 ]

...and fix it. Using that index right there and the column name, we can reach in and replace it with a reasonable value:

In [None]:
# Overwrite the single bad cell (row label 3686) with a plausible value,
# then summarize the column to confirm the maximum is sane again
mpg_col = 'Fuel Information.Highway mpg'
cars.at[3686, mpg_col] = 23
cars[mpg_col].describe()

But what if there's still a problem in another column? The standard way to deal with everything at once is using 1.5xIQR (Inter-Quartile-Range -- remember the placement of the boxplot whiskers?). As [this stackoverflow answer](https://stackoverflow.com/questions/35827863/remove-outliers-in-pandas-dataframe-using-percentiles) says, 'Use this code and don't waste your time':

In [None]:
# Start over with the outlier in again
cars = pd.read_csv('https://corgis-edu.github.io/corgis/datasets/csv/cars/cars.csv')

# Quartiles and IQR fences are only meaningful for numeric columns. Restricting
# to them up front also keeps this cell working on recent pandas, where
# quantile() and DataFrame-vs-Series comparisons no longer silently skip
# non-numeric columns.
cars_numeric = cars.select_dtypes(include='number')

Q1 = cars_numeric.quantile(0.25)
Q3 = cars_numeric.quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier if ANY of its numeric values falls outside the
# 1.5 x IQR fences (the same rule that places boxplot whiskers)
below_fence = cars_numeric < (Q1 - 1.5 * IQR)
above_fence = cars_numeric > (Q3 + 1.5 * IQR)
outliers = cars[(below_fence | above_fence).any(axis=1)]
outliers

In [None]:
# Per-column maxima among the flagged outlier rows only
outliers.max()

In [None]:
# Here's where we strip cars of outliers. Same fence test as before, negated with a tilde ~
# Raise the multiplier (e.g. to 2 or 3) if this throws away more rows than you want.
iqr_multiplier = 1.5

# Compare only the columns that actually have quartiles, so the DataFrame and
# the Q1/Q3/IQR Series line up column-for-column
fenced_cols = cars[Q1.index]
is_outlier = ((fenced_cols < (Q1 - iqr_multiplier * IQR)) |
              (fenced_cols > (Q3 + iqr_multiplier * IQR))).any(axis=1)

# .copy() makes the result an independent DataFrame, so later cells can add
# or modify columns without SettingWithCopyWarning
cars = cars[~is_outlier].copy()
cars.max()

This kind of thing might remove more stuff than you want, like all the Bentleys and Lamborghinis etc that have outlier horsepower. You can try increasing 1.5 to maybe 2-3, or maybe just stick with the repair of isolated cells as shown above

# Setting column names

Your data may have no header row naming the columns, or you may simply want to use your own column names that are easier to type:

In [None]:
# Read the file with read_csv's defaults, i.e. without supplying any column names
abalone=pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data')
abalone

In [None]:
# Supply our own header: one name per column, in file order
abalone_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
colnames = [
    'Sex', 'Length', 'Diameter', 'Height',
    'Whole Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight',
    'Rings',
]
abalone = pd.read_csv(abalone_url, names=colnames)
abalone

# Selecting DataFrame rows based on a condition

Certainly everybody will have to do this fundamental pandas operation 

In [None]:
# Count how many rows carry each distinct value of Sex
abalone['Sex'].value_counts()

In [None]:
# Rows where Sex is 'M'
is_male = abalone['Sex'] == 'M'
Mab = abalone[is_male]
Mab

In [None]:
# Rows where Sex is 'F'
is_female = abalone['Sex'] == 'F'
Fab = abalone[is_female]
Fab

In [None]:
# Rows where Sex is 'I'
is_infant = abalone['Sex'] == 'I'
Iab = abalone[is_infant]
Iab

# Control the size of Seaborn catplots

This is cramped, and seaborn prints a lot of warnings about points that cannot be placed

In [None]:
# Swarm plot of Whole Weight per Sex, colored by Rings, at catplot's default figure size
sns.catplot(data=abalone, x='Sex', y='Whole Weight', hue='Rings', kind='swarm')

sns.catplot() accepts height and aspect arguments. height=8 makes it taller, and aspect=2 makes it twice as wide as tall

In [None]:
# Same swarm plot, this time passing height=8 and aspect=2 to enlarge the figure
sns.catplot(data=abalone, x='Sex', y='Whole Weight', hue='Rings', kind='swarm',  height=8, aspect=2)

# Grouping rows with common values

The electricity dataset has multiple power companies per state. It can be useful to aggregate into a per-state DataFrame

In [None]:
# Load the CORGIS electricity data set and display it
elec_url = 'https://corgis-edu.github.io/corgis/datasets/csv/electricity/electricity.csv'
elec = pd.read_csv(elec_url)
elec

In [None]:
# Sometimes the reasonable way to groupby() is .sum()
# numeric_only=True keeps text columns out of the result: summing strings
# would otherwise concatenate every utility's text within a state, which is
# never what a per-state total should contain.
elecStateSum = elec.groupby('Utility.State').sum(numeric_only=True)
elecStateSum.head()

In [None]:
# For some columns (Demand.Summer.Peak?) the per-state max() may be more
# appropriate than a sum, or sometimes mean()
rows_by_state = elec.groupby('Utility.State')
elecStateMax = rows_by_state.max()
elecStateMax.head()

# More complicated grouping

There are multiple car makers from the same country. A new column can be added to create a coarser grouping than individual manufacturers

In [None]:
# How many rows each manufacturer (Identification.Make) has
cars['Identification.Make'].value_counts()

In [None]:
# Set this first, EVERYBODY gets 'Other', so after we set a bunch, we can see who we missed.
# assign() returns a new DataFrame with the extra column. cars was produced by
# row-filtering earlier, and writing a column directly onto such a frame can
# raise SettingWithCopyWarning; assigning to a fresh frame avoids that and
# keeps this cell safe to re-run.
cars = cars.assign(Country='Other')
cars['Country'].value_counts()

In [None]:
# Two steps so the important parts are easy to see:
# first a boolean mask of the rows to change, then .loc[rows, column] = new value
is_swedish = cars['Identification.Make'].isin(['Saab', 'Volvo'])
cars.loc[is_swedish, 'Country'] = 'Sweden'

In [None]:
# Same pattern for the German makes
german_makes = ['Audi', 'BMW', 'Mercedes-Benz', 'Porsche', 'Volkswagen']
is_german = cars['Identification.Make'].isin(german_makes)
cars.loc[is_german, 'Country'] = 'Germany'

In [None]:
# Tally of rows per Country value
cars['Country'].value_counts()

In [None]:
# Check what's left over: which makes still have Country equal to 'Other'
still_other = cars['Country'] == 'Other'
leftovers = cars[still_other]
leftovers['Identification.Make'].value_counts()

# Making a function to graph lots of stuff

In [None]:
# Load the basketball CSV from GitHub and peek at the first rows
bball_url = 'https://raw.githubusercontent.com/RubeRad/tcscs/master/Kenpom_cbb_dataALL.csv'
bball = pd.read_csv(bball_url)
bball.head()

I expect this kind of code would get a bit repetitive, and cut&paste is never a good idea

In [None]:
# Deliberately repetitive example: the same select-rows / pull-X / pull-Y / plot
# steps are written out once per team, all drawn on one shared set of axes.

# Start a new figure and grab its axes
plt.figure()
axes=plt.gca()

# Kentucky: select that team's rows, then plot AdjO against Year
kentacky = bball[ bball['Team'] == 'Kentucky']
kx = kentacky['Year']
ky = kentacky['AdjO']
axes.plot(kx, ky)

# Alabama: same steps (here `ax` holds the X values, it is not an Axes object)
alababama = bball[ bball['Team'] == 'Alabama']
ax = alababama['Year']
ay = alababama['AdjO']
axes.plot(ax, ay)

# Tennessee: same steps again
tennesaw  = bball[ bball['Team'] == 'Tennessee']
tx = tennesaw['Year']
ty = tennesaw['AdjO']
axes.plot(tx, ty)

# Legend labels are listed in the same order the lines were plotted
axes.legend(['Kentucky', 'Alabama', 'Tennessee'])

plt.show()


Better to make a function to do this kind of thing generically

In [None]:
def multi_plot(adf,
               scol_name,
               scol_vals,
               xcol_name,
               ycol_name,
               color=None,
               alpha=None,
               scatter=False,
               axes=None
              ):
    """Draw one plot per selected value of a column, all on the same axes.

    For each value in ``scol_vals``, the rows of ``adf`` whose ``scol_name``
    column equals that value are selected, and their ``ycol_name`` values are
    plotted against their ``xcol_name`` values.

    Parameters
    ----------
    adf : DataFrame
        The data to plot from.
    scol_name : str
        Name of the column rows are selected on.
    scol_vals : iterable
        Values of ``scol_name`` to select; one plot is drawn for each.
    xcol_name : str
        Name of the column used for X in each plot.
    ycol_name : str
        Name of the column used for Y in each plot.
    color : optional
        Passed through to the plotting call; None lets matplotlib choose.
    alpha : optional
        Passed through to the plotting call; None means matplotlib's default.
    scatter : bool, default False
        If True use ``axes.scatter``; otherwise ``axes.plot`` (lines).
    axes : optional
        Axes to draw on. If None, a new figure is created and its axes used.

    Returns
    -------
    The axes that were drawn on, so the caller can add more plots, titles,
    axis limits, and so on.
    """
    # Identity test, not ==: equality can be overridden by the object passed in
    # (or return a non-boolean), while `is None` always means "not supplied".
    if axes is None:
        plt.figure()
        axes = plt.gca()

    # Decide once which drawing method to use for every selected value
    draw = axes.scatter if scatter else axes.plot

    for val in scol_vals:
        # Rows whose selection column matches this value
        sub_df = adf[adf[scol_name] == val]

        # X and Y columns of those rows
        xs = sub_df[xcol_name]
        ys = sub_df[ycol_name]

        draw(xs, ys, color=color, alpha=alpha)

    return axes
    

In [None]:
# Faint gray background: one line for every team in the data
axes = multi_plot(bball, 'Team', bball['Team'].unique(), 'Year', 'AdjO', color='gray', alpha=0.1)

# Highlight a few teams in color on those same axes
highlight_colors = {'Duke': 'blue', 'Baylor': 'yellow', 'Virginia': 'orange'}
for team_name, team_color in highlight_colors.items():
    axes = multi_plot(bball, 'Team', [team_name], 'Year', 'AdjO', color=team_color, axes=axes)

# TODO: no legend is drawn yet for the highlighted teams