# Lab

## Import

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Global Matplotlib defaults: typeface, math-text font set, base font size
# and figure resolution, applied in a single update.
plt.rcParams.update({
    'font.family': 'Avenir',
    'mathtext.fontset': 'stix',
    'font.size': 12,
    'figure.dpi': 300,
})

In [None]:
# Keyword arguments unpacked into `fig.savefig(...)` when figures are exported.
IMG_EXPORT_CONFIG = dict(dpi=500, bbox_inches='tight', pad_inches=0.15)

## Code

### Numpy

In [None]:
# Two equal-length 1-D arrays; adding them works element by element.
x, y = np.array([3, 4, 5]), np.array([4, 9, 7])
np.add(x, y)

In [None]:
# A list of lists becomes a two-dimensional array; report its number of axes.
x_2d = np.array([[1, 2],
                 [3, 4]])
np.ndim(x_2d)

In [None]:
# Element type of each array, shown as a pair.
tuple(arr.dtype for arr in (x, x_2d))

In [None]:
# Sum over all elements of each array, regardless of its shape.
x.sum(), x_2d.sum()

In [None]:
# Six consecutive integers, then the same data arranged as 2 rows x 3 columns.
x = np.arange(1, 7)
x_reshape = np.reshape(x, (2, 3))
x_reshape

In [None]:
# Top-left entry: first row, then first column (indices start at 0).
x_reshape[0][0]

In [None]:
# Transpose: rows become columns and vice versa.
np.transpose(x_reshape)

In [None]:
# Element-wise square roots, rounded to 2 and 4 decimals respectively.
np.sqrt(x).round(2), np.sqrt(x_reshape).round(4)

In [None]:
# Element-wise squares of both arrays.
np.square(x), np.square(x_reshape)

In [None]:
# 50 draws from the standard normal distribution, rounded to 2 decimals.
# These come from NumPy's global random state, so they change on every run.
x = np.random.normal(size=50).round(2)
x

In [None]:
# 50 draws from a normal distribution with mean 50 and standard deviation 1,
# rounded to 2 decimals.
y = np.random.normal(loc=50, scale=1, size=50).round(2)

In [None]:
# 2 x 2 correlation matrix between x and the shifted sum x + y
np.corrcoef(x, x + y)

In [None]:
# `rng` is a random number generator created by `default_rng`;
# fixing the seed makes every run produce the same sequence of draws.
rng = np.random.default_rng(42)
# Two draws from a normal (Gaussian) distribution with mean 0 and scale 5.
rng.normal(loc=0.0, scale=5, size=2)

In [None]:
# Ten draws from the standard normal distribution (mean 0, variance 1).
rng.standard_normal(10)

In [None]:
# Ten standard-normal draws from the seeded generator, rounded to 2 decimals.
y = rng.standard_normal(size=10).round(2)

In [None]:
# Mean of y computed twice: with the NumPy function and with the array method.
np.mean(y), y.mean()

In [None]:
# Variance of y three ways: the NumPy function, the array method, and the
# explicit mean of squared deviations from the mean.
np.var(y), y.var(), np.mean((y - y.mean())**2)

In [None]:
# Standard deviation as the square root of the variance, next to the `std` method.
np.sqrt(y.var()), y.std()

In [None]:
# A 10 x 3 matrix of standard-normal draws.
x = rng.standard_normal(size=(10, 3))
x

In [None]:
# Means taken along axis 0 (one value per column) and along axis 1 (one per row).
np.mean(x, axis=0), np.mean(x, axis=1)

In [None]:
# 50 evenly spaced points from -pi to pi, both endpoints included.
np.linspace(start=-np.pi, stop=np.pi, num=50)

### Graph

In [None]:
# Scatter plot of two independent standard-normal samples of size 100.
x = rng.standard_normal(size=100)
y = rng.standard_normal(size=100)

fig, ax = plt.subplots(figsize=(8, 8))
# An existing figure could instead be resized with fig.set_size_inches(12, 3).
ax.scatter(x, y, marker='o')
ax.set(
    xlabel='This is the $x$-axis',
    ylabel='This is the $y$-axis',
    title='Plot of $x$ vs $y$',
)
fig.tight_layout()

In [None]:
# A 2 x 3 grid of axes; only the top-middle and bottom-right panels are drawn on.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
top_middle, bottom_right = axes[0, 1], axes[1, 2]

top_middle.scatter(x, y, marker='o')
bottom_right.scatter(x, y, marker='+')

# Only the top-middle panel receives axis labels and a title.
top_middle.set(
    xlabel='This is the $x$-axis',
    ylabel='This is the $y$-axis',
    title='Plot of $x$ vs $y$',
)
fig.tight_layout()

In [None]:
# Fresh samples plotted with explicit axis limits instead of the automatic ones.
x = rng.standard_normal(size=100)
y = rng.standard_normal(size=100)

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(x, y, marker='o')
ax.set(xlim=(-3, 3), ylim=(-4, 4))
fig.tight_layout()

In [None]:
# Write the current figure as PNG and as PDF with the shared export settings.
for export_name in ('Test Plot.png', 'Test Plot.pdf'):
    fig.savefig(export_name, **IMG_EXPORT_CONFIG)

In [None]:
# Contour lines of f[i, j] = cos(y[i]) * 1 / (1 + x[j]**2) on a 50 x 50 grid
# spanning [-pi, pi] in both directions.
x = np.linspace(-np.pi, np.pi, num=50)
y = x  # same grid points on both axes
f = np.outer(np.cos(y), 1 / (1 + x**2))

fig, ax = plt.subplots(figsize=(8, 8))
ax.contour(x, y, f, levels=45)
fig.tight_layout()

# Applied

## Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Global Matplotlib defaults: typeface, math-text font set, base font size
# and figure resolution, applied in a single update.
plt.rcParams.update({
    'font.family': 'Avenir',
    'mathtext.fontset': 'stix',
    'font.size': 12,
    'figure.dpi': 300,
})

In [None]:
# Figure export settings (resolution, tight bounding box, padding) kept in one place.
IMG_EXPORT_CONFIG = dict(dpi=500, bbox_inches='tight', pad_inches=0.15)

In [None]:
# Seaborn look: white grid background and an 8-colour 'hls' palette.
sns.set_theme(style='whitegrid')
sns.set_palette(palette='hls', n_colors=8)

## Code

### Exercise 8 / 65,66

(a) Use the `pd.read_csv()` function to read the data into Python. Call the loaded data `college`. Make sure that you have the directory set to the correct location for the data.

In [None]:
# Load the College data as-is and preview the first rows.
college_df = pd.read_csv(filepath_or_buffer='data/College.csv')
college_df.head(n=5)

(b) Look at the data used in the notebook by creating and running a new cell with just the code `college` in it. You should notice that the first column is just the name of each university in a column named something like `Unnamed: 0`.

```python
college2 = pd.read_csv('College.csv', index_col=0)
college3 = college.rename({'Unnamed: 0': 'College'}, axis=1)
college3 = college3.set_index('College')
```

In [None]:
# Variant 1: let `read_csv` use the first column as the row index directly.
college2_df = pd.read_csv(filepath_or_buffer='data/College.csv', index_col=0)
college2_df.head(n=5)

In [None]:
# Variant 2: give the unnamed first column a proper name, then make it the index.
college3_df = (
    college_df
    .rename(columns={'Unnamed: 0': 'College'})
    .set_index('College')
)
college3_df.head()

In [None]:
# Continue with the name-indexed frame. This rebinds the name without copying,
# so later in-place changes are visible through both `college_df` and `college3_df`.
college_df = college3_df

(c) Use the `describe()` method of `college` to produce a numerical summary of the variables in the data set.

In [None]:
# Numerical summary of the variables in the data set.
college_df.describe()

(d) Use the `pd.plotting.scatter_matrix()` function to produce a scatterplot matrix of the first columns `[Top10perc, Apps, Enroll]`. Recall that you can reference a list `C` of columns of a data frame `A` using `A[C]`.

In [None]:
# pandas alternative named in the exercise, kept for reference:
# fig, ax = plt.subplots(3, 3)
# pd.plotting.scatter_matrix(college_df[['Top10perc', 'Apps', 'Enroll']], ax=ax)
# fig.tight_layout()

# seaborn's pairplot is used here to draw the pairwise plots of the three columns.
sns.pairplot(data=college_df[['Top10perc', 'Apps', 'Enroll']])

(e) Use the `boxplot()` method of `college` to produce side-by-side boxplots of `Outstate` versus `Private`.

In [None]:
# Preview the first rows of the frame.
college_df.head()

In [None]:
# Side-by-side box plots of `Outstate`, one box per `Private` category.
fig, ax = plt.subplots()
sns.boxplot(data=college_df, x='Private', y='Outstate', ax=ax)
fig.tight_layout()

(f) Create a new qualitative variable, called `Elite`, by *binning* the `Top10perc` variable into two groups based on whether or not the proportion of students coming from the top 10% of their high school classes exceeds 50%.

In [None]:
# Bin `Top10perc` into two groups: 'Yes' when it exceeds 50, 'No' otherwise.
college_df['Elite'] = np.where(college_df['Top10perc'] > 50, 'Yes', 'No')

Use the `value_counts()` method of `college['Elite']` to see how many elite universities there are. Finally, use the `boxplot()` method again to produce side-by-side boxplots of `Outstate` versus `Elite`.

In [None]:
# Count the colleges in each `Elite` group, as the exercise asks, before plotting.
print(college_df['Elite'].value_counts())

# Side-by-side box plots of `Outstate`, one box per `Elite` category.
fig, ax = plt.subplots()
sns.boxplot(ax=ax, x='Elite', y='Outstate', data=college_df)
fig.tight_layout()

(g) Use the `plot.hist()` method of `college` to produce some histograms with differing numbers of bins for a few of the quantitative variables.

In [None]:
# Split the College columns into quantitative (numeric) and qualitative (object) features.
# 'number' selects every numeric dtype, so integer or float widths that a
# hand-written list would omit (int8, unsigned ints, ...) are not silently missed.
numeric_types = ['number']
object_types = ['object']
quantitative = college_df.select_dtypes(include=numeric_types).columns
qualitative = college_df.select_dtypes(include=object_types).columns
# `Index.format()` is deprecated in pandas; `tolist()` gives the same list of
# column names, and the f-string already converts it to text.
print(f'Quantitative features: {quantitative.tolist()}')
print(f'Qualitative features: {qualitative.tolist()}')

In [None]:
# Histograms of a few quantitative variables, each with a different number of
# bins as the exercise requests, laid out on a 2 x 2 grid in row-major order.
hist_bins = {'Apps': 10, 'Accept': 20, 'Enroll': 30, 'Outstate': 40}

fig, axes = plt.subplots(2, 2, figsize=(8, 8))
for hist_ax, (hist_column, n_bins) in zip(axes.flat, hist_bins.items()):
    sns.histplot(x=hist_column, data=college_df, bins=n_bins, ax=hist_ax, shrink=0.8)
fig.tight_layout()

### Exercise 9 / 66,67

(a) Which of the predictors are quantitative, and which are qualitative?

In [None]:
# NOTE(review): the commonly distributed Auto.csv marks missing `horsepower`
# entries with '?'; confirm that data/Auto.csv follows that convention.
# Parsing '?' as NaN keeps such a column numeric instead of `object`, and rows
# with missing values are dropped so every statistic uses the same observations.
auto_df = pd.read_csv('data/Auto.csv', na_values=['?']).dropna()
# Move the `name` column to the front.
first_column = auto_df.pop('name')
auto_df.insert(0, 'name', first_column)
auto_df.head()

In [None]:
# Split the Auto columns into quantitative (numeric) and qualitative (object) features.
# 'number' selects every numeric dtype, so integer or float widths that a
# hand-written list would omit (int8, unsigned ints, ...) are not silently missed.
numeric_types = ['number']
object_types = ['object']
quantitative = auto_df.select_dtypes(include=numeric_types).columns
qualitative = auto_df.select_dtypes(include=object_types).columns
# `Index.format()` is deprecated in pandas; `tolist()` gives the same list of
# column names, and the f-string already converts it to text.
print(f'Quantitative features: {quantitative.tolist()}')
print(f'Qualitative features: {qualitative.tolist()}')

(b) What is the range of each quantitative predictor? You can answer this using the `min()` and `max()` methods in `numpy`.

In [None]:
# Range of every quantitative predictor: one row per column, with its minimum and maximum.
quantitative_df = auto_df[quantitative]
auto_min_max_df = pd.concat(
    [
        quantitative_df.min().rename('min'),
        quantitative_df.max().rename('max'),
    ],
    axis=1,
)
auto_min_max_df

(c) What is the mean and standard deviation of each quantitative predictor?

In [None]:
# Mean and standard deviation of every quantitative predictor, one row per column.
auto_mean_std_df = pd.concat(
    [
        quantitative_df.mean().rename('mean'),
        quantitative_df.std().rename('std'),
    ],
    axis=1,
)
auto_mean_std_df

(d) Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?

In [None]:
# Remove a block of observations, then recompute range, mean and standard
# deviation of the quantitative predictors on the rows that remain.
# NOTE(review): the slice [10:86] drops zero-based positions 10 through 85,
# i.e. the 11th-86th rows; if "10th through 85th" is meant as 1-based ordinals
# the slice would be [9:85] - confirm the intended reading.
auto_dropped_df = auto_df.drop(index=auto_df.index[10:86])
auto_dropped_quantitative_df = auto_dropped_df[quantitative]
auto_dropped_mean_std_df = pd.DataFrame({
    stat: getattr(auto_dropped_quantitative_df, stat)()
    for stat in ('min', 'max', 'mean', 'std')
})
auto_dropped_mean_std_df