## Data Acquisition Exercises

In [7]:
import env
import acquire
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings(action='ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp

1. Use a SQL query as the source of the iris data. Create a pandas dataframe, df_iris, from this data.

In [None]:
# Load the iris data through the project's acquire module (its implementation is not shown here).
# NOTE(review): the exercise text asks for the name df_iris; the cells below use df.
df = acquire.get_iris_data()

- Print the first 3 rows

In [None]:
# Preview the first three rows.
df.head(3)

- Print the number of rows and columns (shape)

In [None]:
# (number of rows, number of columns)
df.shape

- Print the column names

In [None]:
# Column labels of the iris frame.
df.columns

- Print the data type of each column

In [None]:
# dtype of every column.
df.dtypes

- Print the summary statistics for each of the numeric variables. Would you recommend rescaling the data based on these statistics?

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

**Answer:** I would not recommend rescaling the data because the values are all on the same scale.

2. Read the Table1_CustDetails sheet of the Excel module dataset, Excel_Exercises.xlsx, into a dataframe, df_excel

In [None]:
# Read one worksheet of the workbook; sheet_name=0 selects the first sheet by position.
# NOTE(review): presumably the first sheet is Table1_CustDetails — confirm, or pass the sheet name instead.
df_excel = pd.read_excel('Excel_Exercises.xlsx', sheet_name=0)

- Assign the first 100 rows to a new dataframe, df_excel_sample

In [None]:
# Keep the first 100 rows (by position) as a small working sample.
df_excel_sample = df_excel.iloc[:100]

- Print the number of rows of your original dataframe

In [None]:
# Number of rows in the full (unsampled) dataframe.
df_excel.shape[0]

- Print the first 5 column names

In [None]:
# Names of the first five columns, as a plain list.
df_excel.columns[:5].tolist()

- Print the column names that have a data type of object

In [None]:
# Columns whose dtype is object (typically strings). The previous name, num_cols,
# suggested numeric columns, and the bare assignment displayed nothing.
object_cols = df_excel.select_dtypes(include='object').columns.tolist()
object_cols

- Compute the range for each of the numeric variables.

In [None]:
# Range = max - min, computed per numeric column. describe() reports min and max
# separately but never the range itself.
numeric_excel = df_excel.select_dtypes(include='number')
numeric_excel.max() - numeric_excel.min()

3. Read the data from this google sheet into a dataframe, df_google

In [None]:
# Browser ("edit") URL of the shared Google Sheet.
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'

# Swap the edit suffix for the CSV export endpoint so pandas can read the sheet directly.
csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

# NOTE(review): the exercise text asks for the name df_google; the cells below use df_googlesheet.
df_googlesheet = pd.read_csv(csv_export_url)

- Print the first 3 rows

In [None]:
# Preview the first three rows.
df_googlesheet.head(3)

- Print the number of rows and columns

In [None]:
# (number of rows, number of columns)
df_googlesheet.shape

- Print the column names

In [None]:
# Column labels of the Google Sheet frame.
df_googlesheet.columns

- Print the data type of each column

In [None]:
# info() lists each column's dtype together with its non-null count.
df_googlesheet.info()

- Print the summary statistics for each of the numeric variables

In [None]:
# Summary statistics of the numeric columns.
df_googlesheet.describe()

- Print the unique values for each of your categorical variables

In [None]:
# Print the distinct values of every object-dtype (categorical/text) column.
# The earlier version printed the five most frequent values with their counts,
# which is not the set of unique values. Output is capped per column so that
# high-cardinality columns do not flood the notebook.
MAX_UNIQUE_SHOWN = 10

for col in df_googlesheet.select_dtypes(include='object'):
    unique_values = df_googlesheet[col].unique()
    print(f'{col}: {len(unique_values)} unique values')
    print(unique_values[:MAX_UNIQUE_SHOWN])

## Data Preparation Exercises

1. Iris Data

- Use the function defined in acquire.py to load the iris data.

In [None]:
from acquire import get_iris_data

# Load a fresh copy of the raw iris data for the preparation steps, then preview it.
iris_df = get_iris_data()
iris_df.head()

- Drop the species_id and measurement_id columns.

In [None]:
# Remove the two id columns from iris_df in place.
iris_df.drop(['species_id', 'measurement_id'], axis=1, inplace=True)

- Rename the species_name column to just species.

In [None]:
# Rename rather than copy: assigning iris_df['species'] = iris_df['species_name']
# would leave the original species_name column behind.
iris_df = iris_df.rename(columns={'species_name': 'species'})

- Encode the species name using a sklearn label encoder. Research the inverse_transform method of the label encoder. How might this be useful?

In [None]:
# Fit the label encoder on the species names and replace them with integer codes
# in one step. The fitted encoder is kept because encoder.inverse_transform(codes)
# maps the integers back to the original species names.
encoder = LabelEncoder()
iris_df['species'] = encoder.fit_transform(iris_df['species'])

- Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [None]:
def prep_iris(iris_df):
    """Return a prepared copy of the raw iris data.

    Steps applied:
      1. drop the ``species_id`` and ``measurement_id`` columns,
      2. rename ``species_name`` to ``species``,
      3. label-encode ``species`` into integer codes.

    The input frame is left untouched, so the function can be called again
    on the same raw frame without raising ``KeyError``.

    Parameters
    ----------
    iris_df : pandas.DataFrame
        Raw iris data containing ``species_id``, ``measurement_id`` and
        ``species_name`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the transformations applied.

    Notes
    -----
    The fitted encoder is local to this function and is not returned, so the
    integer codes cannot be mapped back with ``inverse_transform`` by the caller.
    """
    prepped = (
        iris_df
        .drop(columns=['species_id', 'measurement_id'])
        .rename(columns={'species_name': 'species'})
    )
    encoder = LabelEncoder()
    prepped['species'] = encoder.fit_transform(prepped['species'])
    return prepped

2. Titanic Data

- Use the function you defined in acquire.py to load the titanic data set.

In [None]:
from acquire import get_titanic_data

# Load the raw titanic data via the project's acquire module.
titanic = get_titanic_data()

- Handle the missing values in the embark_town and embarked columns.

In [None]:
def titanic_missing_fill(titanic):
    """Return a copy of ``titanic`` with missing embarkation values filled.

    Missing ``embark_town`` values become ``'Other'`` and missing ``embarked``
    values become ``'Unknown'``. All other columns are left as they are.

    The fill is done on a new frame instead of with
    ``titanic.<col>.fillna(..., inplace=True)``: that chained in-place form is
    deprecated and has no effect under pandas copy-on-write, and where it does
    work it silently modifies the caller's frame.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Frame with ``embark_town`` and ``embarked`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the two columns filled.
    """
    return titanic.assign(
        embark_town=titanic['embark_town'].fillna('Other'),
        embarked=titanic['embarked'].fillna('Unknown'),
    )

- Remove the deck column.

In [None]:
def titanic_remove_columns(titanic):
    """Return a new frame equal to ``titanic`` minus its ``deck`` column.

    The input frame is not modified. Raises ``KeyError`` if ``deck`` is absent.
    """
    without_deck = titanic.drop('deck', axis=1)
    return without_deck

- Use a label encoder to transform the embarked column.

In [None]:
def encode_titanic(titanic):
    """Label-encode the ``embarked`` column.

    Works on a new frame so the caller's data is not modified.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Frame with an ``embarked`` column. Missing values should be filled
        first (see ``titanic_missing_fill``).

    Returns
    -------
    (pandas.DataFrame, LabelEncoder)
        The frame with ``embarked`` replaced by integer codes, and the fitted
        encoder, whose ``inverse_transform`` recovers the original labels.
    """
    encoder = LabelEncoder()
    encoded = titanic.assign(embarked=encoder.fit_transform(titanic['embarked']))
    return encoded, encoder

- Scale the age and fare columns using a min max scaler. Why might this be beneficial? When might you not want to do this?

In [None]:
def scale_titanic(titanic):
    """Min-max scale the ``age`` and ``fare`` columns.

    Works on a copy so the caller's data is not modified.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Frame with numeric ``age`` and ``fare`` columns.

    Returns
    -------
    (pandas.DataFrame, MinMaxScaler)
        The frame with both columns rescaled using MinMaxScaler's default
        feature range, and the fitted scaler (needed to apply the same
        scaling to other data or to undo it with ``inverse_transform``).
    """
    scaled_columns = ['age', 'fare']
    scaler = MinMaxScaler()
    scaled = titanic.copy()
    scaled[scaled_columns] = scaler.fit_transform(scaled[scaled_columns])
    return scaled, scaler

- Create a function named prep_titanic that accepts the untransformed titanic data, and returns the data with the transformations above applied.

In [None]:
def prep_titanic(titanic):
    """Run the full titanic preparation pipeline.

    Applies, in order: ``titanic_missing_fill``, ``titanic_remove_columns``,
    ``encode_titanic`` and ``scale_titanic``.

    Returns
    -------
    tuple
        ``(prepared_frame, encoder, scaler)`` where ``encoder`` and ``scaler``
        are the fitted objects returned by the encode and scale steps.
    """
    cleaned = titanic_remove_columns(titanic_missing_fill(titanic))
    encoded, encoder = encode_titanic(cleaned)
    scaled, scaler = scale_titanic(encoded)
    return scaled, encoder, scaler

## Exploratory Analysis Exercises

- Use the iris dataset.

In [1]:
from acquire import get_iris_data
from prepare import prep_iris
from split_scale import split_my_data

# Load the raw iris data for the exploration exercises and preview it.
iris = get_iris_data()
iris.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1685 entries, 0 to 1684
Data columns (total 4 columns):
customer_id        1685 non-null object
monthly_charges    1685 non-null float64
tenure             1685 non-null int64
total_charges      1685 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 65.8+ KB


Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


In [2]:
def prep_iris(iris_df):
    """Return a prepared copy of the raw iris data, keeping species as text.

    Drops the ``species_id`` and ``measurement_id`` columns and renames
    ``species_name`` to ``species``. Species is intentionally NOT
    label-encoded here.

    The input frame is left untouched, so this function can be re-run on the
    same raw frame without raising ``KeyError``.

    NOTE(review): this definition shadows the ``prep_iris`` imported from
    ``prepare`` earlier in the notebook.

    Parameters
    ----------
    iris_df : pandas.DataFrame
        Raw iris data containing ``species_id``, ``measurement_id`` and
        ``species_name`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the id columns removed and ``species_name`` renamed.
    """
    return (
        iris_df
        .drop(columns=['species_id', 'measurement_id'])
        .rename(columns={'species_name': 'species'})
    )

In [3]:
# Apply the prep_iris defined in the previous cell (drops the id columns, renames species_name).
iris_data = prep_iris(iris)

1. Split data into train (70%) & test (30%) samples.

In [4]:
# split_my_data comes from the project's split_scale module (not shown here).
# NOTE(review): presumably .70 is the train fraction — confirm, and confirm whether the split is seeded.
iris_train, iris_test  = split_my_data(iris_data, .70)

2. Create a swarmplot using a melted dataframe of all your numeric variables. The x-axis is the variable name, the y-axis is the measure. Add another dimension using color to represent species. Document takeaways from this visualization.

In [5]:
# Preview the training split.
iris_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
114,5.8,2.8,5.1,2.4,virginica
136,6.3,3.4,5.6,2.4,virginica
53,5.5,2.3,4.0,1.3,versicolor
19,5.1,3.8,1.5,0.3,setosa
38,4.4,3.0,1.3,0.2,setosa


In [None]:
sns.set(style="whitegrid", palette="muted")

# Reshape to long form: one row per (species, variable, value) observation,
# which is the layout swarmplot needs to put the variable name on the x-axis.
measurement_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df_melt = iris_train[measurement_cols + ['species']].melt(
    id_vars='species', var_name='variables')

fig, ax = plt.subplots(figsize=(8, 6))

# One point per observation, colored by species.
p = sns.swarmplot(x="variables", y="value", hue='species',
                  palette=["r", "c", "y"], data=df_melt, ax=ax)
ax.set(title='Iris measurements by species (train split)',
       xlabel='variable', ylabel='measure')

plt.show()

3. Create 4 subplots (2 rows x 2 columns) of scatterplots

sepal_length x sepal_width

petal_length x petal_width

sepal_area x petal_area

sepal_length x petal_length

What are your takeaways?

In [None]:
# Create new columns for sepal_area and petal_area

In [None]:
# Add the area features on a new frame instead of assigning columns onto the
# split result: column assignment on a slice can trigger SettingWithCopyWarning
# (hidden here because warnings are silenced at the top of the notebook).
iris_train = iris_train.assign(
    sepal_area=iris_train.sepal_length * iris_train.sepal_width,
    petal_area=iris_train.petal_length * iris_train.petal_width,
)

In [None]:
# 2 x 2 grid of scatterplots, one (x, y) pair per panel, colored by species.
# The last pair is sepal_length x petal_length as the exercise asks; it was
# previously plotted against petal_width.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 8))

panels = [
    ('sepal_length', 'sepal_width'),
    ('petal_length', 'petal_width'),
    ('sepal_area', 'petal_area'),
    ('sepal_length', 'petal_length'),
]

# Draw on the axes created above rather than re-creating them with plt.subplot.
for panel_ax, (x_col, y_col) in zip(axes.flatten(), panels):
    sns.scatterplot(x=x_col, y=y_col, hue='species', data=iris_train, ax=panel_ax)
    panel_ax.set_title(f'{x_col} x {y_col}')

fig.tight_layout()
plt.show()

4. Create a heatmap of each variable layering correlation coefficient on top

In [None]:
# Correlate only the numeric columns: species holds strings, and
# DataFrame.corr() raises on non-numeric columns in pandas 2.x.
numeric_train = iris_train.select_dtypes(include='number')

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(numeric_train.corr(), cmap='Reds', annot=True, ax=ax)
ax.set_title('Pairwise correlation of numeric iris_train columns');

5. Create a scatter matrix visualizing the interaction of each variable

In [None]:
from matplotlib import cm
from matplotlib.ticker import FormatStrFormatter

# Integer code per species so the colormap has data to map. Passing cmap
# without c has no effect on the points.
species_codes = iris_train['species'].astype('category').cat.codes.to_numpy()

# The colormap is passed by name because cm.get_cmap is deprecated and was
# removed in recent Matplotlib releases.
axes = pd.plotting.scatter_matrix(
    iris_train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']],
    c=species_codes, cmap='gnuplot', marker='o', s=40,
    hist_kwds={'bins': 15}, figsize=(14, 9))

# Two-decimal tick labels on every panel's y-axis.
for ax in axes.flatten():
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

6. Is the sepal length significantly different in virginica compared to versicolor? Run an experiment to test this.

- Must include null hypothesis, alternative hypothesis, t-test, results, summary
- $H_0$: the difference in sepal length between virginica and versicolor is insignificant.
- $H_a$: the difference in sepal length between virginica and versicolor is substantial.
- We will test if the sepal length of virginica is significantly different than that of the versicolor.
- If there is difference, then variable sepal_length is a good choice to keep as a feature.
- We can use a t-test here, as sepal_length is somewhat normally distributed.

In [9]:
# Import the submodule explicitly: `import scipy as sp` alone does not
# guarantee that `sp.stats` is loaded.
from scipy import stats

# Sepal lengths of the two species being compared, missing values removed.
virginica_sepal_length = iris_train.loc[
    iris_train.species == 'virginica', 'sepal_length'].dropna()
versicolor_sepal_length = iris_train.loc[
    iris_train.species == 'versicolor', 'sepal_length'].dropna()

# Independent two-sample t-test (virginica vs. versicolor).
stats.ttest_ind(virginica_sepal_length, versicolor_sepal_length)

Ttest_indResult(statistic=4.232219188580116, pvalue=6.811720108343317e-05)

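Results: t ≈ 4.23, p ≈ 6.8e-05. Summary: because the p-value is far below the conventional 0.05 significance level, we reject the null hypothesis that there is no significant difference between sepal_length from Virginica and Versicolor species.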