## Data Acquisition Exercises

In [16]:
import env
import acquire
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

1. Use an SQL query as the source for the iris data. Create a pandas dataframe, df_iris, from this data.

In [None]:
df = acquire.get_iris_data()

- Print the first 3 rows

In [None]:
df.head(3)

- Print the number of rows and columns (shape)

In [None]:
df.shape

- Print the column names

In [None]:
df.columns

- Print the data type of each column

In [None]:
df.dtypes

- Print the summary statistics for each of the numeric variables. Would you recommend rescaling the data based on these statistics?

In [None]:
df.describe()

**Answer:** I would not recommend rescaling the data, because the values are all on the same scale.

2. Read the Table1_CustDetails sheet from the Excel module dataset, Excel_Exercises.xlsx, into a dataframe, df_excel.

In [None]:
# sheet_name=0 loads the first worksheet by position rather than by name.
# NOTE(review): presumably the first sheet is Table1_CustDetails — confirm,
# or pass the sheet name explicitly so a reordered workbook still loads the right data.
df_excel = pd.read_excel('Excel_Exercises.xlsx', sheet_name=0)

- Assign the first 100 rows to a new dataframe, df_excel_sample

In [None]:
df_excel_sample = df_excel[:100]

- Print the number of rows of your original dataframe

In [None]:
len(df_excel.index)

- Print the first 5 column names

In [None]:
list(df_excel.columns[:5])

- Print the column names that have a data type of object

In [None]:
# Names of the columns stored with the generic `object` dtype (typically text).
# The previous name, `num_cols`, suggested numeric columns, and the cell only
# assigned the list without displaying it; the bare last expression shows it.
object_cols = df_excel.select_dtypes(include='object').columns.tolist()
object_cols

- Compute the range for each of the numeric variables.

In [None]:
# describe() lists min and max separately but never their difference, so the
# range (max - min) is computed explicitly, restricted to numeric columns.
numeric_cols = df_excel.select_dtypes(include='number')
numeric_range = numeric_cols.max() - numeric_cols.min()
numeric_range

3. Read the data from this google sheet into a dataframe, df_google

In [None]:
# Browser ("edit") URL of the source Google Sheet.
# NOTE(review): the gid value presumably identifies a single tab — confirm it is the intended one.
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'

# Rewrite the edit fragment into the CSV export form so the sheet can be read as plain CSV.
csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

# Fetches the CSV over the network; requires the sheet to be reachable from this machine.
df_googlesheet = pd.read_csv(csv_export_url)

- Print the first 3 rows

In [None]:
df_googlesheet.head(3)

- Print the number of rows and columns

In [None]:
df_googlesheet.shape

- Print the column names

In [None]:
df_googlesheet.columns

- Print the data type of each column

In [None]:
df_googlesheet.info()

- Print the summary statistics for each of the numeric variables

In [None]:
df_googlesheet.describe()

- Print the unique values for each of your categorical variables

In [None]:
# For every object-dtype (categorical/text) column, print its five most
# frequent values with their counts.
categorical_names = [
    name for name in df_googlesheet
    if df_googlesheet[name].dtype == 'O'
]
for name in categorical_names:
    counts = df_googlesheet[name].value_counts()
    print(counts.head(5))

## Data Preparation Exercises

1. Iris Data

- Use the function defined in acquire.py to load the iris data.

In [8]:
from acquire import get_iris_data

iris_df = get_iris_data()
iris_df.head()

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


- Drop the species_id and measurement_id columns.

In [11]:
# Rebind instead of dropping in place, and tolerate already-missing columns,
# so re-running this cell does not raise KeyError once the ids are gone.
iris_df = iris_df.drop(columns=['species_id', 'measurement_id'], errors='ignore')

- Rename the species_name column to just species.

In [15]:
# Actually rename the column: assigning iris_df['species'] = iris_df['species_name']
# would copy it and leave the old species_name column behind as a duplicate.
iris_df = iris_df.rename(columns={'species_name': 'species'})

- Encode the species name using a sklearn label encoder. Research the inverse_transform method of the label encoder. How might this be useful?

In [24]:
# Learn the species labels and overwrite the column with their integer codes
# in one step. `encoder` is kept so the codes can be mapped back to names
# later with encoder.inverse_transform(...).
encoder = LabelEncoder()
iris_df['species'] = encoder.fit_transform(iris_df['species'])

- Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [25]:
def prep_iris(iris_df):
    """Return a prepared version of the untransformed iris data.

    Steps applied, in order:
      1. drop the ``species_id`` and ``measurement_id`` columns,
      2. rename ``species_name`` to ``species``,
      3. replace the species labels with integer codes from a LabelEncoder.

    The input frame is not modified: the work is done on a derived frame, so
    the function can be called repeatedly on the same raw data. (The earlier
    in-place version raised KeyError on a second call because the id columns
    had already been removed from the caller's frame.)

    Parameters
    ----------
    iris_df : pandas.DataFrame
        Raw iris data containing ``species_id``, ``measurement_id`` and
        ``species_name`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the transformations above applied.

    Raises
    ------
    KeyError
        If ``species_id`` or ``measurement_id`` is missing from ``iris_df``.
    """
    # drop() and rename() each return a new frame, leaving the caller's data intact.
    prepped = (
        iris_df
        .drop(columns=['species_id', 'measurement_id'])
        .rename(columns={'species_name': 'species'})
    )
    encoder = LabelEncoder()
    prepped['species'] = encoder.fit_transform(prepped['species'])
    return prepped

2. Titanic Data

- Use the function you defined in acquire.py to load the titanic data set.

In [31]:
from acquire import get_titanic_data

titanic = get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


- Handle the missing values in the embark_town and embarked columns.

In [35]:
def titanic_missing_fill(titanic):
    """Fill missing embarkation values and return the same frame.

    Missing ``embark_town`` values become ``'Other'`` and missing
    ``embarked`` values become ``'Unknown'``. The frame is updated in place
    and also returned.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Frame with ``embark_town`` and ``embarked`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``titanic`` object, with the two columns filled.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # `titanic.embark_town`: that is chained assignment on an intermediate
    # Series, which warns in recent pandas and leaves the frame unchanged
    # under Copy-on-Write.
    titanic['embark_town'] = titanic['embark_town'].fillna('Other')
    titanic['embarked'] = titanic['embarked'].fillna('Unknown')
    return titanic

- Remove the deck column.

In [39]:
def titanic_remove_columns(titanic):
    """Return a new frame without the ``deck`` column.

    The input frame is left untouched; a KeyError is raised if ``deck`` is
    not present.
    """
    unwanted = ['deck']
    without_deck = titanic.drop(columns=unwanted)
    return without_deck

- Use a label encoder to transform the embarked column.

In [42]:
def encode_titanic(titanic):
    """Label-encode the ``embarked`` column.

    A fresh LabelEncoder is fitted on ``embarked`` and the column is
    overwritten, on the frame that was passed in, with the resulting codes.

    Returns
    -------
    tuple
        ``(titanic, encoder)`` — the same frame object and the fitted
        encoder, which can map the codes back to the original labels.
    """
    label_encoder = LabelEncoder()
    titanic['embarked'] = label_encoder.fit_transform(titanic['embarked'])
    return titanic, label_encoder

- Scale the age and fare columns using a min max scaler. Why might this be beneficial? When might you not want to do this?

In [49]:
def scale_titanic(titanic):
    """Min-max scale the ``age`` and ``fare`` columns.

    A MinMaxScaler with default settings is fitted on the two columns and
    they are overwritten, on the frame that was passed in, with the scaled
    values.

    Returns
    -------
    tuple
        ``(titanic, scaler)`` — the same frame object and the fitted scaler.
    """
    scaled_columns = ['age', 'fare']
    min_max = MinMaxScaler()
    titanic[scaled_columns] = min_max.fit_transform(titanic[scaled_columns])
    return titanic, min_max

- Create a function named prep_titanic that accepts the untransformed titanic data, and returns the data with the transformations above applied.

In [64]:
def prep_titanic(titanic):
    """Run the full titanic preparation pipeline.

    Applies, in order: titanic_missing_fill, titanic_remove_columns,
    encode_titanic and scale_titanic.

    Returns
    -------
    tuple
        ``(prepared_frame, encoder, scaler)`` — the transformed data plus the
        fitted objects returned by encode_titanic and scale_titanic.
    """
    filled = titanic_missing_fill(titanic)
    trimmed = titanic_remove_columns(filled)
    encoded, encoder = encode_titanic(trimmed)
    scaled, scaler = scale_titanic(encoded)
    return scaled, encoder, scaler