In [1]:
#import and alias appropriately pandas, numpy, matplotlib.pyplot, and seaborn

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns 

In [2]:
# Read in the combined wine dataset produced in the first pandas lab.
# The URL is a single string literal split with backslash continuations, so the
# continuation lines must stay unindented to avoid embedding spaces in the URL.
# NOTE(review): the URL carries a `token=` query parameter. Access tokens should
# not be committed in a notebook -- TODO confirm this one is revoked and switch
# to a public or local path.

data = pd.read_csv("https://raw.githubusercontent.com/Zipcoder/DataEngineering.Labs.Libraries/master/\
Pandas%20-%20Part%201%20(importing%20data%2C%20summary%20stats)/Combined%20Wine%20Data.csv?\
token=ALOLXOJ73RI2TJV7EOL3DHK6HNRPQ")

In [3]:
# Display the first rows of the DataFrame to confirm the CSV loaded as expected.

data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


Let's jump into feature engineering. To quote Wikipedia:

    "Feature engineering is the process of using domain knowledge of the data to create features that make machine learning algorithms work."
    
In practical terms, feature engineering is just adding to a data set, using your domain knowledge.

To show this, let's start with a very simple example that builds on our earlier work. We'll add a column to our data set that simply indicates whether a wine has above average sweetness or not. In reality, this is probably not a super helpful feature, but you can take what you learn here and then create your own features.

Although there are certainly ways to do this in just pandas, we're going to jump into combining numpy and pandas. These two libraries are used together all the time. Many popular libraries (including pandas) are built upon numpy.

Do some reading on the <a href="https://numpy.org/doc/stable/reference/generated/numpy.where.html">np.where()</a> function before moving on.




In [4]:
# Using np.where(), add a boolean 'sweet_or_not' column: True when the row's
# residual sugar is strictly above the dataset mean, False otherwise.
# The comparison is strict (>) because the feature is defined as "above average";
# a wine sitting exactly on the mean is not above it.

mean_residual_sugar = data['residual sugar'].mean()
data['sweet_or_not'] = np.where(data['residual sugar'] > mean_residual_sugar, True, False)

In [5]:
# Sanity check: show the mean residual sugar so the True/False values in
# 'sweet_or_not' can be compared against it by eye.
data['residual sugar'].mean()

5.4432353393874156

In [6]:
# Display the first rows again to confirm the new 'sweet_or_not' column was added.

data.head()



Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color,sweet_or_not
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,False
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red,False
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red,False
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red,False
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,False


Let's get a little more complex now. Let's add a column based on multiple conditions. I want to find wines that are:

 - Ranked 6 or higher in the quality column
 - Above average sweetness
 - Above the 75th percentile in alcohol content
 - Reds only
 
Below, define the qualities you'd like to filter by.

In [7]:
# Filter criteria, defined once up front so the condition cell can refer to them by name.

color = 'red'                               # wine color to keep
sweet = True                                # required value of 'sweet_or_not'
quality = 6                                 # quality cutoff
alcohol = data['alcohol'].quantile(0.75)    # alcohol cutoff: 75th percentile of the dataset

Now let's build an array that is True/False, based on whether all the conditions we specified above are met.

Do some reading on the <a href="https://numpy.org/doc/stable/reference/generated/numpy.logical_and.html">np.logical_and()</a> function before moving on.

In [8]:
# Create a True/False Series that is True only where every filter defined above holds.
# np.logical_and combines two operands at a time, so the filters are folded in one by one.
# Color must be matched with ==: an earlier attempt used `>`, which compares the
# strings alphabetically and returned the white wines instead of the reds.

condition = np.logical_and(data['quality'] >= quality, data['sweet_or_not'] == sweet)
condition = np.logical_and(condition, data['alcohol'] > alcohol)
condition = np.logical_and(condition, data['color'] == color)

In [9]:
# Show the value counts for the array: how many rows meet (True) or fail (False)
# the combined condition.

condition.value_counts()

False    6485
True       12
dtype: int64

In [10]:
# Add a column named 'i_like_it' holding the True/False result of the combined condition.

data['i_like_it'] = condition


In [11]:
# Keep only the rows flagged in 'i_like_it' and write them to a CSV; the file is
# used in the next lab.

the_ones_i_like = data.loc[data['i_like_it']]
the_ones_i_like.to_csv('my favorite wines.csv')

In [14]:
# Display the filtered DataFrame to check the final output: every row should be a red wine.
the_ones_i_like

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color,sweet_or_not,i_like_it
278,10.3,0.32,0.45,6.4,0.073,5.0,13.0,0.9976,3.23,0.82,12.6,8,red,True,True
378,11.4,0.625,0.66,6.2,0.088,6.0,24.0,0.9988,3.11,0.99,13.3,6,red,True,True
413,9.9,0.4,0.53,6.7,0.097,6.0,19.0,0.9986,3.27,0.82,11.7,7,red,True,True
494,6.5,0.39,0.23,8.3,0.051,28.0,91.0,0.9952,3.44,0.55,12.1,6,red,True,True
501,10.4,0.44,0.73,6.55,0.074,38.0,76.0,0.999,3.17,0.85,12.0,7,red,True,True
502,10.4,0.44,0.73,6.55,0.074,38.0,76.0,0.999,3.17,0.85,12.0,7,red,True,True
538,12.9,0.35,0.49,5.8,0.066,5.0,35.0,1.0014,3.2,0.66,12.0,7,red,True,True
974,8.8,0.33,0.41,5.9,0.073,7.0,13.0,0.99658,3.3,0.62,12.1,7,red,True,True
1038,8.7,0.41,0.41,6.2,0.078,25.0,42.0,0.9953,3.24,0.77,12.6,7,red,True,True
1079,7.9,0.3,0.68,8.3,0.05,37.5,278.0,0.99316,3.01,0.51,12.3,7,red,True,True
