In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('house_rental_data.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,Sqft,Floor,TotalFloor,Bedroom,Living.Room,Bathroom,Price
0,1,1177.698,2,7,2,2,2,62000
1,2,2134.8,5,7,4,2,2,78000
2,3,1138.56,5,7,2,2,1,58000
3,4,1458.78,2,7,3,2,2,45000
4,5,967.776,11,14,3,2,2,45000


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 645 entries, 0 to 644
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   645 non-null    int64  
 1   Sqft         645 non-null    float64
 2   Floor        645 non-null    int64  
 3   TotalFloor   645 non-null    int64  
 4   Bedroom      645 non-null    int64  
 5   Living.Room  645 non-null    int64  
 6   Bathroom     645 non-null    int64  
 7   Price        645 non-null    int64  
dtypes: float64(1), int64(7)
memory usage: 40.4 KB


# CREATING A DATAFRAME

In [5]:
# using a dictionary
# drawback is columns can be saved in any order

dataframe1 = pd.DataFrame({
    'Product_id' : [1, 2, 3, 4],
    'name' : ['apple', 'ball', 'cat', 'dog'],
    'Price' : [25, 35, 45, 55]
})

In [6]:
print(dataframe1)

   Product_id   name  Price
0           1  apple     25
1           2   ball     35
2           3    cat     45
3           4    dog     55


In [7]:
# creating dataframe using a list
#advantage - columns are in order as we set

dataframe2 = pd.DataFrame([
    [1, 'apple', 25],
    [2, 'ball', 35],
    [3, 'cat', 45],
    [4, 'dog', 55]
],
columns = ['product_id', 'name', 'price'])

In [8]:
print(dataframe2)

   product_id   name  price
0           1  apple     25
1           2   ball     35
2           3    cat     45
3           4    dog     55


# SELECTING COLUMNS

In [9]:
#One way to select is using dot 
# dataframe_name.column_name

price = data.Price
print(price)

0      62000
1      78000
2      58000
3      45000
4      45000
       ...  
640    45000
641    24800
642    45000
643    65000
644    36000
Name: Price, Length: 645, dtype: int64


In [10]:
#another way is by Selecting the column as if you were selecting a value from a dictionary using a key
#dataframe_name['column_name']

sqft = data['Sqft']
print(sqft)

0      1177.698
1      2134.800
2      1138.560
3      1458.780
4       967.776
         ...   
640    1359.156
641     377.148
642     740.064
643    1707.840
644    1376.946
Name: Sqft, Length: 645, dtype: float64


In [11]:
#check type of sqft - we will notice it is not DataFrame instead it is a Series

print(type(sqft))


<class 'pandas.core.series.Series'>


How is a Pandas series different from a dataframe?

In Pandas a series is a one-dimensional object that contains any type of data, similar in ways to a Numpy array.
A dataframe is a two-dimensional object that can hold multiple columns of different types of data. They are similar to a table in SQL

In [12]:
#selecting multiple columns
# Note: *Make sure that you have a double set of brackets ([[]]), or this command won’t work!

price_sqft = data[['Price', 'Sqft']]
print(price_sqft)

     Price      Sqft
0    62000  1177.698
1    78000  2134.800
2    58000  1138.560
3    45000  1458.780
4    45000   967.776
..     ...       ...
640  45000  1359.156
641  24800   377.148
642  45000   740.064
643  65000  1707.840
644  36000  1376.946

[645 rows x 2 columns]


In [13]:
print(type(price_sqft))

<class 'pandas.core.frame.DataFrame'>


Can we select columns of a Pandas dataframe in any order?

Answer

You certainly can! When selecting multiple columns from a DataFrame, you can order the columns however you would like them to appear. This is particularly useful because if we wanted to see the data in a certain way different from the original column order, we can reorder them in the output however we need.

# SELECTING ROWS

In [14]:
#Purely integer-location based indexing for selection by position.
#Selecting a single row

data.iloc[2]

Unnamed: 0         3.00
Sqft            1138.56
Floor              5.00
TotalFloor         7.00
Bedroom            2.00
Living.Room        2.00
Bathroom           1.00
Price          58000.00
Name: 2, dtype: float64

In [15]:
print(type(data.iloc[2]))

<class 'pandas.core.series.Series'>


How come the result of the .iloc is not a list? Why does it show a series?

When using iloc in Pandas, it will return a specified row, but in the form of a Series.

The left column in the result is composed of indexes which are the column names from the dataframe. And the right column is composed of the values for each column in the row of data.

This is a helpful feature in Pandas because it displays all the information about the columns and values of a row in a clear way.

In [16]:
#Selecting multiple rows

multiple_rows = data.iloc[-3: ]
print(multiple_rows)

     Unnamed: 0      Sqft  Floor  TotalFloor  Bedroom  Living.Room  Bathroom  \
642         646   740.064     13          14        1            1         1   
643         647  1707.840      3          14        3            2         2   
644         648  1376.946      6           7        3            2         1   

     Price  
642  45000  
643  65000  
644  36000  


In [17]:
print(type(multiple_rows))

<class 'pandas.core.frame.DataFrame'>


# SELECTING ROWS WITH LOGIC

In [18]:
##### Selecting rows with Logic 1
# You can select a subset of a DataFrame by using logical statements:
# df[df.MyColumnName == desired_column_value]

logic1 = data[data.Bathroom == 5]
print(logic1)

     Unnamed: 0      Sqft  Floor  TotalFloor  Bedroom  Living.Room  Bathroom  \
182         185  3255.570      4           7        4            2         5   
248         251  3255.570      4           7        4            2         5   
391         395  3255.570      4           7        4            2         5   
572         576  5856.468      7          21        6            2         5   

      Price  
182  130000  
248  120000  
391  120000  
572  180000  


In [19]:
# Selecting rows with Logic 2
# You can also combine multiple logical statements, as long as each statement is in parentheses.
# In Python, | means “or” and & means “and”.
# df[(df.MyColumnName == desired_column_value) | or & (df.MyColumnName == desired_column_value)]
logic2 = data[(data.Bathroom == 5) & (data.TotalFloor == 21)]
print(logic2)

     Unnamed: 0      Sqft  Floor  TotalFloor  Bedroom  Living.Room  Bathroom  \
572         576  5856.468      7          21        6            2         5   

      Price  
572  180000  


In [20]:
print(type(logic2))

<class 'pandas.core.frame.DataFrame'>


In [21]:
# Selecting rows with logic 3
# We could use the isin command to check that df.name is one of a list of values:

## df[df.name_of_column.isin(['value1','value2','value3'.....])] 
# *important NOTE* ---  .isin([]) only takes LIST as an argument

logic3 = data[data['Unnamed: 0'].isin([185,90,80])]
print(logic3)

     Unnamed: 0      Sqft  Floor  TotalFloor  Bedroom  Living.Room  Bathroom  \
79           80   914.406      2           4        2            2         1   
89           90  2312.700      7          15        4            2         2   
182         185  3255.570      4           7        4            2         5   

      Price  
79    37000  
89    70000  
182  130000  


# SETTING INDICES (INDEX)

When we select a subset of a DataFrame using logic, we end up with non-consecutive indices. This is inelegant and makes it hard to use .iloc().

We can fix this using the method .reset_index(). For example, here is a DataFrame called df with non-consecutive indices:
If we use the command df.reset_index(), we get a new DataFrame with a new set of indices

In [22]:
logic3.reset_index()

Unnamed: 0.1,index,Unnamed: 0,Sqft,Floor,TotalFloor,Bedroom,Living.Room,Bathroom,Price
0,79,80,914.406,2,4,2,2,1,37000
1,89,90,2312.7,7,15,4,2,2,70000
2,182,185,3255.57,4,7,4,2,5,130000


Note that the old indices have been moved into a new column called 'index'. Unless you need those values for something special, it’s probably better to use the keyword drop=True so that you don’t end up with that extra column. If we run the command df.reset_index(drop=True), we get a new DataFrame that looks like this:

In [23]:
logic3.reset_index(drop=True)

Unnamed: 0.1,Unnamed: 0,Sqft,Floor,TotalFloor,Bedroom,Living.Room,Bathroom,Price
0,80,914.406,2,4,2,2,1,37000
1,90,2312.7,7,15,4,2,2,70000
2,185,3255.57,4,7,4,2,5,130000


Using .reset_index() will return a new DataFrame, but we usually just want to modify our existing DataFrame. If we use the keyword inplace=True we can just modify our existing DataFrame.

In [24]:
logic3.reset_index(inplace = True, drop=True)

# ADDING A COLUMN

In [25]:
# Recall the DataFrame - dataframe1 we created in the beginning:
print(dataframe1)

   Product_id   name  Price
0           1  apple     25
1           2   ball     35
2           3    cat     45
3           4    dog     55


In [26]:
# Adding a Column I:
# One way that we can add a new column is by giving a list of the same length as the existing DataFrame.

dataframe1['cost'] = [1,2,3,4]
print(dataframe1)

   Product_id   name  Price  cost
0           1  apple     25     1
1           2   ball     35     2
2           3    cat     45     3
3           4    dog     55     4


We can see a new column 'cost' is added to the DataFrame

In [27]:
# Adding a Column 2:
# We can also add a new column that is the same for all rows in the DataFrame

dataframe1['email'] = 'product@email'
print(dataframe1)

   Product_id   name  Price  cost          email
0           1  apple     25     1  product@email
1           2   ball     35     2  product@email
2           3    cat     45     3  product@email
3           4    dog     55     4  product@email


In [28]:
# Adding a Column 3:
# we can add a new column by performing a function on the existing columns

dataframe1['tax'] = dataframe1.Price * .05
print(dataframe1)

   Product_id   name  Price  cost          email   tax
0           1  apple     25     1  product@email  1.25
1           2   ball     35     2  product@email  1.75
2           3    cat     45     3  product@email  2.25
3           4    dog     55     4  product@email  2.75


# PERFORMING COLUMN OPERATIONS

Often, the column that we want to add is related to existing columns, but requires a calculation more complex than multiplication or addition.

We can use the apply function to apply a function to every value in a particular column.

In [29]:
#from string import upper
#dataframe1['upper_name'] = dataframe1.name.apply(upper)
# fix this problem

# LAMBDA FUNCTION

A lambda function is a way of defining a function in a single line of code. Usually, we would assign them to a variable.

mylambda = lambda x: (x * 2) + 3

print(mylambda(5))

The output: 13

In [30]:
# lambda function mylambda that returns the first and last letters of a string

mylambda = lambda x: x[0]+x[-1]
print(mylambda('Hello World'))

Hd


In [31]:
# We can make our lambdas more complex by using a modified form of an if statement.
# n general, the syntax for an if function in a lambda function is:
# lambda x: [OUTCOME IF TRUE] if [CONDITIONAL] else [OUTCOME IF FALSE]


In [32]:
# Applying Lambda to a column
split = lambda x : x.split('@')[-1]
dataframe1['split_email'] = dataframe1.email.apply(split)
print(dataframe1)

   Product_id   name  Price  cost          email   tax split_email
0           1  apple     25     1  product@email  1.25       email
1           2   ball     35     2  product@email  1.75       email
2           3    cat     45     3  product@email  2.25       email
3           4    dog     55     4  product@email  2.75       email


In [33]:
# Applying lamda to row
# inside apply it is mandatory to set axis = 1 or else code wont work

with_tax = lambda row : row['Price'] * row['tax'] if row['Price'] > 40 else row['Price']

dataframe1['price_with_tax'] = dataframe1.apply(with_tax, axis = 1)

print(dataframe1)

   Product_id   name  Price  cost          email   tax split_email  \
0           1  apple     25     1  product@email  1.25       email   
1           2   ball     35     2  product@email  1.75       email   
2           3    cat     45     3  product@email  2.25       email   
3           4    dog     55     4  product@email  2.75       email   

   price_with_tax  
0           25.00  
1           35.00  
2          101.25  
3          151.25  


# Renaming Columns

We can change all of the column names at once by setting the .columns property to a different list. This is great when you need to change all of the column names at once, but be careful! You can easily mislabel columns if you get the ordering wrong

In [34]:
# renaming the columns of dataframe2
# but first lets recall the coulmn name

print(dataframe2)

   product_id   name  price
0           1  apple     25
1           2   ball     35
2           3    cat     45
3           4    dog     55


In [35]:
dataframe2.columns = ['Product_ID', 'Product_Name', 'Product_Price']
print(dataframe2)

   Product_ID Product_Name  Product_Price
0           1        apple             25
1           2         ball             35
2           3          cat             45
3           4          dog             55


In [36]:
dataframe2.rename(columns = {
    'Product_Name' : 'Name'
}, inplace = True)

print(dataframe2)

   Product_ID   Name  Product_Price
0           1  apple             25
1           2   ball             35
2           3    cat             45
3           4    dog             55


# AGGREGATES IN PANDAS

This lesson you will learn about aggregates in Pandas. An aggregate statistic is a way of creating a single number that describes a group of numbers. Common aggregate statistics include mean, median, or standard deviation.

You will also learn how to rearrange a DataFrame into a pivot table, which is a great way to compare data across two dimensions

We will learn how to combine all of the values from a column for a single calculation.

The general syntax for these calculations is:

df.column_name.command()


In [37]:
# before performing lets recall our data DataFrame

data.head()

Unnamed: 0.1,Unnamed: 0,Sqft,Floor,TotalFloor,Bedroom,Living.Room,Bathroom,Price
0,1,1177.698,2,7,2,2,2,62000
1,2,2134.8,5,7,4,2,2,78000
2,3,1138.56,5,7,2,2,1,58000
3,4,1458.78,2,7,3,2,2,45000
4,5,967.776,11,14,3,2,2,45000


In [38]:
print(data.Floor.unique())

[ 2  5 11 10  4  9  6  3 19  7 21  8  1 12 13 18 14 16 15 22 17]


In [39]:
print(data.Price.max())

250000


GROUPBY FUNCTION

In [40]:
total_floor = data.groupby('TotalFloor').Price.max()
print(total_floor)

TotalFloor
1      12000
3      32500
4     150000
5     100000
6     180000
7     168000
8      63000
9     160000
10    120000
11    185000
12    150000
13    206000
14    250000
15    180000
16    200000
17    100000
18    135000
19    200000
20     55000
21    180000
24    145000
25    110000
26     19000
27     60000
38    225000
Name: Price, dtype: int64


In [41]:
print(type(total_floor))

<class 'pandas.core.series.Series'>


In [82]:
data.groupby([data.TotalFloor,data.Bathroom]).Price.sum()

TotalFloor  Bathroom
1           1            12000
3           1            32500
4           1           574099
            2           748998
            3           192000
                         ...  
25          1            65000
            2           854000
26          1            19000
27          2           117000
38          3           225000
Name: Price, Length: 68, dtype: int64

RESET INDEX

In [42]:
total_floor = data.groupby('TotalFloor').Price.max().reset_index()
print(total_floor)

    TotalFloor   Price
0            1   12000
1            3   32500
2            4  150000
3            5  100000
4            6  180000
5            7  168000
6            8   63000
7            9  160000
8           10  120000
9           11  185000
10          12  150000
11          13  206000
12          14  250000
13          15  180000
14          16  200000
15          17  100000
16          18  135000
17          19  200000
18          20   55000
19          21  180000
20          24  145000
21          25  110000
22          26   19000
23          27   60000
24          38  225000


In [43]:
# sometimes we want to rename the column name for proper appropriation

total_floor = total_floor.rename(columns = {
    'Price' : 'Cost'
})
print(total_floor)

    TotalFloor    Cost
0            1   12000
1            3   32500
2            4  150000
3            5  100000
4            6  180000
5            7  168000
6            8   63000
7            9  160000
8           10  120000
9           11  185000
10          12  150000
11          13  206000
12          14  250000
13          15  180000
14          16  200000
15          17  100000
16          18  135000
17          19  200000
18          20   55000
19          21  180000
20          24  145000
21          25  110000
22          26   19000
23          27   60000
24          38  225000


In [44]:
# Now if we print the type we will see it is a DataFrame after using the .reset_index()

print(type(total_floor))

<class 'pandas.core.frame.DataFrame'>


Calculating Aggregate Functions III

Sometimes, the operation that you want to perform is more complicated than mean or count. In those cases, you can use the apply method and lambda functions, just like we did for individual column operations. Note that the input to our lambda function will always be a list of values.

In [45]:
import numpy as np

In [46]:
# np.percentile can calculate any percentile over an array of values

high_prices = data.groupby('TotalFloor').Price.apply(lambda x: np.percentile(x, 75)).reset_index()

In [47]:
print(high_prices)

    TotalFloor      Price
0            1   12000.00
1            3   32500.00
2            4   45249.75
3            5   45750.00
4            6   82000.00
5            7   75000.00
6            8   56500.00
7            9   58750.00
8           10   82500.00
9           11   50000.00
10          12   65000.00
11          13   93499.25
12          14   77499.25
13          15  102500.00
14          16   86888.00
15          17   77500.00
16          18   83750.00
17          19  180000.00
18          20   55000.00
19          21  139500.00
20          24  145000.00
21          25   98000.00
22          26   19000.00
23          27   59250.00
24          38  225000.00


Calculating Aggregate Functions IV
Sometimes, we want to group by more than one column. We can easily do this by passing a list of column names into the groupby method.

In [48]:
random_data = data.groupby(['Living.Room', 'Bathroom']).Price.max().reset_index()

In [49]:
print(random_data)

    Living.Room  Bathroom   Price
0             0         0   85000
1             0         1   67000
2             1         1   89000
3             1         2  206000
4             2         1   60000
5             2         2  250000
6             2         3  225000
7             2         4  180000
8             2         5  180000
9             3         2   46000
10            3         3  150000
11            3         4  180000
12            4         4  180000


# PIVOT TABLE

In [50]:
count_room = data.groupby(['Floor', 'Bathroom'])['Unnamed: 0'].count().reset_index()

In [51]:
print(count_room)

    Floor  Bathroom  Unnamed: 0
0       1         1           3
1       1         2          11
2       1         3           7
3       1         4           1
4       2         1          32
5       2         2          66
6       2         3           4
7       2         4           1
8       3         1          19
9       3         2          42
10      3         3           8
11      4         1          17
12      4         2          52
13      4         3           2
14      4         4           1
15      4         5           3
16      5         1          34
17      5         2          52
18      5         3           2
19      6         1          20
20      6         2          51
21      6         3           4
22      7         1          20
23      7         2          17
24      7         3           4
25      7         5           1
26      8         1          16
27      8         2          19
28      8         3           1
29      9         1           7
30      

In [52]:
pivot_col = count_room.pivot(columns = 'Floor',
                            index = 'Bathroom',
                            values = 'Unnamed: 0').reset_index()

In [53]:
print(pivot_col)

Floor  Bathroom     1     2     3     4     5     6     7     8     9  ...  \
0             0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
1             1   3.0  32.0  19.0  17.0  34.0  20.0  20.0  16.0   7.0  ...   
2             2  11.0  66.0  42.0  52.0  52.0  51.0  17.0  19.0  18.0  ...   
3             3   7.0   4.0   8.0   2.0   2.0   4.0   4.0   1.0   1.0  ...   
4             4   1.0   1.0   NaN   1.0   NaN   NaN   NaN   NaN   NaN  ...   
5             5   NaN   NaN   NaN   3.0   NaN   NaN   1.0   NaN   NaN  ...   

Floor   12   13   14   15   16   17   18   19   21   22  
0      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  
1      4.0  5.0  5.0  NaN  NaN  NaN  NaN  NaN  NaN  1.0  
2      5.0  9.0  1.0  2.0  NaN  2.0  3.0  NaN  2.0  NaN  
3      4.0  NaN  NaN  NaN  1.0  NaN  NaN  4.0  1.0  NaN  
4      NaN  NaN  NaN  2.0  1.0  NaN  NaN  3.0  1.0  NaN  
5      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  

[6 rows x 22 columns]


# MULTIPLE DATAFRAMES

In order to efficiently store data, we often spread related information across multiple tables.

In [54]:
# Before doing this we need to make two dataframes
demo1 = pd.DataFrame([
     [1,2,3,1,'2017-01-01'],
     [2,2,2,3,'2017-01-01'],
     [3,3,1,1,'2017-01-01'],
     [4,3,2,2,'2017-02-01'],
     [5,3,3,3,'2017-02-01'],
     [6,1,4,2,'2017-03-01'],
     [7,1,1,1,'2017-02-02'],
     [8,1,4,1,'2017-02-02']
 ],
 columns = ['order_id', 'customer_id', 'product_id', 'quantity', 'timestamp'])
demo1.head()

Unnamed: 0,order_id,customer_id,product_id,quantity,timestamp
0,1,2,3,1,2017-01-01
1,2,2,2,3,2017-01-01
2,3,3,1,1,2017-01-01
3,4,3,2,2,2017-02-01
4,5,3,3,3,2017-02-01


In [55]:
demo2 = pd.DataFrame([
    [1, 'thing-a-ma-jig', 5],
    [2, 'whatcha-ma-call-it', 10],
    [3, 'doo-hickey', 7],
    [4, 'gizmo', 3]
],
 columns = ['product_id', 'description', 'price'])
demo2.head()


Unnamed: 0,product_id,description,price
0,1,thing-a-ma-jig,5
1,2,whatcha-ma-call-it,10
2,3,doo-hickey,7
3,4,gizmo,3


In [56]:
merge1 = pd.merge(demo1, demo2)
print(merge1)

   order_id  customer_id  product_id  quantity   timestamp  \
0         1            2           3         1  2017-01-01   
1         5            3           3         3  2017-02-01   
2         2            2           2         3  2017-01-01   
3         4            3           2         2  2017-02-01   
4         3            3           1         1  2017-01-01   
5         7            1           1         1  2017-02-02   
6         6            1           4         2  2017-03-01   
7         8            1           4         1  2017-02-02   

          description  price  
0          doo-hickey      7  
1          doo-hickey      7  
2  whatcha-ma-call-it     10  
3  whatcha-ma-call-it     10  
4      thing-a-ma-jig      5  
5      thing-a-ma-jig      5  
6               gizmo      3  
7               gizmo      3  


INNER MERGE 2

Inner Merge III
In addition to using pd.merge, each DataFrame has its own merge method. For instance, if you wanted to merge orders with customers, you could use:

new_df = orders.merge(customers)

This produces the same DataFrame as if we had called pd.merge(orders, customers).

We generally use this when we are joining more than two DataFrames together because we can “chain” the commands. The following command would merge orders to customers, and then the resulting DataFrame to products:

big_df = orders.merge(customers)\
    .merge(products)

In [57]:
merge2 = demo1.merge(demo2)

In [58]:
print(merge2)

   order_id  customer_id  product_id  quantity   timestamp  \
0         1            2           3         1  2017-01-01   
1         5            3           3         3  2017-02-01   
2         2            2           2         3  2017-01-01   
3         4            3           2         2  2017-02-01   
4         3            3           1         1  2017-01-01   
5         7            1           1         1  2017-02-02   
6         6            1           4         2  2017-03-01   
7         8            1           4         1  2017-02-02   

          description  price  
0          doo-hickey      7  
1          doo-hickey      7  
2  whatcha-ma-call-it     10  
3  whatcha-ma-call-it     10  
4      thing-a-ma-jig      5  
5      thing-a-ma-jig      5  
6               gizmo      3  
7               gizmo      3  


In [59]:
# to merge 3 different DataFrames we need to create one more DataFrame i.e demo3

demo3 = pd.DataFrame([
    [1, 'John Smith', 'Main St.' '212-123-4567'],
    [2, 'Jane Doe', '456 Park Ave.', '949-867-5309'],
    [3, 'Joe Schmo', '798 Broadway', '112-358-1321']
],
columns = ['customer_id', 'customer_name', 'address', 'phone_number'])

In [60]:
demo3.head()

Unnamed: 0,customer_id,customer_name,address,phone_number
0,1,John Smith,Main St.212-123-4567,
1,2,Jane Doe,456 Park Ave.,949-867-5309
2,3,Joe Schmo,798 Broadway,112-358-1321


In [61]:
merge3 = demo1.merge(demo2).merge(demo3)

In [62]:
print(merge3)

   order_id  customer_id  product_id  quantity   timestamp  \
0         1            2           3         1  2017-01-01   
1         2            2           2         3  2017-01-01   
2         5            3           3         3  2017-02-01   
3         4            3           2         2  2017-02-01   
4         3            3           1         1  2017-01-01   
5         7            1           1         1  2017-02-02   
6         6            1           4         2  2017-03-01   
7         8            1           4         1  2017-02-02   

          description  price customer_name               address  phone_number  
0          doo-hickey      7      Jane Doe         456 Park Ave.  949-867-5309  
1  whatcha-ma-call-it     10      Jane Doe         456 Park Ave.  949-867-5309  
2          doo-hickey      7     Joe Schmo          798 Broadway  112-358-1321  
3  whatcha-ma-call-it     10     Joe Schmo          798 Broadway  112-358-1321  
4      thing-a-ma-jig      5     Joe

In [63]:
list1 = [1,1,2,1,3,2,4,4,4,4,1,2,5]
list2 = []

for i in list1:
    if i in list2:
        continue
    else:
        list2.append(i)
print(list2)

[1, 2, 3, 4, 5]
