# 2. Refine the Data
 
> "Data is messy"

- **Missing** e.g. Check for missing or incomplete data
- **Quality** e.g. Check for duplicates, accuracy, unusual data
- **Parse** e.g. extract year and month from date
- **Convert** e.g. free text to coded value
- **Derive** e.g. gender from title
- **Calculate** e.g. percentages, proportion
- **Remove** e.g. remove redundant data
- **Merge** e.g. first and surname for full name
- **Aggregate** e.g. rollup by year, cluster by area
- **Filter** e.g. exclude based on location
- **Sample** e.g. extract a representative data
- **Summary** e.g. show summary stats like mean

In [1]:
# Load the libraries
import numpy as np
import pandas as pd

In [11]:
# Load the data again!
df = pd.read_csv("D:\Self-Study\PYTHON\Week_5\Pandas\data\Weed_Price.csv", parse_dates=[-1])

In [12]:
df.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date
0,Alabama,339.06,1042,198.64,933,149.49,123,2014-01-01
1,Alaska,288.75,252,260.6,297,388.58,26,2014-01-01
2,Arizona,303.31,1941,209.35,1625,189.45,222,2014-01-01
3,Arkansas,361.85,576,185.62,544,125.87,112,2014-01-01
4,California,248.78,12096,193.56,12812,192.92,778,2014-01-01


In [15]:
df.dtypes

State             object
HighQ            float64
HighQN             int64
MedQ             float64
MedQN              int64
LowQ             float64
LowQN              int64
date      datetime64[ns]
dtype: object

## 2.1 Missing Data

By “missing” data we simply mean null or “not present for whatever reason”. Lets see if we can find the missing data in our data set either because it exists and was not collected or it never existed

In [38]:
df = pd.read_csv("D:\Self-Study\PYTHON\Week_5\Pandas\data\Weed_Price.csv", parse_dates=[-1])
df.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date
0,Alabama,339.06,1042,198.64,933,149.49,123,2014-01-01
1,Alaska,288.75,252,260.6,297,388.58,26,2014-01-01
2,Arizona,303.31,1941,209.35,1625,189.45,222,2014-01-01
3,Arkansas,361.85,576,185.62,544,125.87,112,2014-01-01
4,California,248.78,12096,193.56,12812,192.92,778,2014-01-01


In [57]:
df.groupby(['State'],as_index=False)['HighQ','HighQN','MedQ','MedQN','LowQ','LowQN'].mean()

  df.groupby(['State'],as_index=False)['HighQ','HighQN','MedQ','MedQN','LowQ','LowQN'].mean()


Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN
0,Alabama,339.561849,1379.414254,204.606169,1270.351893,146.832603,161.14922
1,Alaska,291.482004,321.244989,262.046392,407.917595,387.232727,32.334076
2,Arizona,300.667483,2392.465479,209.365345,2137.414254,190.82686,279.006682
3,Arkansas,348.056147,751.988864,190.414655,724.683742,127.345455,135.902004
4,California,245.376125,14947.073497,191.268909,16769.821826,190.795992,976.298441
5,Colorado,238.918708,2816.218263,196.532517,2457.512249,226.79062,165.349666
6,Connecticut,341.694076,1625.120267,271.323898,1777.227171,253.024876,110.229399
7,Delaware,366.781849,440.971047,231.230312,372.587973,205.045992,39.175947
8,District of Columbia,348.177416,575.091314,288.251314,494.650334,210.563554,46.583519
9,Florida,302.570312,8415.03118,217.882561,7127.216036,153.205372,632.077951


In [54]:
f = {"HighQ":["mean"],"MedQ":lambda x:x.iloc[0]}

In [55]:
df.groupby('State',as_index=False)['HighQ','MedQ'].agg(f)

  df.groupby('State',as_index=False)['HighQ','MedQ'].agg(f)


Unnamed: 0_level_0,State,HighQ,MedQ
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,<lambda>
0,Alabama,339.561849,198.64
1,Alaska,291.482004,260.6
2,Arizona,300.667483,209.35
3,Arkansas,348.056147,185.62
4,California,245.376125,193.56
5,Colorado,238.918708,195.29
6,Connecticut,341.694076,273.97
7,Delaware,366.781849,226.25
8,District of Columbia,348.177416,295.67
9,Florida,302.570312,220.03


In [None]:
#df.rename(columns={'present_name':'changed_name'},inplace=True)

In [56]:
# set column name for the resultant column after groupby and applying agg function on only one column

##work for both default and 1.1.0 pandas version
pd.DataFrame(df.groupby(['State'],as_index=False)['HighQ'].agg({'mean_HighQ':'mean', 'count_HighQ':'count'})) 



Unnamed: 0,State,mean_HighQ,count_HighQ
0,Alabama,339.561849,449
1,Alaska,291.482004,449
2,Arizona,300.667483,449
3,Arkansas,348.056147,449
4,California,245.376125,449
5,Colorado,238.918708,449
6,Connecticut,341.694076,449
7,Delaware,366.781849,449
8,District of Columbia,348.177416,449
9,Florida,302.570312,449


In [10]:
#set column name for multiple columns on which multiple agg functions are applied after groupby

##for default versions and 1.1.0 pandas version specifically
df1 = pd.DataFrame(df.groupby('State').agg({'HighQ': ['count','nunique'],'MedQ': ['sum','median']})).reset_index()
df1.columns = df1.columns.map('_'.join)
#alternative --- df1.columns = ["_".join(x) for x in df1.columns.ravel()]
df1
#otherwise suggested is to use rename function after the prev steps to set used given column names 


##for default pandas version (hopefully should work)
#df.groupby('State')['HighQ','MedQ'].agg({'new_HighQ':{'HighQ':'count'},'new_MedQ':{'MedQ':'sum'}}).reset_index()

Unnamed: 0,State_,HighQ_count,HighQ_nunique,MedQ_sum,MedQ_median
0,Alabama,449,211,91868.17,204.54
1,Alaska,449,100,117658.83,261.53
2,Arizona,449,250,94005.04,209.35
3,Arkansas,449,195,85496.18,183.61
4,California,449,256,85879.74,191.57
5,Colorado,449,245,88243.1,196.58
6,Connecticut,449,270,121824.43,271.62
7,Delaware,449,125,103822.41,230.27
8,District of Columbia,449,166,129424.84,288.86
9,Florida,449,292,97829.27,217.79


In [16]:
df.isnull()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
22894,False,False,False,False,False,True,False,False
22895,False,False,False,False,False,True,False,False
22896,False,False,False,False,False,True,False,False
22897,False,False,False,False,False,True,False,False


In [8]:
df['LowQ'].isnull().sum()

10557

In [17]:
df.isnull().sum()

State         0
HighQ         0
HighQN        0
MedQ          0
MedQN         0
LowQ      10557
LowQN         0
date          0
dtype: int64

In [106]:
df['LowQ'].isnull().sum()

10557

In [5]:
# We can see the bottom rows which have NaN values
df.tail()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date
22894,Virginia,364.98,3513,293.12,3079,,284,2014-12-31
22895,Washington,233.05,3337,189.92,3562,,160,2014-12-31
22896,West Virginia,359.35,551,224.03,545,,60,2014-12-31
22897,Wisconsin,350.52,2244,272.71,2221,,167,2014-12-31
22898,Wyoming,322.27,131,351.86,197,,12,2014-12-31


**Pandas will represent missing value by NaN**

What can we do this with missing value?
- Drop these rows / columns? Use `.dropna(how='any')`
- Fill with a dummy value? Use `.fillna(value=dummy)`
- Impute the cell with the most recent value? Use `.fillna(method='ffill')`
- Interpolate the amount in a linear fashion? Use `.interpolate()`

We use the `inplace = True` operator to avoid making a copy of the dataframe and changing the dataframe itself

In [19]:
dummy = pd.DataFrame([[np.nan, 2, np.nan, 0],
                    [3, 4, np.nan, 1],
                    [np.nan, np.nan, np.nan, 5],
                    [np.nan, 3, np.nan, 4]],
                  columns=list('ABCD'))

In [20]:
dummy

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


In [11]:
dummy_1=pd.DataFrame(index = range(3) , columns=list('abcd'))

In [12]:
dummy_1['a'] = [1,2,3]
dummy_1

Unnamed: 0,a,b,c,d
0,1,,,
1,2,,,
2,3,,,


In [29]:
dummy.fillna(method='bfill')

Unnamed: 0,A,B,C,D
0,3.0,2.0,,0
1,3.0,4.0,,1
2,,3.0,,5
3,,3.0,,4


In [22]:
#dummy.fillna(0,inplace=True)
dummy.fillna(method='ffill')
#dummy.fillna(method='bfill')

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,3.0,4.0,,5
3,3.0,3.0,,4


In [28]:
dummy

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


In [31]:
values = {'A': 0, 'B': 1, 'C': 'apple', 'D': 3}
#dummy.fillna(value=values)
dummy.fillna(value=values,limit=2)
#dummy.fillna(method='ffill')

Unnamed: 0,A,B,C,D
0,0.0,2.0,apple,0
1,3.0,4.0,apple,1
2,0.0,1.0,,5
3,,3.0,,4


In [18]:
dummy.mean()

A    3.0
B    3.0
C    NaN
D    2.5
dtype: float64

In [17]:
dummy.fillna(dummy.mean())

Unnamed: 0,A,B,C,D
0,3.0,2.0,,0
1,3.0,4.0,,1
2,3.0,3.0,,5
3,3.0,3.0,,4


In [19]:
dummy.A.fillna(dummy.A.mean())

0    3.0
1    3.0
2    3.0
3    3.0
Name: A, dtype: float64

In [23]:
# Lets sort this data frame by State and Date
df.sort_values(['State','date'], inplace=True)

In [25]:
df[['State','HighQ','HighQN']]

Unnamed: 0,State,HighQ,HighQN
20094,Alabama,339.65,1033
20859,Alabama,339.65,1033
21573,Alabama,339.75,1036
22287,Alabama,339.75,1036
22797,Alabama,339.42,1040
...,...,...,...
4997,Wyoming,313.72,148
5762,Wyoming,313.72,148
6527,Wyoming,313.72,148
7343,Wyoming,313.72,148


In [26]:
df = df.fillna(0)

In [36]:
# Lets fill the missing value with last available value
df.fillna(method = "ffill", inplace=True)

In [27]:
df.isnull().sum()

State     0
HighQ     0
HighQN    0
MedQ      0
MedQN     0
LowQ      0
LowQN     0
date      0
dtype: int64

In [28]:
df.tail()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date
4997,Wyoming,313.72,148,317.38,226,0.0,13,2015-06-07
5762,Wyoming,313.72,148,317.38,226,0.0,13,2015-06-08
6527,Wyoming,313.72,148,317.38,226,0.0,13,2015-06-09
7343,Wyoming,313.72,148,317.38,226,0.0,13,2015-06-10
8159,Wyoming,313.72,148,317.38,226,0.0,13,2015-06-11


In [29]:
df.count()

State     22899
HighQ     22899
HighQN    22899
MedQ      22899
MedQN     22899
LowQ      22899
LowQN     22899
date      22899
dtype: int64

In [40]:
df.shape[0],df.shape[1]

(22899, 8)

### Exercise

Fill the missing value with a backward fill.

Fill the missing values with the mean for the column.

## 2.2 Quality of the Data 

Lets check for completeness.

**Say, do we have data on each date for all the 51 states?**

In [42]:
df.columns

Index(['State', 'HighQ', 'HighQN', 'MedQ', 'MedQN', 'LowQ', 'LowQN', 'date'], dtype='object')

In [41]:
df.State.unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Montana',
       'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
       'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
       'Oregon', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Pennsylvania', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin',
       'Wyoming'], dtype=object)

In [30]:
df.State.nunique()

51

In [31]:
pd.unique(df["State"])

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [34]:
s1=set(df.HighQ.unique())
len(s1)

6943

**Lets check the dates and see if they are all continuous**

In [39]:
df[(df['State']=='Alabama') & (df['HighQ']>=300)] 
#df[(df["year"] == 2014) & (df["State"] == "California")]

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date
0,Alabama,339.06,1042,198.64,933,149.49,123,2014-01-01
51,Alabama,337.54,1539,208.24,1463,,182,2015-01-01
102,Alabama,338.80,1095,200.02,984,152.02,129,2014-02-01
153,Alabama,335.85,1583,209.19,1524,,193,2015-02-01
204,Alabama,340.14,1140,202.42,1034,146.33,133,2014-03-01
...,...,...,...,...,...,...,...,...
22644,Alabama,340.37,1378,203.51,1236,146.35,159,2014-07-31
22695,Alabama,340.73,1403,205.01,1278,,161,2014-08-31
22746,Alabama,338.92,1451,204.95,1347,,171,2014-10-31
22797,Alabama,339.42,1040,198.68,932,149.49,123,2013-12-31


KeyError: 321

## 2.3  Parse the Data

Lets see if we can get the year, month, week and weekdays from the date. Pandas has got good built in functionality for timeseries data using the DatetimeIndex method 

In [41]:
df['year'] = pd.DatetimeIndex(df['date']).year
df['month'] = pd.DatetimeIndex(df['date']).month
df['week'] = pd.DatetimeIndex(df['date']).week
df['weekday'] = pd.DatetimeIndex(df['date']).weekday

  df['week'] = pd.DatetimeIndex(df['date']).week


In [42]:
df.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date,year,month,week,weekday
0,Alabama,339.06,1042,198.64,933,149.49,123,2014-01-01,2014,1,1,2
1,Alaska,288.75,252,260.6,297,388.58,26,2014-01-01,2014,1,1,2
2,Arizona,303.31,1941,209.35,1625,189.45,222,2014-01-01,2014,1,1,2
3,Arkansas,361.85,576,185.62,544,125.87,112,2014-01-01,2014,1,1,2
4,California,248.78,12096,193.56,12812,192.92,778,2014-01-01,2014,1,1,2


In [45]:
#remove duplicate rows
df['State'].drop_duplicates()

0                  Alabama
1                   Alaska
2                  Arizona
3                 Arkansas
4               California
5                 Colorado
6              Connecticut
7                 Delaware
8     District of Columbia
9                  Florida
10                 Georgia
11                  Hawaii
12                   Idaho
13                Illinois
14                 Indiana
15                    Iowa
16                  Kansas
17                Kentucky
18               Louisiana
19                   Maine
20                 Montana
21                Nebraska
22                  Nevada
23           New Hampshire
24              New Jersey
25              New Mexico
26                New York
27          North Carolina
28            North Dakota
29                    Ohio
30                Oklahoma
31                  Oregon
32                Maryland
33           Massachusetts
34                Michigan
35               Minnesota
36             Mississippi
3

In [43]:
df.drop_duplicates()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date
0,Alabama,339.06,1042,198.64,933,149.49,123,2014-01-01
1,Alaska,288.75,252,260.60,297,388.58,26,2014-01-01
2,Arizona,303.31,1941,209.35,1625,189.45,222,2014-01-01
3,Arkansas,361.85,576,185.62,544,125.87,112,2014-01-01
4,California,248.78,12096,193.56,12812,192.92,778,2014-01-01
...,...,...,...,...,...,...,...,...
22894,Virginia,364.98,3513,293.12,3079,0.00,284,2014-12-31
22895,Washington,233.05,3337,189.92,3562,0.00,160,2014-12-31
22896,West Virginia,359.35,551,224.03,545,0.00,60,2014-12-31
22897,Wisconsin,350.52,2244,272.71,2221,0.00,167,2014-12-31


In [53]:
df.groupby('State')['HighQ','LowQN'].sum().reset_index()

  df.groupby('State')['HighQ','LowQN'].sum().reset_index()


Unnamed: 0,State,HighQ,LowQN
0,Alabama,152463.27,72356
1,Alaska,130875.42,14518
2,Arizona,134999.7,125274
3,Arkansas,156277.21,61020
4,California,110173.88,438358
5,Colorado,107274.5,74242
6,Connecticut,153420.64,49493
7,Delaware,164685.05,17590
8,District of Columbia,156331.66,20916
9,Florida,135854.07,283803


In [27]:
f = {}

In [69]:
df

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date,year,month,week,weekday
20094,Alabama,339.65,1033,198.04,926,147.15,122,2013-12-27,2013,12,52,4
20859,Alabama,339.65,1033,198.04,926,147.15,122,2013-12-28,2013,12,52,5
21573,Alabama,339.75,1036,198.26,929,149.49,123,2013-12-29,2013,12,52,6
22287,Alabama,339.75,1036,198.81,930,149.49,123,2013-12-30,2013,12,1,0
22797,Alabama,339.42,1040,198.68,932,149.49,123,2013-12-31,2013,12,1,1
0,Alabama,339.06,1042,198.64,933,149.49,123,2014-01-01,2014,1,1,2
765,Alabama,339.20,1043,198.64,933,149.49,123,2014-01-02,2014,1,1,3
1479,Alabama,339.20,1043,198.64,933,148.48,124,2014-01-03,2014,1,1,4
2244,Alabama,339.20,1043,198.43,934,148.48,124,2014-01-04,2014,1,1,5
3009,Alabama,339.32,1046,198.13,936,148.48,124,2014-01-05,2014,1,1,6


In [58]:
df.State.value_counts()

Kentucky                449
Pennsylvania            449
Utah                    449
Ohio                    449
Alaska                  449
Colorado                449
Minnesota               449
North Carolina          449
Mississippi             449
Missouri                449
Oregon                  449
Iowa                    449
South Dakota            449
Wyoming                 449
Delaware                449
Nebraska                449
Texas                   449
Massachusetts           449
Washington              449
West Virginia           449
Maryland                449
Oklahoma                449
Hawaii                  449
South Carolina          449
Arizona                 449
Louisiana               449
Wisconsin               449
Michigan                449
Illinois                449
Florida                 449
Vermont                 449
Montana                 449
Virginia                449
Rhode Island            449
District of Columbia    449
Idaho               

In [61]:
df.State.value_counts()[0]

449

In [62]:
df.State.value_counts().index[0]

'Kentucky'

In [144]:
df['year'].value_counts()[0]

18564

In [156]:
df.State.value_counts()

449

In [17]:
df["weekday"].value_counts()

6    3315
0    3315
4    3264
3    3264
2    3264
1    3264
5    3213
Name: weekday, dtype: int64

## 2.4 Aggregate the Data

To aggregate, we typically use the “group by” function, which involves the following steps

- Splitting the data into groups based on some criteria
- Applying a function to each group independently
- Combining the results into a data structure

In [43]:
df_mean = df.groupby(["State"],as_index=False).mean()

In [44]:
df_mean

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,year,month,week,weekday
0,Alabama,339.561849,1379.414254,204.606169,1270.351893,146.832603,161.14922,2014.167038,5.953229,23.812918,2.995546
1,Alaska,291.482004,321.244989,262.046392,407.917595,387.232727,32.334076,2014.167038,5.953229,23.812918,2.995546
2,Arizona,300.667483,2392.465479,209.365345,2137.414254,190.82686,279.006682,2014.167038,5.953229,23.812918,2.995546
3,Arkansas,348.056147,751.988864,190.414655,724.683742,127.345455,135.902004,2014.167038,5.953229,23.812918,2.995546
4,California,245.376125,14947.073497,191.268909,16769.821826,190.795992,976.298441,2014.167038,5.953229,23.812918,2.995546
5,Colorado,238.918708,2816.218263,196.532517,2457.512249,226.79062,165.349666,2014.167038,5.953229,23.812918,2.995546
6,Connecticut,341.694076,1625.120267,271.323898,1777.227171,253.024876,110.229399,2014.167038,5.953229,23.812918,2.995546
7,Delaware,366.781849,440.971047,231.230312,372.587973,205.045992,39.175947,2014.167038,5.953229,23.812918,2.995546
8,District of Columbia,348.177416,575.091314,288.251314,494.650334,210.563554,46.583519,2014.167038,5.953229,23.812918,2.995546
9,Florida,302.570312,8415.03118,217.882561,7127.216036,153.205372,632.077951,2014.167038,5.953229,23.812918,2.995546


In [76]:
df_mean.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN
0,Alabama,339.561849,1379.414254,204.606169,1270.351893,79.139176,161.14922
1,Alaska,291.482004,321.244989,262.046392,407.917595,208.708953,32.334076
2,Arizona,300.667483,2392.465479,209.365345,2137.414254,102.851002,279.006682
3,Arkansas,348.056147,751.988864,190.414655,724.683742,68.63608,135.902004
4,California,245.376125,14947.073497,191.268909,16769.821826,102.834365,976.298441


Add Pivot table examples and exercise

## 2.5 Derive the Data

Lets us load the demographic dataset and create a new column for others in the population

In [63]:
pwd

'D:\\Self-Study\\PYTHON\\Week_5\\Pandas\\Pandas_notebooks'

In [64]:
df_demo = pd.read_csv("D:\Self-Study\PYTHON\Week_5\Pandas\data\Demographics_State.csv")

In [65]:
df_demo.head()

Unnamed: 0,region,total_population,percent_white,percent_black,percent_asian,percent_hispanic,per_capita_income,median_rent,median_age
0,alabama,4799277,67,26,1,4,23680,501,38.1
1,alaska,720316,63,3,5,6,32651,978,33.6
2,arizona,6479703,57,4,3,30,25358,747,36.3
3,arkansas,2933369,74,15,1,7,22170,480,37.5
4,california,37659181,40,6,13,38,29527,1119,35.4


In [66]:
df_demo['Random'] = "US"

In [67]:
df_demo

Unnamed: 0,region,total_population,percent_white,percent_black,percent_asian,percent_hispanic,per_capita_income,median_rent,median_age,Random
0,alabama,4799277,67,26,1,4,23680,501,38.1,US
1,alaska,720316,63,3,5,6,32651,978,33.6,US
2,arizona,6479703,57,4,3,30,25358,747,36.3,US
3,arkansas,2933369,74,15,1,7,22170,480,37.5,US
4,california,37659181,40,6,13,38,29527,1119,35.4,US
5,colorado,5119329,70,4,3,21,31109,825,36.1,US
6,connecticut,3583561,70,9,4,14,37892,880,40.2,US
7,delaware,908446,65,21,3,8,29819,828,38.9,US
8,district of columbia,619371,35,49,3,10,45290,1154,33.8,US
9,florida,19091156,57,15,2,23,26236,838,41.0,US


In [74]:
'Alabama'=='alabama'

False

In [79]:
df_demo[['total_population','region']]

Unnamed: 0,total_population,region
0,4799277,alabama
1,720316,alaska
2,6479703,arizona
3,2933369,arkansas
4,37659181,california
5,5119329,colorado
6,3583561,connecticut
7,908446,delaware
8,619371,district of columbia
9,19091156,florida


In [68]:
df_demo["percent_other"] = 100 - df_demo["percent_white"] - df_demo["percent_black"] - df_demo["percent_asian"] - df_demo["percent_hispanic"]

In [70]:
df_demo.head()

Unnamed: 0,region,total_population,percent_white,percent_black,percent_asian,percent_hispanic,per_capita_income,median_rent,median_age,Random,percent_other
0,alabama,4799277,67,26,1,4,23680,501,38.1,US,2
1,alaska,720316,63,3,5,6,32651,978,33.6,US,23
2,arizona,6479703,57,4,3,30,25358,747,36.3,US,6
3,arkansas,2933369,74,15,1,7,22170,480,37.5,US,3
4,california,37659181,40,6,13,38,29527,1119,35.4,US,3


Add an exercise?

## 2.6 Merge the Data 

Lets merge the demographic dataset with the price dataset

In [71]:
# Let us change the column name region to State
df_demo = df_demo.rename(columns={'region':'State'})

In [72]:
df_demo.columns

Index(['State', 'total_population', 'percent_white', 'percent_black',
       'percent_asian', 'percent_hispanic', 'per_capita_income', 'median_rent',
       'median_age', 'Random', 'percent_other'],
      dtype='object')

In [73]:
# We can now merge Demographic and Price mean data into one single data frame
df_merge = pd.merge(df_mean, df_demo, how='inner', on='State')
df_merge.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,year,month,week,...,total_population,percent_white,percent_black,percent_asian,percent_hispanic,per_capita_income,median_rent,median_age,Random,percent_other


In [None]:
df_mean.merge(df_merge,how=,on='')

What happened? Why is there no data in the dataframe?

In [75]:
# Change the State in df_mean to lower case
df_mean['State'] = df_mean['State'].str.lower()

In [76]:
df_mean.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,year,month,week,weekday
0,alabama,339.561849,1379.414254,204.606169,1270.351893,146.832603,161.14922,2014.167038,5.953229,23.812918,2.995546
1,alaska,291.482004,321.244989,262.046392,407.917595,387.232727,32.334076,2014.167038,5.953229,23.812918,2.995546
2,arizona,300.667483,2392.465479,209.365345,2137.414254,190.82686,279.006682,2014.167038,5.953229,23.812918,2.995546
3,arkansas,348.056147,751.988864,190.414655,724.683742,127.345455,135.902004,2014.167038,5.953229,23.812918,2.995546
4,california,245.376125,14947.073497,191.268909,16769.821826,190.795992,976.298441,2014.167038,5.953229,23.812918,2.995546


In [77]:
# We can now merge Demographic and Price mean data into one single data frame
df_merge = pd.merge(df_mean, df_demo, how='inner', on='State')

In [None]:
df_merge = pd.merge(df_mean, df_demo, how='inner', left_on='State',right_on='region')

In [78]:
df_merge.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,year,month,week,...,total_population,percent_white,percent_black,percent_asian,percent_hispanic,per_capita_income,median_rent,median_age,Random,percent_other
0,alabama,339.561849,1379.414254,204.606169,1270.351893,146.832603,161.14922,2014.167038,5.953229,23.812918,...,4799277,67,26,1,4,23680,501,38.1,US,2
1,alaska,291.482004,321.244989,262.046392,407.917595,387.232727,32.334076,2014.167038,5.953229,23.812918,...,720316,63,3,5,6,32651,978,33.6,US,23
2,arizona,300.667483,2392.465479,209.365345,2137.414254,190.82686,279.006682,2014.167038,5.953229,23.812918,...,6479703,57,4,3,30,25358,747,36.3,US,6
3,arkansas,348.056147,751.988864,190.414655,724.683742,127.345455,135.902004,2014.167038,5.953229,23.812918,...,2933369,74,15,1,7,22170,480,37.5,US,3
4,california,245.376125,14947.073497,191.268909,16769.821826,190.795992,976.298441,2014.167038,5.953229,23.812918,...,37659181,40,6,13,38,29527,1119,35.4,US,3


In [None]:
#df.drop('column_name',axis=1,inplace=True)

## 2.7 Filter the Data

Lets start by filtering the data 
- by location
- by Year
- by location & Year

In [174]:
# Filter data for location California
df_cal = df[df["State"] == "California"]

In [176]:
df_cal

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date,year,month,week,weekday
20098,California,248.77,12021,193.44,12724,193.88,770,2013-12-27,2013,12,52,4
20863,California,248.74,12025,193.44,12728,193.88,770,2013-12-28,2013,12,52,5
21577,California,248.76,12047,193.55,12760,193.60,772,2013-12-29,2013,12,52,6
22291,California,248.82,12065,193.54,12779,193.80,773,2013-12-30,2013,12,1,0
22801,California,248.76,12082,193.54,12792,193.80,773,2013-12-31,2013,12,1,1
4,California,248.78,12096,193.56,12812,192.92,778,2014-01-01,2014,1,1,2
769,California,248.67,12125,193.56,12836,192.80,779,2014-01-02,2014,1,1,3
1483,California,248.67,12141,193.57,12853,192.67,782,2014-01-03,2014,1,1,4
2248,California,248.65,12155,193.59,12884,192.67,782,2014-01-04,2014,1,1,5
3013,California,248.68,12176,193.63,12902,192.67,782,2014-01-05,2014,1,1,6


In [175]:
df_cal.shape

(449, 12)

In [34]:
# Filter data for year
df_2014 = df[df["year"] == 2014]

In [35]:
df_2014.shape

(18564, 12)

In [36]:
df_cal_2014 = df[(df["year"] == 2014) & (df["State"] == "California")]

In [37]:
df_cal_2014.shape

(364, 12)

In [38]:
df_cal_2014.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date,year,month,week,weekday
4,California,248.78,12096,193.56,12812,192.92,778,2014-01-01,2014,1,1,2
769,California,248.67,12125,193.56,12836,192.8,779,2014-01-02,2014,1,1,3
1483,California,248.67,12141,193.57,12853,192.67,782,2014-01-03,2014,1,1,4
2248,California,248.65,12155,193.59,12884,192.67,782,2014-01-04,2014,1,1,5
3013,California,248.68,12176,193.63,12902,192.67,782,2014-01-05,2014,1,1,6


Exercise to get simple metrics???

## 2.8 Summarise the Data

We can use the describe function to get the summary stats for each column in the data frame

In [39]:
df.describe()

Unnamed: 0,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,year,month,week,weekday
count,22899.0,22899.0,22899.0,22899.0,22899.0,22899.0,22899.0,22899.0,22899.0,22899.0
mean,329.759854,2274.743657,247.618306,2183.737805,203.624092,202.804489,2014.167038,5.953229,23.812918,2.995546
std,41.173167,2641.936586,44.276015,2789.902626,101.484265,220.531987,0.401765,3.553055,15.426018,2.005599
min,202.02,93.0,144.85,134.0,63.7,11.0,2013.0,1.0,1.0,0.0
25%,303.78,597.0,215.775,548.0,145.81,51.0,2014.0,3.0,9.0,1.0
50%,342.31,1420.0,245.8,1320.0,185.78,139.0,2014.0,6.0,22.0,3.0
75%,356.55,2958.0,274.155,2673.0,222.94,263.0,2014.0,9.0,37.0,5.0
max,415.7,18492.0,379.0,22027.0,734.65,1287.0,2015.0,12.0,52.0,6.0


We can also use convenience functions like sum(), count(), mean() etc. to calculate these

In [40]:
df.HighQ.mean()

329.7598541421045

In [41]:
# Lets do this the hard way
df.HighQ.sum()

7551170.90000005

In [42]:
df.HighQ.count()

22899

In [43]:
df.HighQ.sum()/df.HighQ.count()

329.7598541421045

In [44]:
df.HighQ.median()

342.31

Add more exercises???

## 2.9 Sample the Data

In [92]:
?df.sample

In [93]:
df_ca_sample = df[df.State=='California'].sample(n = 50,  random_state=42)

In [94]:
df_ca_sample.duplicated()

14539    False
15100    False
5971     False
17701    False
3574     False
1534     False
9796     False
4033     False
18568    False
18211    False
2809     False
7552     False
18874    False
15916    False
20149    False
20710    False
3829     False
12499    False
15508    False
3931     False
8419     False
463      False
4594     False
4747     False
5920     False
15202    False
3676     False
4186     False
10765    False
1993     False
8011     False
22393    False
3727     False
16987    False
6991     False
16018    False
21118    False
3880     False
19180    False
12091    False
12142    False
22597    False
5767     False
22240    False
11683    False
18619    False
5155     False
22546    False
3982     False
1687     False
dtype: bool

In [95]:
df_ca_sample.loc[8572]

KeyError: 'the label [8572] is not in the [index]'

## 2.10 Quirks in Pandas

In [96]:
df_ca_sample.iat[0, 0] = "Cal"

In [97]:
df_ca_sample.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date,year,month,week,weekday
14539,Cal,248.09,12858,192.92,13749,191.19,823,2014-02-20,2014,2,8,3
15100,California,244.1,16375,189.67,18922,,1079,2014-12-20,2014,12,51,5
5971,California,244.79,15792,190.71,17949,,1031,2014-11-08,2014,11,45,5
17701,California,245.46,14858,191.65,16522,188.81,967,2014-07-24,2014,7,30,3
3574,California,245.1,15264,191.26,17093,,997,2014-09-05,2014,9,36,4


But changes when you copy

In [98]:
df_ca_sample2 = df_ca_sample

In [99]:
df_ca_sample2.iat[0, 0] = "CA"
df_ca_sample2.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date,year,month,week,weekday
14539,CA,248.09,12858,192.92,13749,191.19,823,2014-02-20,2014,2,8,3
15100,California,244.1,16375,189.67,18922,,1079,2014-12-20,2014,12,51,5
5971,California,244.79,15792,190.71,17949,,1031,2014-11-08,2014,11,45,5
17701,California,245.46,14858,191.65,16522,188.81,967,2014-07-24,2014,7,30,3
3574,California,245.1,15264,191.26,17093,,997,2014-09-05,2014,9,36,4


In [100]:
df_ca_sample.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date,year,month,week,weekday
14539,CA,248.09,12858,192.92,13749,191.19,823,2014-02-20,2014,2,8,3
15100,California,244.1,16375,189.67,18922,,1079,2014-12-20,2014,12,51,5
5971,California,244.79,15792,190.71,17949,,1031,2014-11-08,2014,11,45,5
17701,California,245.46,14858,191.65,16522,188.81,967,2014-07-24,2014,7,30,3
3574,California,245.1,15264,191.26,17093,,997,2014-09-05,2014,9,36,4


Fix the issue

In [None]:
df_ca_sample3 = df_ca_sample2.copy()

In [None]:
df_ca_sample3.head()

In [None]:
df_ca_sample3.iat[0, 0] = "CALIFORNIA"
df_ca_sample3.head()

In [None]:
df_ca_sample2.head()

In [13]:
# Load the data again!
df = pd.read_csv("D:\\COEs\\PYTHON COE\\data\\Weed_Price.csv", parse_dates=[-1])

In [15]:
df.sort_values('State',ascending=0)

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date
22898,Wyoming,322.27,131,351.86,197,,12,2014-12-31
2753,Wyoming,351.78,110,360.09,160,161.30,12,2014-07-04
13157,Wyoming,354.11,95,378.34,140,161.30,11,2014-02-18
458,Wyoming,312.84,147,317.38,226,,13,2015-06-01
19532,Wyoming,320.39,133,350.57,202,,12,2015-01-27
2090,Wyoming,330.41,117,356.25,170,,12,2014-09-03
9740,Wyoming,351.78,110,359.34,162,161.30,12,2014-07-13
8210,Wyoming,351.78,110,359.34,162,161.30,12,2014-07-11
6119,Wyoming,354.03,94,378.95,134,161.30,11,2014-01-09
13106,Wyoming,320.39,133,351.58,199,,12,2015-01-18


In [92]:
df.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date
0,Alabama,339.06,1042,198.64,933,149.49,123,2014-01-01
1,Alaska,288.75,252,260.6,297,388.58,26,2014-01-01
2,Arizona,303.31,1941,209.35,1625,189.45,222,2014-01-01
3,Arkansas,361.85,576,185.62,544,125.87,112,2014-01-01
4,California,248.78,12096,193.56,12812,192.92,778,2014-01-01


In [79]:
def add_columns(data):
    res= data['HighQ'] + data['MedQ']
    return res

In [80]:
def add_columns_2(x,y):
    res=x+y
    return res

In [123]:
df['newnew']=add_columns(df)

In [81]:
df['new_column'] = df.apply(lambda x : add_columns(x),axis=1)

In [82]:
df['new_column_2'] = df[['HighQ','MedQ']].apply(lambda x : add_columns_2(x[0],x[1]),axis=1)

In [83]:
df

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date,year,month,week,weekday,new_column,new_column_2
0,Alabama,339.06,1042,198.64,933,149.49,123,2014-01-01,2014,1,1,2,537.70,537.70
1,Alaska,288.75,252,260.60,297,388.58,26,2014-01-01,2014,1,1,2,549.35,549.35
2,Arizona,303.31,1941,209.35,1625,189.45,222,2014-01-01,2014,1,1,2,512.66,512.66
3,Arkansas,361.85,576,185.62,544,125.87,112,2014-01-01,2014,1,1,2,547.47,547.47
4,California,248.78,12096,193.56,12812,192.92,778,2014-01-01,2014,1,1,2,442.34,442.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22894,Virginia,364.98,3513,293.12,3079,,284,2014-12-31,2014,12,1,2,658.10,658.10
22895,Washington,233.05,3337,189.92,3562,,160,2014-12-31,2014,12,1,2,422.97,422.97
22896,West Virginia,359.35,551,224.03,545,,60,2014-12-31,2014,12,1,2,583.38,583.38
22897,Wisconsin,350.52,2244,272.71,2221,,167,2014-12-31,2014,12,1,2,623.23,623.23


In [124]:
df['State_split'] = df['State'].apply(lambda x : x[:3])

In [125]:
df

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date,new_column,new_column_2,newnew,State_split
0,Alabama,339.06,1042,198.64,933,149.49,123,2014-01-01,537.70,537.70,537.70,Ala
1,Alaska,288.75,252,260.60,297,388.58,26,2014-01-01,549.35,549.35,549.35,Ala
2,Arizona,303.31,1941,209.35,1625,189.45,222,2014-01-01,512.66,512.66,512.66,Ari
3,Arkansas,361.85,576,185.62,544,125.87,112,2014-01-01,547.47,547.47,547.47,Ark
4,California,248.78,12096,193.56,12812,192.92,778,2014-01-01,442.34,442.34,442.34,Cal
...,...,...,...,...,...,...,...,...,...,...,...,...
22894,Virginia,364.98,3513,293.12,3079,0.00,284,2014-12-31,658.10,658.10,658.10,Vir
22895,Washington,233.05,3337,189.92,3562,0.00,160,2014-12-31,422.97,422.97,422.97,Was
22896,West Virginia,359.35,551,224.03,545,0.00,60,2014-12-31,583.38,583.38,583.38,Wes
22897,Wisconsin,350.52,2244,272.71,2221,0.00,167,2014-12-31,623.23,623.23,623.23,Wis
