In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
from datetime import datetime

In [3]:
pew = pd.read_csv("data/pew.csv")

In [4]:
pew

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
0,Agnostic,27,34,60,81,76,137,122,109,84,96
1,Atheist,12,27,37,52,35,70,73,59,74,76
2,Buddhist,27,21,30,34,33,58,62,39,53,54
3,Catholic,418,617,732,670,638,1116,949,792,633,1489
4,Don’t know/refused,15,14,15,11,10,35,21,17,18,116
5,Evangelical Prot,575,869,1064,982,881,1486,949,723,414,1529
6,Hindu,1,9,7,9,11,34,47,48,54,37
7,Historically Black Prot,228,244,236,238,197,223,131,81,78,339
8,Jehovah's Witness,20,27,24,24,21,30,15,11,6,37
9,Jewish,19,19,25,25,30,95,69,87,151,162


In [5]:
pew_tidy = pd.melt(pew, id_vars = ["religion"], var_name = "income", value_name = "count")

In [6]:
pew_tidy

Unnamed: 0,religion,income,count
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15
...,...,...,...
175,Orthodox,Don't know/refused,73
176,Other Christian,Don't know/refused,18
177,Other Faiths,Don't know/refused,71
178,Other World Religions,Don't know/refused,8


In [7]:
pew_tidy.dtypes

religion    object
income      object
count        int64
dtype: object

In [8]:
pew_tidy["religion"] = pd.Categorical(pew_tidy["religion"])
pew_tidy["income"] = pd.Categorical(pew_tidy["income"])

In [9]:
tb = pd.read_csv("data/tb.csv")

In [10]:
tb

Unnamed: 0,iso2,year,m04,m514,m014,m1524,m2534,m3544,m4554,m5564,...,f04,f514,f014,f1524,f2534,f3544,f4554,f5564,f65,fu
0,AD,1989,,,,,,,,,...,,,,,,,,,,
1,AD,1990,,,,,,,,,...,,,,,,,,,,
2,AD,1991,,,,,,,,,...,,,,,,,,,,
3,AD,1992,,,,,,,,,...,,,,,,,,,,
4,AD,1993,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5764,ZW,2004,,,187.0,833.0,2908.0,2298.0,1056.0,366.0,...,,,225.0,1140.0,2858.0,1565.0,622.0,214.0,111.0,
5765,ZW,2005,,,210.0,837.0,2264.0,1855.0,762.0,295.0,...,,,269.0,1136.0,2242.0,1255.0,578.0,193.0,603.0,
5766,ZW,2006,,,215.0,736.0,2391.0,1939.0,896.0,348.0,...,,,237.0,1020.0,2424.0,1355.0,632.0,230.0,96.0,
5767,ZW,2007,6.0,132.0,138.0,500.0,3693.0,0.0,716.0,292.0,...,7.0,178.0,185.0,739.0,3311.0,0.0,553.0,213.0,90.0,


In [11]:
tb.iso2.unique()

array(['AD', 'AE', 'AF', 'AG', 'AI', 'AL', 'AM', 'AN', 'AO', 'AR', 'AS',
       'AT', 'AU', 'AZ', 'BA', 'BB', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI',
       'BJ', 'BM', 'BN', 'BO', 'BR', 'BS', 'BT', 'BW', 'BY', 'BZ', 'CA',
       'CD', 'CF', 'CG', 'CH', 'CI', 'CK', 'CL', 'CM', 'CN', 'CO', 'CR',
       'CU', 'CV', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DM', 'DO', 'DZ', 'EC',
       'EE', 'EG', 'ER', 'ES', 'ET', 'FI', 'FJ', 'FM', 'FR', 'GA', 'GB',
       'GD', 'GE', 'GH', 'GM', 'GN', 'GQ', 'GR', 'GT', 'GU', 'GW', 'GY',
       'HK', 'HN', 'HR', 'HT', 'HU', 'ID', 'IE', 'IL', 'IN', 'IQ', 'IR',
       'IS', 'IT', 'JM', 'JO', 'JP', 'KE', 'KG', 'KH', 'KI', 'KM', 'KN',
       'KP', 'KR', 'KW', 'KY', 'KZ', 'LA', 'LB', 'LC', 'LK', 'LR', 'LS',
       'LT', 'LU', 'LV', 'LY', 'MA', 'MC', 'MD', 'ME', 'MG', 'MH', 'MK',
       'ML', 'MM', 'MN', 'MO', 'MP', 'MR', 'MS', 'MT', 'MU', 'MV', 'MW',
       'MX', 'MY', 'MZ', nan, 'NC', 'NE', 'NG', 'NI', 'NL', 'NO', 'NP',
       'NR', 'NU', 'NZ', 'OM', 'PA', 'PE', 'PF', 'PG

In [12]:
tb.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,5769.0,1994.229329,8.423265,1980.0,1987.0,1994.0,2002.0,2008.0
m04,392.0,8.806122,46.094789,0.0,0.0,0.0,2.0,655.0
m514,401.0,28.975062,127.881505,0.0,0.0,1.0,8.0,1519.0
m014,2381.0,79.438051,280.489536,0.0,0.0,6.0,36.0,4648.0
m1524,2407.0,922.364769,4278.638975,0.0,10.0,92.0,511.5,77121.0
m2534,2408.0,1301.894518,5253.398971,0.0,15.0,151.5,728.0,83850.0
m3544,2415.0,1205.566046,5409.064589,0.0,16.0,134.0,591.5,90498.0
m4554,2421.0,983.515489,4780.782748,0.0,13.0,98.0,434.0,78815.0
m5564,2414.0,696.171914,3771.934518,0.0,9.0,62.0,273.75,57492.0
m65,2408.0,609.440615,3660.224592,0.0,8.0,53.0,227.25,70376.0


In [13]:
tb_tidy = pd.melt(tb, id_vars = ["iso2", "year"], var_name = "sex_and_age", value_name = "cases")
parts = tb_tidy["sex_and_age"].str.extract("(\D)(\d+)(\d{2})", expand = True)
parts.columns = ["sex", "age_lower", "age_upper"]
parts["age"] = (parts.age_lower + "-" + parts.age_upper).dropna()
tb_tidy = pd.concat([tb_tidy, parts], axis = 1)
tb_tidy = tb_tidy.drop(["sex_and_age", "age_lower", "age_upper"], axis=1)
tb_tidy = tb_tidy.dropna()
tb_tidy = tb_tidy.sort_values(by = ["iso2", "year", "sex", "age", "cases"])
tb_tidy = tb_tidy.reset_index(drop = True) # if drop = True drops original column with index

In [14]:
tb_tidy["iso2"] = pd.Categorical(tb_tidy["iso2"])
tb_tidy["sex"] = pd.Categorical(tb_tidy["sex"])
tb_tidy["age"] = pd.Categorical(tb_tidy["age"])
tb_tidy["cases"] = tb_tidy["cases"].astype(np.int64)

In [15]:
tb_tidy.dtypes

iso2     category
year        int64
cases       int64
sex      category
age      category
dtype: object

In [16]:
tb_tidy["iso2"].unique()

['AD', 'AE', 'AF', 'AG', 'AI', ..., 'YE', 'YU', 'ZA', 'ZM', 'ZW']
Length: 212
Categories (212, object): ['AD', 'AE', 'AF', 'AG', ..., 'YU', 'ZA', 'ZM', 'ZW']

In [17]:
tb_tidy.groupby("iso2").size()

iso2
AD    131
AE    102
AF    144
AG     73
AI     13
     ... 
YE    168
YU     84
ZA    126
ZM    108
ZW     86
Length: 212, dtype: int64

In [18]:
tb_tidy.groupby(["iso2", "sex"])["cases"].sum()

iso2  sex
AD    f          24
      m          32
AE    f         199
      m         214
AF    f       52293
              ...  
ZA    m      485900
ZM    f       55274
      m       70629
ZW    f       39405
      m       44582
Name: cases, Length: 424, dtype: int64

In [19]:
tb_tidy.to_csv("data/tb_tidy.csv", index=None)

In [20]:
pd.Series([0.0, 3.0, 5.0, 1.0, np.nan]).astype("Int64")

0       0
1       3
2       5
3       1
4    <NA>
dtype: Int64

In [21]:
weather = pd.read_csv("data/weather.csv")

In [22]:
weather

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,
5,MX17004,2010,3,tmin,,,,,14.2,,...,,,,,,,,,,
6,MX17004,2010,4,tmax,,,,,,,...,,,,,,36.3,,,,
7,MX17004,2010,4,tmin,,,,,,,...,,,,,,16.7,,,,
8,MX17004,2010,5,tmax,,,,,,,...,,,,,,33.2,,,,
9,MX17004,2010,5,tmin,,,,,,,...,,,,,,18.2,,,,


In [23]:
temp_data = weather.melt(id_vars = ["id", "year", "month", "element"], var_name="day", value_name="value")

In [24]:
temp_data.day = temp_data.day.str.slice(1,).astype(np.int64)

In [25]:
temp_data = temp_data.dropna()

In [26]:
temp_data["date"] = pd.to_datetime(temp_data[["year", "month", "day"]])
temp_data = temp_data.drop(["year", "month", "day"], axis = 1)

In [27]:
temp_data

Unnamed: 0,id,element,value,date
20,MX17004,tmax,29.9,2010-12-01
21,MX17004,tmin,13.8,2010-12-01
24,MX17004,tmax,27.3,2010-02-02
25,MX17004,tmin,14.4,2010-02-02
40,MX17004,tmax,31.3,2010-11-02
...,...,...,...,...
631,MX17004,tmin,15.3,2010-08-29
638,MX17004,tmax,27.8,2010-01-30
639,MX17004,tmin,14.5,2010-01-30
674,MX17004,tmax,25.4,2010-08-31


In [28]:
temp_data = temp_data.pivot_table(index = ["id", "date"], columns = ["element"], values = ["value"])

In [29]:
temp_data.reset_index(inplace=True)

In [30]:
temp_data.columns.name = ""

In [31]:
temp_data

Unnamed: 0_level_0,id,date,value,value
element,Unnamed: 1_level_1,Unnamed: 2_level_1,tmax,tmin
0,MX17004,2010-01-30,27.8,14.5
1,MX17004,2010-02-02,27.3,14.4
2,MX17004,2010-02-03,24.1,14.4
3,MX17004,2010-02-11,29.7,13.4
4,MX17004,2010-02-23,29.9,10.7
5,MX17004,2010-03-05,32.1,14.2
6,MX17004,2010-03-10,34.5,16.8
7,MX17004,2010-03-16,31.1,17.6
8,MX17004,2010-04-27,36.3,16.7
9,MX17004,2010-05-27,33.2,18.2


In [32]:
billboard = pd.read_csv("data/billboard.csv")

In [33]:
billboard

Unnamed: 0,year,artist,track,time,date.entered,wk1,wk2,wk3,wk4,wk5,...,wk67,wk68,wk69,wk70,wk71,wk72,wk73,wk74,wk75,wk76
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,87,82.0,72.0,77.0,87.0,...,,,,,,,,,,
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,91,87.0,92.0,,,...,,,,,,,,,,
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,81,70.0,68.0,67.0,66.0,...,,,,,,,,,,
3,2000,3 Doors Down,Loser,4:24,2000-10-21,76,76.0,72.0,69.0,67.0,...,,,,,,,,,,
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,57,34.0,25.0,17.0,17.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,2000,Yankee Grey,Another Nine Minutes,3:10,2000-04-29,86,83.0,77.0,74.0,83.0,...,,,,,,,,,,
313,2000,"Yearwood, Trisha",Real Live Woman,3:55,2000-04-01,85,83.0,83.0,82.0,81.0,...,,,,,,,,,,
314,2000,Ying Yang Twins,Whistle While You Tw...,4:19,2000-03-18,95,94.0,91.0,85.0,84.0,...,,,,,,,,,,
315,2000,Zombie Nation,Kernkraft 400,3:30,2000-09-02,99,99.0,,,,...,,,,,,,,,,


In [34]:
billboard_tidy = pd.melt(billboard, id_vars=["year", "artist", "track", "time", "date.entered"],var_name = "week", value_name= "position")

In [35]:
billboard_tidy

Unnamed: 0,year,artist,track,time,date.entered,week,position
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,wk1,91.0
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,wk1,81.0
3,2000,3 Doors Down,Loser,4:24,2000-10-21,wk1,76.0
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,wk1,57.0
...,...,...,...,...,...,...,...
24087,2000,Yankee Grey,Another Nine Minutes,3:10,2000-04-29,wk76,
24088,2000,"Yearwood, Trisha",Real Live Woman,3:55,2000-04-01,wk76,
24089,2000,Ying Yang Twins,Whistle While You Tw...,4:19,2000-03-18,wk76,
24090,2000,Zombie Nation,Kernkraft 400,3:30,2000-09-02,wk76,


In [36]:
billboard_tidy["week"] = billboard_tidy["week"].str.slice(2).astype(np.int64)

In [37]:
billboard_tidy = billboard_tidy.dropna()

In [38]:
billboard_tidy["date.entered"] = billboard_tidy["date.entered"].astype('datetime64[ns]')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  billboard_tidy["date.entered"] = billboard_tidy["date.entered"].astype('datetime64[ns]')


In [39]:
billboard_tidy["position_date"] = billboard_tidy["date.entered"] + pd.to_timedelta((billboard_tidy["week"]-1)*7, unit='d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  billboard_tidy["position_date"] = billboard_tidy["date.entered"] + pd.to_timedelta((billboard_tidy["week"]-1)*7, unit='d')


In [40]:
billboard_tidy

Unnamed: 0,year,artist,track,time,date.entered,week,position,position_date
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,1,87.0,2000-02-26
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,1,91.0,2000-09-02
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,1,81.0,2000-04-08
3,2000,3 Doors Down,Loser,4:24,2000-10-21,1,76.0,2000-10-21
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,1,57.0,2000-04-15
...,...,...,...,...,...,...,...,...
19716,2000,Creed,Higher,5:16,1999-09-11,63,50.0,2000-11-18
19833,2000,Lonestar,Amazed,4:25,1999-06-05,63,45.0,2000-08-12
20033,2000,Creed,Higher,5:16,1999-09-11,64,50.0,2000-11-25
20150,2000,Lonestar,Amazed,4:25,1999-06-05,64,50.0,2000-08-19


In [41]:
billboard_tidy[billboard_tidy["track"] == "Higher"]

Unnamed: 0,year,artist,track,time,date.entered,week,position,position_date
62,2000,Creed,Higher,5:16,1999-09-11,1,81.0,1999-09-11
379,2000,Creed,Higher,5:16,1999-09-11,2,77.0,1999-09-18
696,2000,Creed,Higher,5:16,1999-09-11,3,73.0,1999-09-25
1013,2000,Creed,Higher,5:16,1999-09-11,4,63.0,1999-10-02
1330,2000,Creed,Higher,5:16,1999-09-11,5,61.0,1999-10-09
1647,2000,Creed,Higher,5:16,1999-09-11,6,58.0,1999-10-16
1964,2000,Creed,Higher,5:16,1999-09-11,7,56.0,1999-10-23
2281,2000,Creed,Higher,5:16,1999-09-11,8,52.0,1999-10-30
2598,2000,Creed,Higher,5:16,1999-09-11,9,56.0,1999-11-06
2915,2000,Creed,Higher,5:16,1999-09-11,10,57.0,1999-11-13


In [42]:
billboard_tidy2 = pd.DataFrame(billboard_tidy, columns=["year", "artist", "track", "time", "date.entered"])

In [43]:
billboard_tidy2 = billboard_tidy2.drop_duplicates()

In [44]:
index_data = range(0, 317)
billboard_tidy2["index"] = pd.Index(index_data)

In [45]:
billboard_tidy2

Unnamed: 0,year,artist,track,time,date.entered,index
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,0
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,1
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,2
3,2000,3 Doors Down,Loser,4:24,2000-10-21,3
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,4
...,...,...,...,...,...,...
312,2000,Yankee Grey,Another Nine Minutes,3:10,2000-04-29,312
313,2000,"Yearwood, Trisha",Real Live Woman,3:55,2000-04-01,313
314,2000,Ying Yang Twins,Whistle While You Tw...,4:19,2000-03-18,314
315,2000,Zombie Nation,Kernkraft 400,3:30,2000-09-02,315


In [46]:
billboard_tidy_merged = billboard_tidy2.merge(billboard_tidy, left_on='track', right_on='track',
          suffixes=('_left', '_right'))

In [47]:
billboard_tidy_merged = billboard_tidy_merged.drop(["year_left", "artist_left", "track", "time_left", "date.entered_left", "year_right", "artist_right", "time_right", "date.entered_right"], axis = 1)

In [48]:
billboard_tidy_merged["song_num"] = billboard_tidy_merged["index"]
billboard_tidy_merged.drop(["index"], axis=1)

Unnamed: 0,week,position,position_date,song_num
0,1,87.0,2000-02-26,0
1,2,82.0,2000-03-04,0
2,3,72.0,2000-03-11,0
3,4,77.0,2000-03-18,0
4,5,87.0,2000-03-25,0
...,...,...,...,...
5327,35,33.0,2000-12-23,316
5328,36,37.0,2000-12-30,316
5329,37,38.0,2001-01-06,316
5330,38,38.0,2001-01-13,316


In [49]:
billboard_tidy.shape

(5307, 8)

In [50]:
billboard_tidy.dtypes

year                      int64
artist                   object
track                    object
time                     object
date.entered     datetime64[ns]
week                      int64
position                float64
position_date    datetime64[ns]
dtype: object

In [51]:
billboard_tidy.head(10)

Unnamed: 0,year,artist,track,time,date.entered,week,position,position_date
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,1,87.0,2000-02-26
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,1,91.0,2000-09-02
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,1,81.0,2000-04-08
3,2000,3 Doors Down,Loser,4:24,2000-10-21,1,76.0,2000-10-21
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,1,57.0,2000-04-15
5,2000,98^0,Give Me Just One Nig...,3:24,2000-08-19,1,51.0,2000-08-19
6,2000,A*Teens,Dancing Queen,3:44,2000-07-08,1,97.0,2000-07-08
7,2000,Aaliyah,I Don't Wanna,4:15,2000-01-29,1,84.0,2000-01-29
8,2000,Aaliyah,Try Again,4:03,2000-03-18,1,59.0,2000-03-18
9,2000,"Adams, Yolanda",Open My Heart,5:30,2000-08-26,1,76.0,2000-08-26


In [52]:
billboard_tidy.tail(5)

Unnamed: 0,year,artist,track,time,date.entered,week,position,position_date
19716,2000,Creed,Higher,5:16,1999-09-11,63,50.0,2000-11-18
19833,2000,Lonestar,Amazed,4:25,1999-06-05,63,45.0,2000-08-12
20033,2000,Creed,Higher,5:16,1999-09-11,64,50.0,2000-11-25
20150,2000,Lonestar,Amazed,4:25,1999-06-05,64,50.0,2000-08-19
20350,2000,Creed,Higher,5:16,1999-09-11,65,49.0,2000-12-02


In [53]:
billboard_tidy.sample(10)

Unnamed: 0,year,artist,track,time,date.entered,week,position,position_date
3304,2000,"Iglesias, Enrique",Rhythm Divine,7:35,1999-12-04,11,49.0,2000-02-12
5450,2000,Counting Crows,Hanginaround,4:07,1999-11-06,18,59.0,2000-03-04
10206,2000,Creed,Higher,5:16,1999-09-11,33,25.0,2000-04-22
5583,2000,"Martin, Ricky",She Bangs,4:02,2000-10-07,18,100.0,2001-02-03
329,2000,"Aguilera, Christina",I Turn To You,4:00,2000-04-15,2,39.0,2000-04-22
2939,2000,Drama,"Left, Right, Left",3:37,2000-02-12,10,96.0,2000-04-15
2822,2000,Third Eye Blind,Never Let You Go,3:57,2000-01-22,9,19.0,2000-03-18
5173,2000,"Fabian, Lara",I Will Love Again,3:43,2000-06-10,17,96.0,2000-09-30
586,2000,Sonique,It Feels So Good,3:48,2000-01-22,2,52.0,2000-01-29
2734,2000,"McBride, Martina",There You Are,3:26,2000-09-09,9,60.0,2000-11-04


In [54]:
temp_data.value.tmax.nsmallest(5)

2     24.1
20    25.4
28    26.3
17    26.4
21    27.0
Name: tmax, dtype: float64

In [55]:
temp_data.value.tmin.nsmallest(5)

28     7.9
24    10.5
32    10.5
4     10.7
27    12.0
Name: tmin, dtype: float64

In [56]:
billboard_tidy.loc[1:20, ["track", "artist", "week"]]

Unnamed: 0,track,artist,week
1,The Hardest Part Of ...,2Ge+her,1
2,Kryptonite,3 Doors Down,1
3,Loser,3 Doors Down,1
4,Wobble Wobble,504 Boyz,1
5,Give Me Just One Nig...,98^0,1
6,Dancing Queen,A*Teens,1
7,I Don't Wanna,Aaliyah,1
8,Try Again,Aaliyah,1
9,Open My Heart,"Adams, Yolanda",1
10,More,"Adkins, Trace",1


In [57]:
billboard_tidy[billboard_tidy.week == 1]

Unnamed: 0,year,artist,track,time,date.entered,week,position,position_date
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,1,87.0,2000-02-26
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,1,91.0,2000-09-02
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,1,81.0,2000-04-08
3,2000,3 Doors Down,Loser,4:24,2000-10-21,1,76.0,2000-10-21
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,1,57.0,2000-04-15
...,...,...,...,...,...,...,...,...
312,2000,Yankee Grey,Another Nine Minutes,3:10,2000-04-29,1,86.0,2000-04-29
313,2000,"Yearwood, Trisha",Real Live Woman,3:55,2000-04-01,1,85.0,2000-04-01
314,2000,Ying Yang Twins,Whistle While You Tw...,4:19,2000-03-18,1,95.0,2000-03-18
315,2000,Zombie Nation,Kernkraft 400,3:30,2000-09-02,1,99.0,2000-09-02


In [58]:
billboard_tidy[billboard_tidy.position <= 20].sort_values("position")

Unnamed: 0,year,artist,track,time,date.entered,week,position,position_date
12859,2000,Lonestar,Amazed,4:25,1999-06-05,41,1.0,2000-03-11
2410,2000,Madonna,Music,3:45,2000-08-12,8,1.0,2000-09-30
3561,2000,Destiny's Child,Independent Women Pa...,3:38,2000-09-23,12,1.0,2000-12-09
3815,2000,"Aguilera, Christina",Come On Over Baby (A...,3:38,2000-08-05,13,1.0,2000-10-28
4023,2000,N'Sync,It's Gonna Be Me,3:10,2000-05-06,13,1.0,2000-07-29
...,...,...,...,...,...,...,...,...
2166,2000,Sisqo,Thong Song,4:05,2000-01-29,7,20.0,2000-03-11
4745,2000,Westlife,Swear It Again,4:07,2000-04-01,15,20.0,2000-07-08
2389,2000,Lil Bow Wow,Bounce With Me,3:22,2000-08-19,8,20.0,2000-10-07
9192,2000,matchbox twenty,Bent,4:12,2000-04-29,29,20.0,2000-11-11


In [59]:
selection_criteria = (billboard_tidy.position <= 20) & (billboard_tidy.week <= 5) & (billboard_tidy.artist == "Pink")

In [60]:
billboard_tidy[selection_criteria][["week", "track"]]

Unnamed: 0,week,track
551,2,There U Go
868,3,There U Go
1185,4,There U Go
1502,5,There U Go


In [61]:
billboard_tidy[selection_criteria]["week"]

551     2
868     3
1185    4
1502    5
Name: week, dtype: int64

In [62]:
billboard_tidy.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,5307.0,2000.0,0.0,2000.0,2000.0,2000.0,2000.0,2000.0
week,5307.0,11.470699,9.0013,1.0,5.0,10.0,16.0,65.0
position,5307.0,51.052384,28.966198,1.0,26.0,51.0,76.0,100.0


In [63]:
pew_tidy.describe()

Unnamed: 0,count
count,180.0
mean,197.533333
std,326.548693
min,1.0
25%,18.0
50%,46.0
75%,218.5
max,1529.0


In [64]:
weather_data = pd.read_csv("https://raw.githubusercontent.com/synesthesiam/blog/master/posts/data/weather_year.csv")

In [65]:
weather_data

Unnamed: 0,EDT,Max TemperatureF,Mean TemperatureF,Min TemperatureF,Max Dew PointF,MeanDew PointF,Min DewpointF,Max Humidity,Mean Humidity,Min Humidity,...,Max VisibilityMiles,Mean VisibilityMiles,Min VisibilityMiles,Max Wind SpeedMPH,Mean Wind SpeedMPH,Max Gust SpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
0,2012-3-10,56,40,24,24,20,16,74,50,26,...,10,10,10,13,6,17.0,0.00,0,,138
1,2012-3-11,67,49,30,43,31,24,78,53,28,...,10,10,10,22,7,32.0,T,1,Rain,163
2,2012-3-12,71,62,53,59,55,43,90,76,61,...,10,10,6,24,14,36.0,0.03,6,Rain,190
3,2012-3-13,76,63,50,57,53,47,93,66,38,...,10,10,4,16,5,24.0,0.00,0,,242
4,2012-3-14,80,62,44,58,52,43,93,68,42,...,10,10,10,16,6,22.0,0.00,0,,202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2013-3-6,32,31,29,27,26,25,92,85,78,...,10,5,1,22,9,33.0,0.04,8,Snow,314
362,2013-3-7,36,32,28,27,25,22,85,71,56,...,10,9,6,8,2,12.0,0.00,8,,342
363,2013-3-8,47,35,23,27,24,21,88,65,42,...,10,9,5,12,3,14.0,0.00,1,,90
364,2013-3-9,56,45,33,32,29,23,75,57,38,...,10,10,10,16,10,24.0,T,2,,141


In [66]:
weather_data.dtypes

EDT                            object
Max TemperatureF                int64
Mean TemperatureF               int64
Min TemperatureF                int64
Max Dew PointF                  int64
MeanDew PointF                  int64
Min DewpointF                   int64
Max Humidity                    int64
 Mean Humidity                  int64
 Min Humidity                   int64
 Max Sea Level PressureIn     float64
 Mean Sea Level PressureIn    float64
 Min Sea Level PressureIn     float64
 Max VisibilityMiles            int64
 Mean VisibilityMiles           int64
 Min VisibilityMiles            int64
 Max Wind SpeedMPH              int64
 Mean Wind SpeedMPH             int64
 Max Gust SpeedMPH            float64
PrecipitationIn                object
 CloudCover                     int64
 Events                        object
 WindDirDegrees                 int64
dtype: object

In [67]:
weather_data.columns = ["date", "max_temp", "mean_temp", "min_temp", "max_dew","mean_dew", "min_dew", "max_humidity", "mean_humidity",
    "min_humidity", "max_pressure", "mean_pressure",
    "min_pressure", "max_visibilty", "mean_visibility",
    "min_visibility", "max_wind", "mean_wind", "min_wind",
    "precipitation", "cloud_cover", "events", "wind_dir"]


In [68]:
weather_data

Unnamed: 0,date,max_temp,mean_temp,min_temp,max_dew,mean_dew,min_dew,max_humidity,mean_humidity,min_humidity,...,max_visibilty,mean_visibility,min_visibility,max_wind,mean_wind,min_wind,precipitation,cloud_cover,events,wind_dir
0,2012-3-10,56,40,24,24,20,16,74,50,26,...,10,10,10,13,6,17.0,0.00,0,,138
1,2012-3-11,67,49,30,43,31,24,78,53,28,...,10,10,10,22,7,32.0,T,1,Rain,163
2,2012-3-12,71,62,53,59,55,43,90,76,61,...,10,10,6,24,14,36.0,0.03,6,Rain,190
3,2012-3-13,76,63,50,57,53,47,93,66,38,...,10,10,4,16,5,24.0,0.00,0,,242
4,2012-3-14,80,62,44,58,52,43,93,68,42,...,10,10,10,16,6,22.0,0.00,0,,202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2013-3-6,32,31,29,27,26,25,92,85,78,...,10,5,1,22,9,33.0,0.04,8,Snow,314
362,2013-3-7,36,32,28,27,25,22,85,71,56,...,10,9,6,8,2,12.0,0.00,8,,342
363,2013-3-8,47,35,23,27,24,21,88,65,42,...,10,9,5,12,3,14.0,0.00,1,,90
364,2013-3-9,56,45,33,32,29,23,75,57,38,...,10,10,10,16,10,24.0,T,2,,141


In [69]:
weather_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
max_temp,366.0,66.803279,20.361247,16.0,51.0,69.0,84.0,106.0
mean_temp,366.0,55.68306,18.436506,11.0,41.0,59.0,70.75,89.0
min_temp,366.0,44.101093,17.301141,1.0,30.0,47.0,57.75,77.0
max_dew,366.0,49.54918,16.397178,0.0,36.0,54.5,62.0,77.0
mean_dew,366.0,44.057377,16.829996,-3.0,30.0,48.0,57.0,72.0
min_dew,366.0,37.980874,17.479449,-5.0,24.0,41.0,51.0,71.0
max_humidity,366.0,90.027322,9.108438,54.0,85.0,93.0,96.0,100.0
mean_humidity,366.0,67.860656,9.945591,37.0,61.25,68.0,74.0,95.0
min_humidity,366.0,45.193989,15.360261,15.0,35.0,42.0,54.0,90.0
max_pressure,366.0,30.108907,0.172189,29.64,29.99,30.1,30.21,30.6


In [70]:
weather_data.min_humidity.min(), weather_data.min_humidity.max()

(15, 90)

In [71]:
(weather_data.min_humidity <= weather_data.mean_humidity) & (weather_data.mean_humidity <= weather_data.max_humidity)

0      True
1      True
2      True
3      True
4      True
       ... 
361    True
362    True
363    True
364    True
365    True
Length: 366, dtype: bool

In [72]:
(~ (weather_data.min_humidity <= weather_data.mean_humidity) & (weather_data.mean_humidity <= weather_data.max_humidity)).any()

False

In [73]:
weather_data.precipitation.unique()

array(['0.00', 'T', '0.03', '0.04', '0.14', '0.86', '0.06', '0.01',
       '0.51', '0.69', '1.45', '0.38', '0.19', '0.15', '0.49', '0.29',
       '0.09', '0.90', '0.02', '0.07', '0.13', '0.10', '0.36', '0.27',
       '0.16', '0.26', '0.31', '0.05', '0.32', '1.85', '0.53', '2.00',
       '0.92', '1.10', '0.17', '1.13', '0.63', '0.50', '0.71', '0.73',
       '1.52', '0.47', '0.39', '0.18', '0.77', '0.08', '0.33', '0.44',
       '0.48', '0.20', '0.12', '0.82', '1.16', '1.73', '0.40', '0.99',
       '0.30', '1.17'], dtype=object)

In [74]:
weather_data.precipitation = weather_data.precipitation.replace({"T": 1e-7}).astype(np.float64)

In [75]:
weather_data.describe()

Unnamed: 0,max_temp,mean_temp,min_temp,max_dew,mean_dew,min_dew,max_humidity,mean_humidity,min_humidity,max_pressure,...,min_pressure,max_visibilty,mean_visibility,min_visibility,max_wind,mean_wind,min_wind,precipitation,cloud_cover,wind_dir
count,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,...,366.0,366.0,366.0,366.0,366.0,366.0,365.0,366.0,366.0,366.0
mean,66.803279,55.68306,44.101093,49.54918,44.057377,37.980874,90.027322,67.860656,45.193989,30.108907,...,29.936831,9.994536,8.73224,5.797814,16.418033,6.057377,22.764384,0.096885,2.885246,189.704918
std,20.361247,18.436506,17.301141,16.397178,16.829996,17.479449,9.108438,9.945591,15.360261,0.172189,...,0.182476,0.073821,1.875406,3.792219,5.564329,3.20094,8.131092,0.27593,2.707261,94.04508
min,16.0,11.0,1.0,0.0,-3.0,-5.0,54.0,37.0,15.0,29.64,...,29.23,9.0,2.0,0.0,6.0,0.0,7.0,0.0,0.0,1.0
25%,51.0,41.0,30.0,36.0,30.0,24.0,85.0,61.25,35.0,29.99,...,29.83,10.0,8.0,2.0,13.0,4.0,17.0,0.0,0.0,131.0
50%,69.0,59.0,47.0,54.5,48.0,41.0,93.0,68.0,42.0,30.1,...,29.94,10.0,10.0,6.0,16.0,6.0,22.0,0.0,2.0,192.5
75%,84.0,70.75,57.75,62.0,57.0,51.0,96.0,74.0,54.0,30.21,...,30.04,10.0,10.0,10.0,20.0,8.0,26.0,0.0275,5.0,259.75
max,106.0,89.0,77.0,77.0,72.0,71.0,100.0,95.0,90.0,30.6,...,30.44,10.0,10.0,10.0,39.0,19.0,63.0,2.0,8.0,360.0


In [76]:
weather_data.wind_dir.min(), weather_data.wind_dir.max()

(1, 360)

In [77]:
weather_data.date.str.split("-")

0      [2012, 3, 10]
1      [2012, 3, 11]
2      [2012, 3, 12]
3      [2012, 3, 13]
4      [2012, 3, 14]
           ...      
361     [2013, 3, 6]
362     [2013, 3, 7]
363     [2013, 3, 8]
364     [2013, 3, 9]
365    [2013, 3, 10]
Name: date, Length: 366, dtype: object

In [78]:
def string_to_date(date_string):
    return datetime.strptime(date_string, "%Y-%m-%d")

In [79]:
string_to_date("2021-07-02")

datetime.datetime(2021, 7, 2, 0, 0)

In [80]:
weather_data.date = weather_data.date.apply(string_to_date)

In [84]:
weather_data = weather_data.set_index("date")

In [85]:
weather_data.index

DatetimeIndex(['2012-03-10', '2012-03-11', '2012-03-12', '2012-03-13',
               '2012-03-14', '2012-03-15', '2012-03-16', '2012-03-17',
               '2012-03-18', '2012-03-19',
               ...
               '2013-03-01', '2013-03-02', '2013-03-03', '2013-03-04',
               '2013-03-05', '2013-03-06', '2013-03-07', '2013-03-08',
               '2013-03-09', '2013-03-10'],
              dtype='datetime64[ns]', name='date', length=366, freq=None)

In [86]:
weather_data.loc[datetime(2012, 3, 10)]

max_temp              56
mean_temp             40
min_temp              24
max_dew               24
mean_dew              20
min_dew               16
max_humidity          74
mean_humidity         50
min_humidity          26
max_pressure       30.53
mean_pressure      30.45
min_pressure       30.34
max_visibilty         10
mean_visibility       10
min_visibility        10
max_wind              13
mean_wind              6
min_wind            17.0
precipitation        0.0
cloud_cover            0
events               NaN
wind_dir             138
Name: 2012-03-10 00:00:00, dtype: object

In [89]:
weather_data.events.count() / len(weather_data)

0.4426229508196721

In [90]:
weather_data.events.unique()

array([nan, 'Rain', 'Rain-Thunderstorm', 'Fog-Thunderstorm', 'Fog-Rain',
       'Thunderstorm', 'Fog-Rain-Thunderstorm', 'Fog', 'Fog-Rain-Snow',
       'Fog-Rain-Snow-Thunderstorm', 'Fog-Snow', 'Snow', 'Rain-Snow'],
      dtype=object)

In [92]:
weather_data.events = pd.Categorical(weather_data.events.fillna(""))

In [94]:
weather_data

Unnamed: 0_level_0,max_temp,mean_temp,min_temp,max_dew,mean_dew,min_dew,max_humidity,mean_humidity,min_humidity,max_pressure,...,max_visibilty,mean_visibility,min_visibility,max_wind,mean_wind,min_wind,precipitation,cloud_cover,events,wind_dir
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-03-10,56,40,24,24,20,16,74,50,26,30.53,...,10,10,10,13,6,17.0,0.000000e+00,0,,138
2012-03-11,67,49,30,43,31,24,78,53,28,30.37,...,10,10,10,22,7,32.0,1.000000e-07,1,Rain,163
2012-03-12,71,62,53,59,55,43,90,76,61,30.13,...,10,10,6,24,14,36.0,3.000000e-02,6,Rain,190
2012-03-13,76,63,50,57,53,47,93,66,38,30.12,...,10,10,4,16,5,24.0,0.000000e+00,0,,242
2012-03-14,80,62,44,58,52,43,93,68,42,30.15,...,10,10,10,16,6,22.0,0.000000e+00,0,,202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-03-06,32,31,29,27,26,25,92,85,78,30.31,...,10,5,1,22,9,33.0,4.000000e-02,8,Snow,314
2013-03-07,36,32,28,27,25,22,85,71,56,30.40,...,10,9,6,8,2,12.0,0.000000e+00,8,,342
2013-03-08,47,35,23,27,24,21,88,65,42,30.48,...,10,9,5,12,3,14.0,0.000000e+00,1,,90
2013-03-09,56,45,33,32,29,23,75,57,38,30.32,...,10,10,10,16,10,24.0,1.000000e-07,2,,141


In [105]:
def farenheit_to_celsius(temp):
    return (temp - 32) * (5/9)

In [106]:
weather_data["max_temp_celsius"] = weather_data.max_temp.apply(farenheit_to_celsius)

In [107]:
weather_data

Unnamed: 0_level_0,max_temp,mean_temp,min_temp,max_dew,mean_dew,min_dew,max_humidity,mean_humidity,min_humidity,max_pressure,...,mean_visibility,min_visibility,max_wind,mean_wind,min_wind,precipitation,cloud_cover,events,wind_dir,max_temp_celsius
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-03-10,56,40,24,24,20,16,74,50,26,30.53,...,10,10,13,6,17.0,0.000000e+00,0,,138,13.333333
2012-03-11,67,49,30,43,31,24,78,53,28,30.37,...,10,10,22,7,32.0,1.000000e-07,1,Rain,163,19.444444
2012-03-12,71,62,53,59,55,43,90,76,61,30.13,...,10,6,24,14,36.0,3.000000e-02,6,Rain,190,21.666667
2012-03-13,76,63,50,57,53,47,93,66,38,30.12,...,10,4,16,5,24.0,0.000000e+00,0,,242,24.444444
2012-03-14,80,62,44,58,52,43,93,68,42,30.15,...,10,10,16,6,22.0,0.000000e+00,0,,202,26.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-03-06,32,31,29,27,26,25,92,85,78,30.31,...,5,1,22,9,33.0,4.000000e-02,8,Snow,314,0.000000
2013-03-07,36,32,28,27,25,22,85,71,56,30.40,...,9,6,8,2,12.0,0.000000e+00,8,,342,2.222222
2013-03-08,47,35,23,27,24,21,88,65,42,30.48,...,9,5,12,3,14.0,0.000000e+00,1,,90,8.333333
2013-03-09,56,45,33,32,29,23,75,57,38,30.32,...,10,10,16,10,24.0,1.000000e-07,2,,141,13.333333


In [108]:
billboard_tidy_without_higher = billboard_tidy.drop(billboard_tidy[billboard_tidy.track == "Higher"].index)

In [109]:
billboard_tidy_without_higher[billboard_tidy_without_higher.artist == "Creed"]

Unnamed: 0,year,artist,track,time,date.entered,week,position,position_date
63,2000,Creed,With Arms Wide Open,3:52,2000-05-13,1,84.0,2000-05-13
380,2000,Creed,With Arms Wide Open,3:52,2000-05-13,2,78.0,2000-05-20
697,2000,Creed,With Arms Wide Open,3:52,2000-05-13,3,76.0,2000-05-27
1014,2000,Creed,With Arms Wide Open,3:52,2000-05-13,4,74.0,2000-06-03
1331,2000,Creed,With Arms Wide Open,3:52,2000-05-13,5,70.0,2000-06-10
1648,2000,Creed,With Arms Wide Open,3:52,2000-05-13,6,68.0,2000-06-17
1965,2000,Creed,With Arms Wide Open,3:52,2000-05-13,7,74.0,2000-06-24
2282,2000,Creed,With Arms Wide Open,3:52,2000-05-13,8,75.0,2000-07-01
2599,2000,Creed,With Arms Wide Open,3:52,2000-05-13,9,69.0,2000-07-08
2916,2000,Creed,With Arms Wide Open,3:52,2000-05-13,10,74.0,2000-07-15
