## Introduction to social data science
#### Pandas
    Series：
<center>A vector/list with labels for each entry.由data,index赋值</center>
<center>和dict的区别：Series的indices可以相同,且有序</center>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a Series from a plain Python list.
# The mixed element types (int, float, str, bool) give the Series dtype=object.
L = [1, 1.2, 'abc', True]

my_series = pd.Series(data=L)
my_series

0       1
1     1.2
2     abc
3    True
dtype: object

In [3]:
# A Series with a user-defined index: entries come from `data`, labels from `index`.
indices = list('BCA')  # index labels, kept in the order given (not sorted)
num_data = range(3)    # the values 0, 1, 2
my_series2 = pd.Series(num_data, index=indices)  # pair each value with its label
my_series2

B    0
C    1
A    2
dtype: int64

In [4]:
# Convert a Series to a dictionary: index labels become the keys, entries become the values.
my_series.to_dict()

{0: 1, 1: 1.2, 2: 'abc', 3: True}

In [5]:
# Convert a dictionary to a Series: keys become the index labels, values become the entries.
d = dict(yesterday=0, today=1, tomorrow=3)
my_series3 = pd.Series(data=d)  # the Series constructor accepts a dict directly
my_series3

yesterday    0
today        1
tomorrow     3
dtype: int64

In [6]:
# Unlike dictionary keys, index labels may repeat: all three entries share the label 'A'.
s = pd.Series(data=range(3), index=['A'] * 3)
print(s)  # every entry is kept, each listed under the same label

A    0
A    1
A    2
dtype: int64


    Data Frames:
<center>A 2-D table with labels for each row and each column; each column is a Series.由data,index,columns赋值</center>
<center>和Series的区别：DataFrame有多列,所有列共享同一个index</center>

In [7]:
# Create a 2x2 DataFrame from a list of rows; `columns` supplies the column labels.
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df

Unnamed: 0,A,B
0,1,2
1,3,4


In [8]:
# Create a DataFrame from a dictionary of dictionaries:
# the outer keys become the column labels, the inner keys become the row labels.
djan = {'1st': 0, '2nd': 1, '3rd': 3}      # values for January
dfeb = {'1st': -3, '2nd': -1, '3rd': -2}   # values for February
dmar = {'1st': 3, '2nd': 5, '3rd': 4}      # values for March

d = dict(january=djan, february=dfeb, march=dmar)  # one inner dictionary per month
my_df1 = pd.DataFrame(data=d)
my_df1

Unnamed: 0,january,february,march
1st,0,-3,3
2nd,1,-1,5
3rd,3,-2,4


In [9]:
# Convert the data type with astype: each call returns a new Series with the requested dtype.
# Shown in order: the original (int64), the float version, and the str version
# (strings are stored with dtype=object). A blank line separates the three printouts.
print(
    my_series3,
    my_series3.astype(float),
    my_series3.astype(str),
    sep='\n\n',
)

yesterday    0
today        1
tomorrow     3
dtype: int64

yesterday    0.0
today        1.0
tomorrow     3.0
dtype: float64

yesterday    0
today        1
tomorrow     3
dtype: object


In [10]:
# Load seaborn's example 'titanic' dataset and show its first three rows.
# titanic.tail(3) shows the last three rows in the same way.
import seaborn as sns
titanic = sns.load_dataset(name='titanic')
titanic.head(n=3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [11]:
# .loc selects by label: the rows labelled 0, 1, 2 and three named columns.
print(titanic.loc[[0, 1, 2], ['survived', 'age', 'sex']])
# .iloc selects by integer position: rows 10-14 and the first five columns.
print(titanic.iloc[10:15, 0:5])
# A list of column names inside [] selects those columns and returns a DataFrame.
titanic[['survived']].head(n=3)

   survived   age     sex
0         0  22.0    male
1         1  38.0  female
2         1  26.0  female
    survived  pclass     sex   age  sibsp
10         1       3  female   4.0      1
11         1       1  female  58.0      0
12         0       3    male  20.0      0
13         0       3    male  39.0      1
14         0       3  female  14.0      0


Unnamed: 0,survived
0,0
1,1
2,1


In [12]:
# --- Change the index of a DataFrame ---
my_df = pd.DataFrame([[1,2], [3,4], [5,6]], columns = ['a', 'b'], index = ['i', 'ii', 'iii'])
print(my_df)
my_df_a = my_df.set_index('a')  # column 'a' becomes the index; my_df itself is unchanged
print(my_df_a)
print(my_df_a.reset_index())  # default drop=False: the index returns as a regular column 'a'
print()
print(my_df_a.reset_index(drop=True))  # drop=True: the old index is discarded entirely

# --- Change column names ---
my_df.columns = ['A', 'B']  # replace all column labels at once
print(my_df)
# rename() returns a new frame; rebinding the name (instead of inplace=True) keeps the
# step chainable and avoids mutating a frame that earlier output already displayed.
my_df = my_df.rename(columns={'A': 'Aa'})
print(my_df)

# --- Sort data ---
my_loc2 = ['i', 'iii']
my_df.loc[my_loc2, 'Aa'] = 10  # overwrite 'Aa' in rows 'i' and 'iii' so the sort order changes

print(my_df.sort_values(by='Aa', ascending=True))  # order rows by the values in column 'Aa'
print(my_df.sort_index())  # order rows by their index labels

     a  b
i    1  2
ii   3  4
iii  5  6
   b
a   
1  2
3  4
5  6
   a  b
0  1  2
1  3  4
2  5  6

   b
0  2
1  4
2  6
     A  B
i    1  2
ii   3  4
iii  5  6
     Aa  B
i     1  2
ii    3  4
iii   5  6
     Aa  B
ii    3  4
i    10  2
iii  10  6
     Aa  B
i    10  2
ii    3  4
iii  10  6


In [13]:
# Boolean selection: indexing with a boolean Series keeps only the entries where it is True.
# Printed in order, separated by blank lines:
#   1. the full Series,
#   2. only the entries smaller than 3,
#   3. the first three values of a mask combining conditions on two columns with &
#      (this prints the True/False mask itself, not the selected rows).
print(
    my_series3,
    my_series3.loc[my_series3.lt(3)],
    (titanic['sex'].eq('female') & titanic['age'].ge(30)).head(3),
    sep='\n\n',
)

yesterday    0
today        1
tomorrow     3
dtype: int64

yesterday    0
today        1
dtype: int64

0    False
1     True
2    False
dtype: bool


In [14]:
# Categorical data with an explicit order
edu_list = ['BSc Political Science', 'Secondary School'] + ['High School']*2
edu_cats = ['Secondary School', 'High School', 'BSc Political Science']  # listed lowest -> highest

str_ser = pd.Series(edu_list*10**5)  # the 4-entry pattern repeated 10**5 times
# ordered=True makes the categories comparable in the order given by edu_cats
cats = pd.Categorical(str_ser, categories=edu_cats, ordered=True)
cat_ser2 = pd.Series(cats, index=str_ser.index)
print(cat_ser2[:5])
print()

# Quantile-based binning: cut 10**6 standard-normal draws at the 2.5% and 97.5% quantiles,
# which yields three interval categories (lower tail, middle 95%, upper tail).
# A seeded generator makes the bin edges identical on every Restart & Run All;
# the unseeded global np.random state produced different edges on each run.
rng = np.random.default_rng(42)
cat_ser3 = pd.qcut(pd.Series(rng.normal(size = 10**6)), q = [0,0.025, 0.975, 1])
cat_ser3.cat.categories

0    BSc Political Science
1         Secondary School
2              High School
3              High School
4    BSc Political Science
dtype: category
Categories (3, object): ['Secondary School' < 'High School' < 'BSc Political Science']



IntervalIndex([(-5.0600000000000005, -1.957], (-1.957, 1.96], (1.96, 5.155]], dtype='interval[float64, right]')

In [16]:
# Date/time data in pandas
str_ser2 = pd.Series(['20170101', '20170727', '20170803', '20171224'])

# Strings of the form YYYYMMDD are parsed into calendar dates (datetime64).
print(pd.to_datetime(str_ser2), end='\n\n')
# Integers are read as epoch time (nanoseconds since 1970-01-01),
# so the same digits converted to int do NOT come back as dates in 2017.
print(pd.to_datetime(str_ser2.astype(int)), end='\n\n')

# Extract date components from time data with the .dt accessor.
import yfinance as yf
# Download the daily price history for Apple (requires network access).
# - `data_source='yahoo'` was removed: it is a pandas-datareader argument, not a
#   yfinance.download parameter, and is rejected by yfinance versions without **kwargs.
# - period="max" requests the full history explicitly instead of relying on the default.
# - auto_adjust=False keeps the separate 'Adj Close' column selected below.
aapl = yf.download("AAPL", period="max", auto_adjust=False)['Adj Close']
dt_ser2 = pd.Series(aapl.index)  # the trading dates as a datetime Series
dt_ser2.dt.year  # also month, weekday, hour, second

0   2017-01-01
1   2017-07-27
2   2017-08-03
3   2017-12-24
dtype: datetime64[ns]

0   1970-01-01 00:00:00.020170101
1   1970-01-01 00:00:00.020170727
2   1970-01-01 00:00:00.020170803
3   1970-01-01 00:00:00.020171224
dtype: datetime64[ns]

[*********************100%***********************]  1 of 1 completed


0        1980
1        1980
2        1980
3        1980
4        1980
         ... 
10493    2022
10494    2022
10495    2022
10496    2022
10497    2022
Name: Date, Length: 10498, dtype: int64