numpy与list运算效率比较

In [1]:
import random
import time
import numpy as np

# Build a list of 100000 random floats, each drawn by random.random().
a = [random.random() for _ in range(100000)]

# %time is an IPython magic command (available in Jupyter Notebook, JupyterLab and the
# IPython terminal) that measures how long the statement after it takes to execute.
# Here it times the built-in sum() over the Python list.
%time sum1 = sum(a)

# Convert the list to an ndarray before the timed statement, so that only np.sum()
# itself is measured and not the conversion.
b = np.array(a)
%time sum2 = np.sum(b)


CPU times: user 414 μs, sys: 50 μs, total: 464 μs
Wall time: 465 μs
CPU times: user 172 μs, sys: 21 μs, total: 193 μs
Wall time: 196 μs


In [2]:
import numpy as np
# One-dimensional array of ten ones (displayed as floating-point 1. values).
np.ones(10)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [3]:
import numpy as np
# Two-dimensional array of ones with 2 rows and 3 columns; the shape is passed as a list.
np.ones([2,3])

array([[1., 1., 1.],
       [1., 1., 1.]])

In [4]:
# Array of zeros with the same shape as the 2x3 array of ones passed in.
np.zeros_like(np.ones([2,3]))

array([[0., 0., 0.],
       [0., 0., 0.]])

In [5]:
# 3x4 array holding 0..11: [[0,1,2,3],[4,5,6,7],[8,9,10,11]]
arr = np.arange(12).reshape((3, 4))

# Single element: row index 1, column index 2 -> 6
print(arr[1, 2])

# Slice: the first two rows, columns at index 1 and 2 -> [[1,2],[5,6]]
print(arr[:2, 1:3])

# The ellipsis (...) takes every remaining dimension in full;
# here it selects column 0 of all rows -> [0,4,8]
print(arr[..., 0])

6
[[1 2]
 [5 6]]
[0 4 8]


In [6]:
# Compare np.array (copies by default) with np.asarray / copy=False (no copy when the
# input is already an ndarray), first by identity and then by writing through each name.
arr = np.arange(12).reshape(3, 4)
arr1 = np.array(arr)
arr2 = np.asarray(arr)
print(arr1 is arr)  # False: np.array() copies by default, so arr1 is a new array
print(arr2 is arr)  # True: np.asarray() returns the input itself when it is already an ndarray
arr3 = np.array(arr, copy=False)
print(arr3 is arr)  # True: with copy=False no data is copied and arr3 is the same object as arr
arr4  = np.array(arr, copy=True)
print(arr4 is arr)  # False: copy=True always produces a new array
arr1[0,0] = 100
print(arr)  # arr is unchanged, because arr1 is an independent copy
arr3[0,0] = 200
print(arr)  # arr changed, because arr3 and arr are the same object
arr2[0,0] = 300
print(arr)  # arr changed again, because arr2 and arr are the same object

False
True
True
False
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[[200   1   2   3]
 [  4   5   6   7]
 [  8   9  10  11]]
[[300   1   2   3]
 [  4   5   6   7]
 [  8   9  10  11]]


In [7]:
np.linspace(1,10,10)  # 10 evenly spaced values from 1 to 10, both endpoints included

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [8]:
np.logspace(1,10,10)  # 10 values from 10**1 to 10**10; the exponents (1..10) are evenly spaced, not the values

array([1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07, 1.e+08,
       1.e+09, 1.e+10])

In [9]:
import numpy as np
# Draw 4 random numbers from a normal distribution with mean 5 and standard deviation 2.
# No seed is set in this cell, so the printed values will generally differ between runs.
normal_arr = np.random.normal(loc=5, scale=2, size=4)
print(normal_arr)

[0.45390105 3.66739296 5.60306978 4.75425302]


In [10]:
import numpy as np
# Simulated daily returns drawn from a standard normal distribution:
# 1000 rows (days) by 4 columns (stocks).
stock_change = np.random.normal(loc=0, scale=1, size=(1000, 4))
print(stock_change)

stock_change.shape


[[ 0.98699445  0.78578717 -1.9615979  -0.49291191]
 [-0.32875681  0.37422451 -0.07462686 -0.1158063 ]
 [ 0.10098245 -1.07603792  0.75625298 -0.04674358]
 ...
 [-0.30177717  0.19183955 -1.19645464  0.18865996]
 [-0.38493619  1.05539145 -1.36810386 -1.35941174]
 [-1.1322278  -1.61185035 -0.49862508 -0.83821294]]


(1000, 4)

In [11]:
# Random standard-normal data with shape (10, 5). The later cells label the 10 rows
# as stocks and the 5 columns as business days.
import numpy as np
stock_change = np.random.normal(0,1,(10,5))
print(stock_change.shape)
stock_change

(10, 5)


array([[-1.31001961,  1.53725825,  0.38717232, -0.34800767,  0.41232315],
       [ 0.83059097, -1.32471594, -0.65241856,  1.33579272,  1.06508412],
       [ 0.77623719, -1.39260418, -0.4269007 , -0.72825267, -0.17371486],
       [-1.35138094, -0.68450423, -0.23271512, -0.96874666, -0.47812174],
       [-1.49475858, -0.82374337,  1.31945802,  0.89619489, -0.12421549],
       [-0.17831405,  1.11355738, -1.66347312, -0.81974514, -0.04500139],
       [ 1.93821788,  0.03329185, -0.37862492, -0.46802851, -1.91995935],
       [-1.04922488, -0.72157161,  1.16639364,  1.16138387,  0.59382856],
       [ 0.97896743,  1.90259468,  1.37001959, -0.52165996,  0.76743382],
       [ 0.77390893,  2.15090521,  1.11982028,  1.68987902,  0.45456516]])

In [12]:
# pandas: wrap the NumPy array in a DataFrame; rows and columns get default integer labels.
import pandas as pd
pd.DataFrame(stock_change)

Unnamed: 0,0,1,2,3,4
0,-1.31002,1.537258,0.387172,-0.348008,0.412323
1,0.830591,-1.324716,-0.652419,1.335793,1.065084
2,0.776237,-1.392604,-0.426901,-0.728253,-0.173715
3,-1.351381,-0.684504,-0.232715,-0.968747,-0.478122
4,-1.494759,-0.823743,1.319458,0.896195,-0.124215
5,-0.178314,1.113557,-1.663473,-0.819745,-0.045001
6,1.938218,0.033292,-0.378625,-0.468029,-1.919959
7,-1.049225,-0.721572,1.166394,1.161384,0.593829
8,0.978967,1.902595,1.37002,-0.52166,0.767434
9,0.773909,2.150905,1.11982,1.689879,0.454565


In [13]:
# Attach row labels: one "股票N" name per row of stock_change.
# The columns keep their default integer labels in this cell.
stock_code = [f'股票{i+1}' for i in range(len(stock_change))]
print(stock_code)

pd.DataFrame(data=stock_change, index=stock_code)

['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10']


Unnamed: 0,0,1,2,3,4
股票1,-1.31002,1.537258,0.387172,-0.348008,0.412323
股票2,0.830591,-1.324716,-0.652419,1.335793,1.065084
股票3,0.776237,-1.392604,-0.426901,-0.728253,-0.173715
股票4,-1.351381,-0.684504,-0.232715,-0.968747,-0.478122
股票5,-1.494759,-0.823743,1.319458,0.896195,-0.124215
股票6,-0.178314,1.113557,-1.663473,-0.819745,-0.045001
股票7,1.938218,0.033292,-0.378625,-0.468029,-1.919959
股票8,-1.049225,-0.721572,1.166394,1.161384,0.593829
股票9,0.978967,1.902595,1.37002,-0.52166,0.767434
股票10,0.773909,2.150905,1.11982,1.689879,0.454565


In [14]:
# Column labels: let pd.date_range() generate one date per column of stock_change,
# using business-day frequency (freq='B') so that weekends are skipped.
date = pd.date_range(start='20190403',periods=stock_change.shape[1],freq='B')
date

DatetimeIndex(['2019-04-03', '2019-04-04', '2019-04-05', '2019-04-08',
               '2019-04-09'],
              dtype='datetime64[ns]', freq='B')

In [15]:
# Build the labelled frame: stock names as the row index, generated dates as the columns.
stock_c = pd.DataFrame(stock_change,index=stock_code, columns=date)

In [16]:
stock_c.head()   # show the first five rows

Unnamed: 0,2019-04-03,2019-04-04,2019-04-05,2019-04-08,2019-04-09
股票1,-1.31002,1.537258,0.387172,-0.348008,0.412323
股票2,0.830591,-1.324716,-0.652419,1.335793,1.065084
股票3,0.776237,-1.392604,-0.426901,-0.728253,-0.173715
股票4,-1.351381,-0.684504,-0.232715,-0.968747,-0.478122
股票5,-1.494759,-0.823743,1.319458,0.896195,-0.124215


In [17]:
stock_c.tail()   # show the last five rows

Unnamed: 0,2019-04-03,2019-04-04,2019-04-05,2019-04-08,2019-04-09
股票6,-0.178314,1.113557,-1.663473,-0.819745,-0.045001
股票7,1.938218,0.033292,-0.378625,-0.468029,-1.919959
股票8,-1.049225,-0.721572,1.166394,1.161384,0.593829
股票9,0.978967,1.902595,1.37002,-0.52166,0.767434
股票10,0.773909,2.150905,1.11982,1.689879,0.454565


In [18]:
# Plain [] on a DataFrame selects by column label, so row-first chained indexing such as
# stock_c['股票6']['2019-04-03'] does not work. Column-first chaining
# (stock_c['2019-04-03']['股票6']) works but performs two separate lookups.
# .loc takes the row label and the column label together in a single lookup.
stock_c.loc['股票6', '2019-04-03']

np.float64(-0.1783140470059152)

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
stock_c.describe()

Unnamed: 0,2019-04-03,2019-04-04,2019-04-05,2019-04-08,2019-04-09
count,10.0,10.0,10.0,10.0,10.0
mean,-0.008578,0.179047,0.200873,0.122881,0.055222
std,1.225427,1.369994,1.030175,1.02166,0.84226
min,-1.494759,-1.392604,-1.663473,-0.968747,-1.919959
25%,-1.244821,-0.7982,-0.414832,-0.676604,-0.16134
50%,0.297797,-0.325606,0.077229,-0.408018,0.183661
75%,0.817003,1.431333,1.15475,1.095087,0.559013
max,1.938218,2.150905,1.37002,1.689879,1.065084


In [20]:
# Range (max - min) of the 2019-04-03 column. The double brackets select a one-column
# DataFrame, so apply() calls the lambda once per column and returns one value per column.
stock_c[['2019-04-03']].apply(lambda x: x.max()-x.min())

2019-04-03    3.432976
Freq: B, dtype: float64

In [21]:
# Boolean mask for the 2019-04-03 column: True where a value lies strictly between 0 and 0.2.
stock_c[['2019-04-03']].apply(lambda x: (x > 0) & (x < 0.2))

Unnamed: 0,2019-04-03
股票1,False
股票2,False
股票3,False
股票4,False
股票5,False
股票6,False
股票7,False
股票8,False
股票9,False
股票10,False


In [23]:
# Save the frame to a CSV file; index=True writes the row labels as well.
stock_c.to_csv("output_index.csv", index=True)

In [24]:
# Build a small DataFrame in which columns A and B each contain one missing value
# (np.nan in A, None in B) and column C has none.
df = pd.DataFrame(
    {
        'A': [1, np.nan, 3],
        'B': [4, 5, None],
        'C': [7, 8, 9],
    }
)

# Boolean DataFrame marking the missing cells, computed once and reused below.
missing_mask = df.isna()
print(missing_mask)
# Number of missing values in each column.
print(missing_mask.sum())

       A      B      C
0  False  False  False
1   True  False  False
2  False   True  False
A    1
B    1
C    0
dtype: int64
