numpy与list运算效率比较

In [1]:
import random
import time
import numpy as np

a = []
for i in range(100000):
    a.append(random.random())

# %time 是 IPython 环境（如 Jupyter Notebook、JupyterLab 或 IPython 终端）中的魔术命令，用于测量代码执行时间。
%time sum1 = sum(a)

b = np.array(a)
%time sum2 = np.sum(b)


CPU times: total: 0 ns
Wall time: 999 μs
CPU times: total: 0 ns
Wall time: 998 μs


In [2]:
import numpy as np
# 1-D array of ten ones; the displayed values are floats (1.).
np.ones(10)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [3]:
import numpy as np
# A sequence as shape gives a multi-dimensional array: 2 rows x 3 columns of ones.
np.ones([2,3])

array([[1., 1., 1.],
       [1., 1., 1.]])

In [4]:
# zeros_like returns zeros with the same shape as its argument (2x3 here).
np.zeros_like(np.ones([2,3]))

array([[0., 0., 0.],
       [0., 0., 0.]])

In [5]:
# 3x4 grid holding 0..11: [[0,1,2,3],[4,5,6,7],[8,9,10,11]]
arr = np.arange(0, 12).reshape((3, 4))

# Scalar indexing: row index 1, column index 2 -> 6
print(arr[(1, 2)])

# Slicing: rows 0-1 and columns 1-2 -> [[1,2],[5,6]]
print(arr[:2, 1:3])

# Ellipsis (...) takes every remaining axis in full; here: column 0 of all rows -> [0,4,8]
print(arr[..., 0])

6
[[1 2]
 [5 6]]
[0 4 8]


In [6]:
arr = np.arange(12).reshape(3, 4)
arr1 = np.array(arr)  # np.array copies its input by default
arr2 = np.asarray(arr)  # asarray hands back the input itself when it is already an ndarray
print(arr1 is arr)  # False: arr1 is a new array
print(arr2 is arr)  # True: arr2 is the very same object as arr, not a new array
# NOTE(review): on NumPy >= 2.0, copy=False is documented to raise ValueError when a copy
# cannot be avoided; no copy is needed here. Confirm against the installed version.
arr3 = np.array(arr, copy=False)
print(arr3 is arr)  # True: arr3 is the same object as arr (no copy was made)
arr4  = np.array(arr, copy=True)
print(arr4 is arr)  # False: arr4 is a new array
arr1[0,0] = 100
print(arr)  # arr unchanged: arr1 owns an independent copy of the data
arr3[0,0] = 200
print(arr)  # arr changed: arr3 is just another name for arr
arr2[0,0] = 300
print(arr)  # arr changed: arr2 is also just another name for arr

False
True
True
False
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[[200   1   2   3]
 [  4   5   6   7]
 [  8   9  10  11]]
[[300   1   2   3]
 [  4   5   6   7]
 [  8   9  10  11]]


In [7]:
np.linspace(1,10,10)  # 10 evenly spaced values from 1 to 10, both endpoints included

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [8]:
np.logspace(1,10,10)  # 10 values from 10**1 to 10**10; the exponents are evenly spaced, the values are not

array([1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07, 1.e+08,
       1.e+09, 1.e+10])

In [9]:
import numpy as np
# Draw 4 samples from a normal distribution with mean 5 and standard deviation 2.
normal_arr = np.random.normal(5, 2, 4)
print(normal_arr)

[7.53035508 7.26188188 6.02823563 5.39217729]


In [10]:
import numpy as np
# Simulated daily returns: standard-normal draws, 1000 rows (days) x 4 columns (stocks).
stock_change = np.random.normal(loc=0, scale=1, size=(1000, 4))
print(stock_change)

stock_change.shape


[[ 0.27951317  1.1775248  -0.97925103 -1.46710388]
 [ 1.93597296  0.93013333  0.73077659 -1.34388379]
 [ 0.81566968 -0.30442396  0.49020532 -0.57162095]
 ...
 [ 1.11432622 -1.49334243 -1.42595571 -1.27860965]
 [-1.22652999  0.07938364 -1.05109829 -1.59235575]
 [-1.43863099  0.56219392 -0.40865756  1.17767091]]


(1000, 4)

In [11]:
# Random standard-normal data with shape (10, 5). The later cells label the 10 rows
# as stocks and the 5 columns as dates, i.e. 10 stocks x 5 days.
import numpy as np
stock_change = np.random.normal(0,1,(10,5))
print(stock_change.shape)
stock_change

(10, 5)


array([[-1.8246002 ,  0.62899161,  1.07487324, -0.01192552,  0.20987608],
       [ 0.38351704,  0.29591695, -0.56976445,  1.09005264,  1.21142388],
       [ 0.7147009 , -0.65087819, -0.37595255, -0.50689236, -0.95498767],
       [-1.29833764,  0.66706988,  0.53256549,  0.96961158, -0.10316803],
       [-0.75409544, -1.67583444, -0.46876336, -0.06135842, -0.13050896],
       [-0.24797994, -0.75146285,  1.41236716,  0.40805255, -0.08840968],
       [-0.79596281,  0.09911132,  0.51383952,  1.97799854, -0.96143812],
       [ 0.92861858,  0.31022276,  0.99085975,  0.93889136, -1.16328187],
       [-0.17034281,  1.00269862, -1.10500801,  0.76981874,  0.24684219],
       [-0.38045481,  0.11265165,  1.87943552,  0.65617892,  1.09700323]])

In [12]:
# pandas: wrap the array in a DataFrame. With no labels given, rows and columns
# are numbered from 0 (see the rendered table).
import pandas as pd
pd.DataFrame(stock_change)

Unnamed: 0,0,1,2,3,4
0,-1.8246,0.628992,1.074873,-0.011926,0.209876
1,0.383517,0.295917,-0.569764,1.090053,1.211424
2,0.714701,-0.650878,-0.375953,-0.506892,-0.954988
3,-1.298338,0.66707,0.532565,0.969612,-0.103168
4,-0.754095,-1.675834,-0.468763,-0.061358,-0.130509
5,-0.24798,-0.751463,1.412367,0.408053,-0.08841
6,-0.795963,0.099111,0.51384,1.977999,-0.961438
7,0.928619,0.310223,0.99086,0.938891,-1.163282
8,-0.170343,1.002699,-1.105008,0.769819,0.246842
9,-0.380455,0.112652,1.879436,0.656179,1.097003


In [13]:
# Row labels: one per row of stock_change, numbered from 1.
stock_code = [f'股票{i+1}' for i in range(len(stock_change))]
print(stock_code)

# Only the row index is labelled here; the columns keep their default numbering.
pd.DataFrame(data=stock_change, index=stock_code)

['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10']


Unnamed: 0,0,1,2,3,4
股票1,-1.8246,0.628992,1.074873,-0.011926,0.209876
股票2,0.383517,0.295917,-0.569764,1.090053,1.211424
股票3,0.714701,-0.650878,-0.375953,-0.506892,-0.954988
股票4,-1.298338,0.66707,0.532565,0.969612,-0.103168
股票5,-0.754095,-1.675834,-0.468763,-0.061358,-0.130509
股票6,-0.24798,-0.751463,1.412367,0.408053,-0.08841
股票7,-0.795963,0.099111,0.51384,1.977999,-0.961438
股票8,0.928619,0.310223,0.99086,0.938891,-1.163282
股票9,-0.170343,1.002699,-1.105008,0.769819,0.246842
股票10,-0.380455,0.112652,1.879436,0.656179,1.097003


In [14]:
# Column labels: consecutive business days starting 2019-04-03, one per column.
# freq='B' is the business-day frequency, so weekends are skipped
# (the output jumps from 04-05 to 04-08).
date = pd.date_range(
    start='20190403',
    periods=stock_change.shape[1],
    freq='B',
)
date

DatetimeIndex(['2019-04-03', '2019-04-04', '2019-04-05', '2019-04-08',
               '2019-04-09'],
              dtype='datetime64[ns]', freq='B')

In [15]:
# Labelled frame: rows indexed by stock_code, columns by the business-day dates.
stock_c = pd.DataFrame(stock_change,index=stock_code, columns=date)

In [16]:
stock_c.head()   # show the first five rows

Unnamed: 0,2019-04-03,2019-04-04,2019-04-05,2019-04-08,2019-04-09
股票1,-1.8246,0.628992,1.074873,-0.011926,0.209876
股票2,0.383517,0.295917,-0.569764,1.090053,1.211424
股票3,0.714701,-0.650878,-0.375953,-0.506892,-0.954988
股票4,-1.298338,0.66707,0.532565,0.969612,-0.103168
股票5,-0.754095,-1.675834,-0.468763,-0.061358,-0.130509


In [17]:
stock_c.tail()   # show the last five rows

Unnamed: 0,2019-04-03,2019-04-04,2019-04-05,2019-04-08,2019-04-09
股票6,-0.24798,-0.751463,1.412367,0.408053,-0.08841
股票7,-0.795963,0.099111,0.51384,1.977999,-0.961438
股票8,0.928619,0.310223,0.99086,0.938891,-1.163282
股票9,-0.170343,1.002699,-1.105008,0.769819,0.246842
股票10,-0.380455,0.112652,1.879436,0.656179,1.097003


In [18]:
# Plain [] on a DataFrame selects columns, so indexing with a row label first fails,
# and the column-then-row form only works as two chained lookups.
# .loc[row_label, column_label] does the label lookup in a single step, row first.
stock_c.loc['股票6', '2019-04-03']

np.float64(-0.24797993533295545)

In [19]:
# Per-column summary statistics: count, mean, std, min, quartiles, max.
stock_c.describe()

Unnamed: 0,2019-04-03,2019-04-04,2019-04-05,2019-04-08,2019-04-09
count,10.0,10.0,10.0,10.0,10.0
mean,-0.344494,0.003849,0.388445,0.623043,-0.063665
std,0.868317,0.806153,0.978078,0.706406,0.813804
min,-1.8246,-1.675834,-1.105008,-0.506892,-1.163282
25%,-0.785496,-0.463381,-0.445561,0.093069,-0.748868
50%,-0.314217,0.204284,0.523203,0.712999,-0.095789
75%,0.245052,0.549299,1.05387,0.961932,0.237601
max,0.928619,1.002699,1.879436,1.977999,1.211424


In [20]:
# Range (max - min) of the 2019-04-03 column. The double brackets keep a one-column
# DataFrame, and apply() calls the lambda once per column.
stock_c[['2019-04-03']].apply(lambda x: x.max()-x.min())

2019-04-03    2.753219
Freq: B, dtype: float64

In [21]:
# Element-wise boolean mask: True where 0 < value < 0.2 (no row qualifies in the shown output).
stock_c[['2019-04-03']].apply(lambda x: (x > 0) & (x < 0.2))

Unnamed: 0,2019-04-03
股票1,False
股票2,False
股票3,False
股票4,False
股票5,False
股票6,False
股票7,False
股票8,False
股票9,False
股票10,False


In [22]:
# Save the frame as CSV (relative path); index=True also writes the row labels.
stock_c.to_csv("output_index.csv", index=True)

In [23]:
# Small frame with one missing value in A (np.nan) and one in B (None).
df = pd.DataFrame(
    {'A': [1, np.nan, 3], 'B': [4, 5, None], 'C': [7, 8, 9]}
)

# Boolean frame of the same shape: True marks a missing cell.
missing_mask = df.isna()
print(missing_mask)
# Summing the mask column-wise counts the missing values per column.
print(missing_mask.sum())

       A      B      C
0  False  False  False
1   True  False  False
2  False   True  False
A    1
B    1
C    0
dtype: int64
