![](http://pic.yupoo.com/sunnnychan/822001f7/41129d34.jpg)

In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
# Seed Python's built-in `random` module so results drawn from it are
# reproducible. Note: this call does not seed NumPy's random generator.
random.seed(42)

In [2]:
# Read the contents of the CSV file and store them in the variable df
df = pd.read_csv('ab_data.csv')

# Show the first 3 rows
df.head(3)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0


## 数据集总体

In [3]:
# Summary of the index, column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [4]:
# Dimensions of the DataFrame as a (rows, columns) tuple
df.shape

(294478, 5)

In [5]:
# Number of rows in the dataset
len(df)

294478

In [34]:
# Does any column contain a missing value? `.any()` reduces over the rows, so
# the result holds one boolean per column (True if that column has a NaN).
df.isna().any()

user_id         False
timestamp       False
group           False
landing_page    False
converted       False
dtype: bool

## 数据列

In [6]:
# Number of columns in the dataset
len(df.columns)

5

In [7]:
# Demonstrate renaming the columns on a copy, so `df` itself keeps its real
# column names and cells that use `df.user_id`, `df['converted']`, ... do not
# depend on the data being reloaded afterwards.
df_renamed = df.copy()
df_renamed.columns = ['a', 'b', 'c', 'd', 'e']
df_renamed.head(1)

Unnamed: 0,a,b,c,d,e
0,851104,2017-01-21 22:11:48.556739,control,old_page,0


In [8]:
# Reload the data from disk so `df` holds the CSV contents as stored
df = pd.read_csv('ab_data.csv')

In [9]:
# Select a single column by label and preview its first two values
df['user_id'].head(2)

0    851104
1    804228
Name: user_id, dtype: int64

In [33]:
# Count distinct user_id values: value_counts() yields one entry per distinct
# value, so its length is the number of unique users
len(df["user_id"].value_counts())

290584

In [10]:
# Number of distinct values in the user_id column
df['user_id'].nunique()

290584

In [11]:
# Mean of the converted column, where 1 means the conversion succeeded and
# 0 means it failed.
# The mean computed here is therefore the conversion rate.
df['converted'].mean()

0.11965919355605512

In [12]:
# Conditional selection: control-group rows that were served the new page
is_control = df['group'] == 'control'
got_new_page = df['landing_page'] == 'new_page'
df[is_control & got_new_page]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
22,767017,2017-01-12 22:58:14.991443,control,new_page,0
240,733976,2017-01-11 15:11:16.407599,control,new_page,0
490,808613,2017-01-10 21:44:01.292755,control,new_page,0
846,637639,2017-01-11 23:09:52.682329,control,new_page,1
850,793580,2017-01-08 03:25:33.723712,control,new_page,1
...,...,...,...,...,...
293894,741581,2017-01-09 20:49:03.391764,control,new_page,0
293996,942612,2017-01-08 13:52:28.182648,control,new_page,0
294200,928506,2017-01-13 21:32:10.491309,control,new_page,0
294253,886135,2017-01-06 12:49:20.509403,control,new_page,0


## 数据集操作

In [13]:
# Start df2 from the control-group rows that were served the old page
df2 = df.loc[(df['group'] == 'control') & (df['landing_page'] == 'old_page')]

In [14]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement. Both halves are selected from `df` directly,
# so re-running this cell rebuilds df2 instead of appending the treatment
# rows a second time.
df2 = pd.concat([
    df.query("group == 'control' and landing_page == 'old_page'"),
    df.query("group == 'treatment' and landing_page == 'new_page'"),
])

In [15]:
# Row count of the combined dataset
len(df2)

290585

In [16]:
# Compare every value of the group column with 'treatment'; the result is a
# boolean Series telling, per row, whether they are equal
df2['group'].eq('treatment')

0         False
1         False
4         False
5         False
7         False
          ...  
294462     True
294465     True
294468     True
294472     True
294477     True
Name: group, Length: 290585, dtype: bool

In [17]:
# Element-wise check of the landing_page column against 'new_page'
df2['landing_page'].eq('new_page')

0         False
1         False
4         False
5         False
7         False
          ...  
294462     True
294465     True
294468     True
294472     True
294477     True
Name: landing_page, Length: 290585, dtype: bool

In [18]:
# Row by row, check that "group is treatment" agrees with "page is new_page"
in_treatment = df2['group'].eq('treatment')
on_new_page = df2['landing_page'].eq('new_page')
in_treatment.eq(on_new_page)

0         True
1         True
4         True
5         True
7         True
          ... 
294462    True
294465    True
294468    True
294472    True
294477    True
Length: 290585, dtype: bool

In [19]:
# Build one boolean Series per condition (group equals 'treatment',
# landing_page equals 'new_page') and compare them row by row: the result is
# True where the two conditions agree.
# Negating that mask with ~ selects the mismatched rows; count them.
group_page_match = (df2['group'] == 'treatment') == (df2['landing_page'] == 'new_page')
df2[~group_page_match].shape[0]

0

In [20]:
# Count the rows where group and landing_page agree. A boolean Series can be
# used as a row mask directly, so no comparison with True is needed.
df2[(df2['group'] == 'treatment') == (df2['landing_page'] == 'new_page')].shape[0]

290585

In [21]:
# As shown earlier, df2 has 290585 rows; it has only 290584 distinct user_id
# values, which means one user_id appears on two rows
df2['user_id'].nunique()

290584

In [22]:
# Rows whose user_id already appeared on an earlier row (i.e. every occurrence
# after the first one)
df2[df2.duplicated(subset='user_id')]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [23]:
# Number of rows left when only the first occurrence of each user_id is kept
len(df2.drop_duplicates(subset='user_id'))

290584

In [24]:
# All rows recorded for the user_id that occurs twice
df2.loc[df2['user_id'].eq(773192)]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
1899,773192,2017-01-09 05:37:58.781806,treatment,new_page,0
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [25]:
# Drop the earlier row of any repeated user_id, keeping its last occurrence.
# For this data that removes index 1899 (user 773192), but unlike drop(1899)
# it does not hard-code a row label and does not raise KeyError when the cell
# is run a second time.
df2 = df2.drop_duplicates(subset='user_id', keep='last')

In [26]:
# Mean of the converted column over df2
df2['converted'].mean()

0.11959708724499628

## 资料

* [数据和源码](https://github.com/SunnnyChan/python-data-analysis)
