<a href="https://colab.research.google.com/github/XuRui314/HITSZ_2022_NLP_Project/blob/main/pandas_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

### Series与DataFrame

In [2]:
# List -> Series
courses = ["Chinese", "Math", "English", "Computer Science"]
data = pd.Series(data = courses)
data

0             Chinese
1                Math
2             English
3    Computer Science
dtype: object

In [4]:
# Dictionary -> Series
grades = {"Chinese":80, "Math":90, "English":85, "CS":100}
data = pd.Series(grades)
data

Chinese     80
Math        90
English     85
CS         100
dtype: int64

In [5]:
# Series -> List
numbers = data.tolist()
numbers

[80, 90, 85, 100]

In [6]:
# Series -> DataFrame
df = pd.DataFrame(data, columns = ['grade'])
df

Unnamed: 0,grade
Chinese,80
Math,90
English,85
CS,100


In [7]:
# numpy -> Series 
s = pd.Series(
    np.arange(10, 100, 10), # 10-90
    index = np.arange(101, 110), # 101-109
    dtype = 'float'
)
s

101    10.0
102    20.0
103    30.0
104    40.0
105    50.0
106    60.0
107    70.0
108    80.0
109    90.0
dtype: float64

In [9]:
# Series中data类型转换
s = pd.Series(
    data = ["001","002","003","004"],
    index = list("abcd")
)
s = s.astype(int) # int类型
# s = s.map(int) # int是函数
s

a    1
b    2
c    3
d    4
dtype: int64

In [17]:
# 给Series添加新元素
grades = {"Chinese":80, "Math":90, "English":85, "CS":100}
data = pd.Series(grades)

data = data.append(pd.Series({
    "Physics": 88,
    "Chemistry": 95
}))
data['Biology'] = 99
data

Chinese       80
Math          90
English       85
CS           100
Physics       88
Chemistry     95
Biology       99
dtype: int64

In [13]:
# 用reset_index将Series转换成df
df = data.reset_index()
df.columns = ['course', 'grade']
df

Unnamed: 0,course,grade
0,Chinese,80
1,Math,90
2,English,85
3,CS,100
4,Physics,88
5,Chemistry,95
6,Biology,99


In [14]:
# Dictionary -> DataFrame
s = {'course':data.index, 'grade':data.values}
df2 = pd.DataFrame(s)
df2

Unnamed: 0,course,grade
0,Chinese,80
1,Math,90
2,English,85
3,CS,100
4,Physics,88
5,Chemistry,95
6,Biology,99


In [22]:
# 设置DataFrame索引列
data = {
    "name": ['Job', "Merry", "Jane", "Bob"],
    "sex": ['male', "female", "female", "male"],
    "age": [18, 19, 20, 17]
}
df = pd.DataFrame(data)
df.set_index("name", inplace = True)
df

Unnamed: 0_level_0,sex,age
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Job,male,18
Merry,female,19
Jane,female,20
Bob,male,17


### DataFrame操作
* df.head()
* df.tail()
* df.info()
* df.describe

In [34]:
# df.head(),df.tail()
df = pd.DataFrame(
    data = {
        "norm": np.random.normal(loc=0, scale=1, size=1000),
        "uniform": np.random.uniform(low=0, high=1, size=1000),
        "binomial": np.random.binomial(n=1, p=0.2, size=1000)
        
    }
)
df.head() # df.head(5)

Unnamed: 0,norm,uniform,binomial
0,2.120631,0.065634,0
1,-1.061252,0.571313,0
2,0.686139,0.203889,1
3,0.05983,0.172143,1
4,0.475016,0.768721,0


In [35]:
df.tail()

Unnamed: 0,norm,uniform,binomial
995,-0.398552,0.012913,0
996,-0.592581,0.20742,0
997,-0.520816,0.017961,0
998,-0.907487,0.941732,0
999,0.476244,0.950916,0


In [36]:
df.index = pd.date_range(start = '2022-03-16', periods = 1000)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1000 entries, 2022-03-16 to 2024-12-09
Freq: D
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   norm      1000 non-null   float64
 1   uniform   1000 non-null   float64
 2   binomial  1000 non-null   int64  
dtypes: float64(2), int64(1)
memory usage: 31.2 KB


In [38]:
df.describe()

Unnamed: 0,norm,uniform,binomial
count,1000.0,1000.0,1000.0
mean,0.004982,0.482346,0.204
std,0.987566,0.288278,0.403171
min,-3.061303,0.000588,0.0
25%,-0.630457,0.230589,0.0
50%,0.004836,0.48053,0.0
75%,0.686525,0.734743,0.0
max,3.684347,0.99979,1.0


### DataFrame统计信息

In [39]:
# 统计数据列的值出现次数
df['binomial'].value_counts()

0    796
1    204
Name: binomial, dtype: int64

### DataFrame和csv文件操作

In [43]:
# 写入test.csv文件
df.to_csv("test.csv")

In [45]:
# 加载csv到df
df_read = pd.read_csv("test.csv", index_col = 0)
df_read.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 2022-03-16 to 2024-12-09
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   norm      1000 non-null   float64
 1   uniform   1000 non-null   float64
 2   binomial  1000 non-null   int64  
dtypes: float64(2), int64(1)
memory usage: 31.2+ KB
