In [2]:
import pandas as pd
import numpy as np

## Series

Series представляет собой объект, похожий на одномерный массив; его отличительная черта — наличие индекса (метки) у каждого элемента. При выводе метка индекса отображается слева, а соответствующее значение — справа.

Синтаксис создания:

pandas.Series(data, index, dtype)
data: входные данные — список, константа (скаляр), массив NumPy, словарь (dict) и т. д.
index (опционально): метки индекса; по умолчанию — целые числа 0, 1, 2, …
dtype (опционально): тип данных элементов.

In [3]:
# Build a Series whose elements are addressed by explicit string labels
# instead of the default integer positions; `index` must have one label
# per element of `data`.
a = pd.Series(
    data=[4, 7, 6, 3, 9],
    index=['one', 'two', 'three', 'four', 'five'],
)
a

one      4
two      7
three    6
four     3
five     9
dtype: int64

In [4]:
# Same values, but without an explicit `index`: pandas falls back to the
# default 0-based integer index. This rebinds `a`.
a = pd.Series(data=[4, 7, 6, 3, 9])
a

0    4
1    7
2    6
3    3
4    9
dtype: int64

In [5]:
# Index (labels) of the Series. `a` was built without an explicit `index`,
# so pandas generated a default RangeIndex.
a.index

RangeIndex(start=0, stop=5, step=1)

In [6]:
# Underlying data of the Series as a NumPy array. The pandas documentation
# recommends `to_numpy()` (or `.array`) over the `.values` attribute; for this
# integer Series the returned array is the same.
a.to_numpy()

array([4, 7, 6, 3, 9], dtype=int64)

In [8]:
# Look up the element stored under index label 0. With the default 0-based
# integer index the label coincides with the position, so this is the first
# element.
a[0]

4

In [10]:
# Look up the element stored under index label 3 (the fourth element, given
# the default 0-based integer index).
a[3]

3

## DataFrame

Объект DataFrame — это табличная структура данных, состоящая из строк и столбцов. В разных столбцах можно хранить данные разных типов. Каждый столбец DataFrame является объектом Series, а значения в строках таблицы — его элементами.

Синтаксис создания:

pandas.DataFrame(data, index)
data: входные данные — словарь (dict), двумерный массив NumPy, Series и т. д.
index (опционально): метки строк; по умолчанию — целые числа 0, 1, 2, …

In [12]:
# Build a small table from a mapping "column name -> list of values".
# Every list becomes one column; since no `index` is given, the rows get
# the default 0-based integer labels.
df = pd.DataFrame(
    data={
        'Age': [46, 37, 44, 42, 42],
        'Country': ['Spain', 'Spain', 'Germany', 'Germany', 'France'],
        'Gender': ['Female', 'Female', 'Male', 'Male', 'Male'],
    },
)
df

Unnamed: 0,Age,Country,Gender
0,46,Spain,Female
1,37,Spain,Female
2,44,Germany,Male
3,42,Germany,Male
4,42,France,Male


In [19]:
# Attribute-style column access: returns the 'Country' column as a Series.
# This form only works when the column name is a valid Python identifier
# and does not clash with an existing DataFrame attribute or method.
df.Country

0      Spain
1      Spain
2    Germany
3    Germany
4     France
Name: Country, dtype: object

In [18]:
# Bracket-style column access: returns the same Series as `df.Country`,
# but works for any column name (including names with spaces).
df['Country']

0      Spain
1      Spain
2    Germany
3    Germany
4     France
Name: Country, dtype: object

In [21]:
# Passing a list of column names selects several columns at once and returns
# a DataFrame; the columns come out in the order given in the list.
df[['Country','Age']]

Unnamed: 0,Country,Age
0,Spain,46
1,Spain,37
2,Germany,44
3,Germany,42
4,France,42


In [22]:
# Selecting the two columns with the names listed in the opposite order:
# the resulting DataFrame follows the order of the list, not of `df`.
df[['Age','Country']]

Unnamed: 0,Age,Country
0,46,Spain
1,37,Spain
2,44,Germany
3,42,Germany
4,42,France


In [23]:
# Column labels of the DataFrame, returned as an Index object.
df.columns

Index(['Age', 'Country', 'Gender'], dtype='object')

In [24]:
# Row labels of the DataFrame. No `index` was passed when `df` was created,
# so this is the default RangeIndex.
df.index

RangeIndex(start=0, stop=5, step=1)

In [25]:
# Same table as before, but with explicit row labels supplied through
# `index` (one label per row; labels need not be sorted or start at 0).
df = pd.DataFrame(
    data={
        'Age': [46, 37, 44, 42, 42],
        'Country': ['Spain', 'Spain', 'Germany', 'Germany', 'France'],
        'Gender': ['Female', 'Female', 'Male', 'Male', 'Male'],
    },
    index=[5, 4, 6, 3, 2],
)
df

Unnamed: 0,Age,Country,Gender
5,46,Spain,Female
4,37,Spain,Female
6,44,Germany,Male
3,42,Germany,Male
2,42,France,Male


In [26]:
# Replace the row labels of the existing DataFrame in place by assigning a
# new list to `.index`; the list must contain exactly one label per row
# (5 here), otherwise pandas raises an error.
df.index = [101, 102, 103, 104, 105]
df

Unnamed: 0,Age,Country,Gender
101,46,Spain,Female
102,37,Spain,Female
103,44,Germany,Male
104,42,Germany,Male
105,42,France,Male


## Считывание данных

В целом pandas поддерживает все самые популярные форматы хранения данных: csv, excel, sql, html и многие другие, но чаще всего приходится работать именно с csv-файлами (comma-separated values — значения, разделённые запятыми).

Будем работать с датасетом по оттоку клиентов из банка https://www.kaggle.com/datasets/shubh0799/churn-modelling.

Характеристики каждого клиента:

RowNumber - Номер строки
CustomerId - Уникальный идентификатор клиента
Surname - Фамилия клиента
CreditScore - Кредитная оценка клиента
Geography - Из какой страны клиент
Gender - Пол клиента
Age - Возраст клиента
Tenure - Сколько лет человек является клиентом банка
Balance - Баланс счета
NumOfProducts - Количество открытых продуктов
HasCrCard - Есть ли у клиента кредитная карта
IsActiveMember - Является ли клиент активным участником
EstimatedSalary - Предположительная зарплата клиента
Exited - Уйдет ли человек в отток

In [32]:
# Load the bank-churn dataset from a CSV file; the path is relative to the
# current working directory. By default the first line of the file is used
# as the header and the rows get a 0-based integer index.
# NOTE: this rebinds `df`, which previously held the small hand-made table.
df = pd.read_csv('./Churn_Modelling.csv')
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [34]:
# `header=1` makes read_csv take the column names from the second line of the
# file (row number 1, 0-based). The real header line is skipped, so the first
# customer record ends up as the column names and is lost as data.
# The result is only displayed, not assigned; `df` is unchanged.
pd.read_csv('./Churn_Modelling.csv', header=1)

Unnamed: 0,1,15634602,Hargrave,619,France,Female,42,2,0,1.1,1.2,1.3,101348.88,1.4
0,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
1,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
2,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
3,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
4,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9995,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9996,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9997,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [35]:
# The file is comma-separated, so with `sep=';'` no delimiter is found and
# every line is read as one single string column.
# The result is only displayed, not assigned; `df` is unchanged.
pd.read_csv('./Churn_Modelling.csv', sep=';')

Unnamed: 0,"RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited"
0,"1,15634602,Hargrave,619,France,Female,42,2,0,1..."
1,"2,15647311,Hill,608,Spain,Female,41,1,83807.86..."
2,"3,15619304,Onio,502,France,Female,42,8,159660...."
3,"4,15701354,Boni,699,France,Female,39,1,0,2,0,0..."
4,"5,15737888,Mitchell,850,Spain,Female,43,2,1255..."
...,...
9995,"9996,15606229,Obijiaku,771,France,Male,39,5,0,..."
9996,"9997,15569892,Johnstone,516,France,Male,35,10,..."
9997,"9998,15584532,Liu,709,France,Female,36,7,0,1,0..."
9998,"9999,15682355,Sabbatini,772,Germany,Male,42,3,..."


In [37]:
# First 7 rows of the table (head() returns 5 rows when called without an
# argument).
df.head(7)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0


In [38]:
# Last 3 rows of the table (tail() returns 5 rows when called without an
# argument).
df.tail(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9997,9998,15584532,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


In [42]:
# One randomly chosen row (sample() defaults to n=1). The row differs on
# every run; pass `random_state=<int>` to make the result reproducible.
df.sample()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
5982,5983,15704378,Calabrese,655,Germany,Male,37,9,121342.24,1,1,1,180241.44,0


In [47]:
# `frac=1` samples 100% of the rows without replacement, i.e. returns the
# whole table in shuffled order; the original index labels are kept.
# Not seeded, so the order differs between runs.
df.sample(frac=1)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
4755,4756,15740072,Padovesi,720,France,Female,37,2,120328.88,2,1,1,138470.21,0
8012,8013,15702715,Kao,747,France,Female,34,10,0.00,2,1,1,50759.80,0
9313,9314,15757912,Bradley,722,Germany,Female,37,0,125977.81,1,0,0,160162.42,0
3762,3763,15643042,Han,590,Germany,Female,40,2,117641.43,2,0,0,92198.05,0
3884,3885,15735788,Chiagoziem,709,France,Male,31,6,0.00,2,1,1,71009.84,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6651,6652,15803941,Seleznev,600,France,Male,46,10,95502.21,1,0,0,19842.18,0
2191,2192,15583548,Harrison,525,Spain,Female,47,6,118560.00,1,1,0,82522.61,1
8300,8301,15802625,Hardy,733,Germany,Male,48,7,85915.52,1,1,1,23860.50,0
8969,8970,15622461,Ndubuagha,562,France,Female,51,7,122822.00,2,0,0,32626.21,0


In [50]:
# `frac=0.15` returns a random 15% of the rows (1500 of the 10000 rows here).
# Not seeded, so the selected rows differ between runs.
df.sample(frac=0.15)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
557,558,15634844,Miller,598,Germany,Male,41,3,91536.93,1,1,0,191468.78,1
8627,8628,15777830,Hutchinson,639,France,Female,42,4,0.00,2,0,0,167682.37,0
9526,9527,15665521,Chiazagomekpele,642,Germany,Male,18,5,111183.53,2,0,1,10063.75,0
9072,9073,15777315,Hill,529,France,Male,43,6,93616.35,2,0,0,98348.66,0
4976,4977,15717056,Pan,828,Germany,Female,25,7,144351.86,1,1,0,116613.26,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5198,5199,15660768,L?,604,France,Male,40,1,84315.02,1,0,0,36209.10,0
3800,3801,15685314,Noble,850,France,Female,28,2,0.00,2,1,1,38773.74,0
1782,1783,15642002,Hayward,554,France,Female,35,6,117707.18,2,0,0,95277.15,1
2453,2454,15619935,Vanmeter,783,Spain,Female,59,9,126224.87,1,1,1,4423.63,0


In [52]:
# Dimensions of the table as a tuple: (number of rows, number of columns).
df.shape

(10000, 14)