Чтобы эффективно работать с pandas, необходимо освоить две главные структуры данных библиотеки: DataFrame и Series. Без понимания того, что они собой представляют, невозможно в дальнейшем проводить качественный анализ.

In [None]:
import pandas as pd

# Series

Структура (объект) Series представляет собой объект, похожий на одномерный массив (например, питоновский список), но отличительной его чертой является наличие ассоциированных меток, так называемых индексов, у каждого элемента. Эта особенность делает его похожим на ассоциативный массив, или словарь, в Python.

In [None]:
my_series = pd.Series([5, 6, 7, 8, 9, 10])

In [None]:
!pip install pandas

In [None]:
!ls

sample_data


In [None]:
my_series

0     5
1     6
2     7
3     8
4     9
5    10
dtype: int64

In [None]:
my_series.index

RangeIndex(start=0, stop=6, step=1)

In [None]:
my_series.values

array([ 5,  6,  7,  8,  9, 10])

In [None]:
my_series[4]

9

In [None]:
# Same six values, this time labelled with the letters 'a'..'f' instead of the default 0..5 positions.
my_series2 = pd.Series(data=[5, 6, 7, 8, 9, 10], index=list('abcdef'))

In [None]:
my_series2['f']

10

In [None]:
my_series2[['a', 'b', 'f']]

a     5
b     6
f    10
dtype: int64

In [None]:
my_series2[['a', 'b', 'f']] = 0
my_series2

a    0
b    0
c    7
d    8
e    9
f    0
dtype: int64

In [None]:
my_series2[my_series2 > 0]

c    7
d    8
e    9
dtype: int64

In [None]:
my_series2[my_series2 > 0] * 2

c    14
d    16
e    18
dtype: int64

# DataFrame

Объект DataFrame лучше всего представлять себе в виде обычной таблицы, и это правильно, ведь DataFrame является табличной структурой данных. В любой таблице всегда присутствуют строки и столбцы. Столбцами в объекте DataFrame выступают объекты Series, а значения в строках таблицы являются их элементами.

In [None]:
# Build the demo table from one record per country; column names are given separately.
df = pd.DataFrame(
    [
        ('Kazakhstan', 17.04, 2724902),
        ('Russia', 143.5, 17125191),
        ('Belarus', 9.5, 207600),
        ('Ukraine', 45.5, 603628),
    ],
    columns=['country', 'population', 'square'],
)

In [None]:
df

Unnamed: 0,country,population,square
0,Kazakhstan,17.04,2724902
1,Russia,143.5,17125191
2,Belarus,9.5,207600
3,Ukraine,45.5,603628


In [None]:
df['country']

0    Kazakhstan
1        Russia
2       Belarus
3       Ukraine
Name: country, dtype: object

In [None]:
df.columns

Index(['country', 'population', 'square'], dtype='object')

.loc — используется для доступа по метке индекса (метка может быть как строкой, так и числом)

.iloc — используется для доступа по целочисленной позиции (начиная с 0)

In [None]:
df.iloc[0]

country       Kazakhstan
population         17.04
square           2724902
Name: 0, dtype: object

In [None]:
df.loc[2:]

Unnamed: 0,country,population,square
2,Belarus,9.5,207600
3,Ukraine,45.5,603628


In [None]:
df.loc[2:, 'country']

2    Belarus
3    Ukraine
Name: country, dtype: object

In [None]:
# Rows with population above 10, restricted to two columns, selected in a single .loc call.
df.loc[df['population'] > 10, ['country', 'square']]

Unnamed: 0,country,square
0,Kazakhstan,2724902
1,Russia,17125191
3,Ukraine,603628


In [None]:
# NOTE(review): population looks like it is given in millions, hence the 1e6 factor — confirm units.
df['density'] = df['population'].div(df['square']).mul(1000000)
df

Unnamed: 0,country,population,square,density
0,Kazakhstan,17.04,2724902,6.253436
1,Russia,143.5,17125191,8.379469
2,Belarus,9.5,207600,45.761079
3,Ukraine,45.5,603628,75.37755


In [None]:
# Remove the derived column again; drop() returns a new frame, so rebind the name.
df = df.drop(columns=['density'])


In [None]:
df

Unnamed: 0,country,population,square
0,Kazakhstan,17.04,2724902
1,Russia,143.5,17125191
2,Belarus,9.5,207600
3,Ukraine,45.5,603628


In [None]:
df = df.rename(columns={'country': 'Country'})
df

Unnamed: 0,Country,population,square
0,Kazakhstan,17.04,2724902
1,Russia,143.5,17125191
2,Belarus,9.5,207600
3,Ukraine,45.5,603628


# Чтение и запись данных

In [None]:
# Write without the row index: otherwise it is stored as an unnamed first
# column and comes back from read_csv as a spurious 'Unnamed: 0' column.
df.to_csv('filename.csv', index=False)
df = pd.read_csv('filename.csv', sep=',')

# Группировка и агрегирование в pandas

In [None]:
titanic_df = pd.read_csv('titanic.csv')

In [None]:
titanic_df.head(13)

Unnamed: 0,PassengerID,Name,PClass,Age,Sex,Survived,SexCode
0,1,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,2,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,3,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,5,"Allison, Master Hudson Trevor",1st,0.92,male,1,0
5,6,"Anderson, Mr Harry",1st,47.0,male,1,0
6,7,"Andrews, Miss Kornelia Theodosia",1st,63.0,female,1,1
7,8,"Andrews, Mr Thomas, jr",1st,39.0,male,0,0
8,9,"Appleton, Mrs Edward Dale (Charlotte Lamson)",1st,58.0,female,1,1
9,10,"Artagaveytia, Mr Ramon",1st,71.0,male,0,0


In [None]:
# Count how many women and men survived and how many did not.
titanic_df.groupby(['Sex', 'Survived'])['PassengerID'].count()

Sex     Survived
female  0           154
        1           308
male    0           709
        1           142
Name: PassengerID, dtype: int64

# Сводные таблицы в pandas

In [None]:
# Count passengers for every Sex / PClass combination; 'Name' serves only as the column being counted.
pvt = titanic_df.pivot_table(index=['Sex'], columns=['PClass'], values='Name', aggfunc='count')

In [None]:
pvt.loc['female', ['1st', '2nd', '3rd']]

PClass
1st    143.0
2nd    107.0
3rd    212.0
Name: female, dtype: float64

In [None]:
pvt

PClass,*,1st,2nd,3rd
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,,143.0,107.0,212.0
male,1.0,179.0,172.0,499.0


# Предобработка

## Пропуски в данных

https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html

In [None]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 7 columns):
PassengerID    1313 non-null int64
Name           1313 non-null object
PClass         1313 non-null object
Age            756 non-null float64
Sex            1313 non-null object
Survived       1313 non-null int64
SexCode        1313 non-null int64
dtypes: float64(1), int64(3), object(3)
memory usage: 71.9+ KB


In [None]:
titanic_df.isnull().sum()

PassengerID      0
Name             0
PClass           0
Age            557
Sex              0
Survived         0
SexCode          0
dtype: int64

In [None]:
# Fill missing values with the per-column mean and verify that no NaN is left.
# numeric_only=True is required: the frame has text columns (Name, PClass, Sex),
# and recent pandas versions raise TypeError when asked to average them.
titanic_df.fillna(titanic_df.mean(numeric_only=True)).isnull().sum()


PassengerID    0
Name           0
PClass         0
Age            0
Sex            0
Survived       0
SexCode        0
dtype: int64

In [None]:
# Column means; restricted to numeric columns because recent pandas versions
# raise TypeError instead of silently skipping text columns.
titanic_df.mean(numeric_only=True)

PassengerID    657.000000
Age             30.397989
Survived         0.342727
SexCode          0.351866
dtype: float64

In [None]:
# Column means; restricted to numeric columns because recent pandas versions
# raise TypeError instead of silently skipping text columns.
titanic_df.mean(numeric_only=True)

In [None]:
titanic_df.dropna()

Unnamed: 0,PassengerID,Name,PClass,Age,Sex,Survived,SexCode
0,1,"Allen, Miss Elisabeth Walton",1st,29.00,female,1,1
1,2,"Allison, Miss Helen Loraine",1st,2.00,female,0,1
2,3,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0,0
3,4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,female,0,1
4,5,"Allison, Master Hudson Trevor",1st,0.92,male,1,0
...,...,...,...,...,...,...,...
1308,1309,"Zakarian, Mr Artun",3rd,27.00,male,0,0
1309,1310,"Zakarian, Mr Maprieder",3rd,26.00,male,0,0
1310,1311,"Zenni, Mr Philip",3rd,22.00,male,0,0
1311,1312,"Lievens, Mr Rene",3rd,24.00,male,0,0


In [None]:
titanic_df.dropna(axis=1)

Unnamed: 0,PassengerID,Name,PClass,Sex,Survived,SexCode
0,1,"Allen, Miss Elisabeth Walton",1st,female,1,1
1,2,"Allison, Miss Helen Loraine",1st,female,0,1
2,3,"Allison, Mr Hudson Joshua Creighton",1st,male,0,0
3,4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,female,0,1
4,5,"Allison, Master Hudson Trevor",1st,male,1,0
...,...,...,...,...,...,...
1308,1309,"Zakarian, Mr Artun",3rd,male,0,0
1309,1310,"Zakarian, Mr Maprieder",3rd,male,0,0
1310,1311,"Zenni, Mr Philip",3rd,male,0,0
1311,1312,"Lievens, Mr Rene",3rd,male,0,0


## One-hot кодирование

In [None]:
# Recreate the small country table used as input for the one-hot encoding example.
df = pd.DataFrame(
    [
        ('Kazakhstan', 17.04, 2724902),
        ('Russia', 143.5, 17125191),
        ('Belarus', 9.5, 207600),
        ('Ukraine', 45.5, 603628),
    ],
    columns=['country', 'population', 'square'],
)
df

Unnamed: 0,country,population,square
0,Kazakhstan,17.04,2724902
1,Russia,143.5,17125191
2,Belarus,9.5,207600
3,Ukraine,45.5,603628


In [None]:
# One-hot encode 'country' into columns prefixed with 'c_'.
# dtype=int keeps the indicator columns as 0/1 integers; without it recent
# pandas versions return booleans (True/False).
pd.get_dummies(df, prefix=['c'], columns=['country'], dtype=int)

Unnamed: 0,population,square,c_Belarus,c_Kazakhstan,c_Russia,c_Ukraine
0,17.04,2724902,0,1,0,0
1,143.5,17125191,0,0,1,0
2,9.5,207600,1,0,0,0
3,45.5,603628,0,0,0,1


In [None]:
df = pd.DataFrame({
    'has_dogs':[True,False,True,True,False,True],
    'country': ['germany', None ,'germany','united kingdom','america','united kingdom']
})
df

Unnamed: 0,has_dogs,country
0,True,germany
1,False,
2,True,germany
3,True,united kingdom
4,False,america
5,True,united kingdom


In [None]:
# One-hot encode the text column; the missing country (None) gets all-zero indicators.
# dtype=int keeps the indicator columns as 0/1 integers; without it recent
# pandas versions return booleans (True/False).
pd.get_dummies(df, dtype=int)

Unnamed: 0,has_dogs,country_america,country_germany,country_united kingdom
0,True,0,1,0
1,False,0,0,0
2,True,0,1,0
3,True,0,0,1
4,False,1,0,0
5,True,0,0,1


## Ordinal Encoding

In [None]:
# Ordinal codes for the passenger-class labels.
mapping = {'1st': 1,
           '2nd': 2,
           '3rd': 3 }

# Labels that are not keys of `mapping` are turned into NaN by Series.map,
# which is why the resulting column has a float dtype.
titanic_df['PClass'] = titanic_df['PClass'].map(mapping)
titanic_df

Unnamed: 0,PassengerID,Name,PClass,Age,Sex,Survived,SexCode
0,1,"Allen, Miss Elisabeth Walton",1.0,29.00,female,1,1
1,2,"Allison, Miss Helen Loraine",1.0,2.00,female,0,1
2,3,"Allison, Mr Hudson Joshua Creighton",1.0,30.00,male,0,0
3,4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1.0,25.00,female,0,1
4,5,"Allison, Master Hudson Trevor",1.0,0.92,male,1,0
...,...,...,...,...,...,...,...
1308,1309,"Zakarian, Mr Artun",3.0,27.00,male,0,0
1309,1310,"Zakarian, Mr Maprieder",3.0,26.00,male,0,0
1310,1311,"Zenni, Mr Philip",3.0,22.00,male,0,0
1311,1312,"Lievens, Mr Rene",3.0,24.00,male,0,0


In [None]:
titanic_df['PClass']

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
1308    3.0
1309    3.0
1310    3.0
1311    3.0
1312    3.0
Name: PClass, Length: 1313, dtype: float64

In [None]:
# After the mapping, PClass holds NaN for labels that had no code (the data
# contains a '*' class), and NaN cannot be cast to NumPy int64 — the cast raises.
# The nullable 'Int64' dtype stores integers and keeps missing entries as <NA>.
titanic_df['PClass'] = titanic_df['PClass'].astype('Int64')

In [None]:
titanic_df