In [7]:
import pandas as pd
import numpy as np

In [3]:
series = pd.Series([5, 6, 7, 8, 9])
series

0    5
1    6
2    7
3    8
4    9
dtype: int64

In [4]:
series.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
series.values

array([5, 6, 7, 8, 9], dtype=int64)

In [8]:
series2 = pd.Series([5, 6, 7, 8, 9], index=['a','b', 'c', 'd', 'e'],
                    dtype=np.float32)
series2

a    5.0
b    6.0
c    7.0
d    8.0
e    9.0
dtype: float32

In [10]:
series2[[4]]

(e    9.0
 dtype: float32,
 9.0)

In [11]:
series2[4]

9.0

In [12]:
series2['a']

5.0

In [13]:
series2[['a']]

a    5.0
dtype: float32

In [14]:
series2[(series2 > 6)] # маска, круглые скобки обязательны для множественых условий

c    7.0
d    8.0
e    9.0
dtype: float32

In [15]:
series3 = pd.Series({'a': 5, 'b': 6, 'c': 7, 'd': 8, 'e': 9})
series3

a    5
b    6
c    7
d    8
e    9
dtype: int64

In [17]:
df = pd.DataFrame({
    'country': ['Kazakhstan', 'Russia', 'Belarus', 'Ukraine'],
    'population': [17.04, 143.5, 9.5, 45.5],
    'square': [2724902, 17125191, 207600, 603628]
})
df

Unnamed: 0,country,population,square
0,Kazakhstan,17.04,2724902
1,Russia,143.5,17125191
2,Belarus,9.5,207600
3,Ukraine,45.5,603628


In [18]:
df.population

0     17.04
1    143.50
2      9.50
3     45.50
Name: population, dtype: float64

In [25]:
# импорт файлов
# pd.read_csv('filename.csv')
# pd.read_excel('filename.xls')
# pd.read_sql('filename.sql')
# pd.read_table('filename')
# pd.read_json('json_string')
# pd.read_html('url')
# pd.read_clipboard # из буфера обмена
# pd.DataFrame(dict)

# экспорт данных
# df.to_csv('filename.csv')
# df.to_excel('filename.xls')
# df.to_sql('table_name', connection_object)
# df.to_json('filename.json')
# df.to_html('filename.html')
# df.to_clipboard() # в буфер обмена

In [21]:
meteorite_landings = pd.read_csv('meteorite-landings.csv', sep=',')
meteorite_landings.tail() # показать последние n строк, 5 - по умолчанию

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation
45711,Zillah 002,31356,Valid,Eucrite,172.0,Found,1990.0,29.037,17.0185,"(29.037000, 17.018500)"
45712,Zinder,30409,Valid,"Pallasite, ungrouped",46.0,Found,1999.0,13.78333,8.96667,"(13.783330, 8.966670)"
45713,Zlin,30410,Valid,H4,3.3,Found,1939.0,49.25,17.66667,"(49.250000, 17.666670)"
45714,Zubkovsky,31357,Valid,L6,2167.0,Found,2003.0,49.78917,41.5046,"(49.789170, 41.504600)"
45715,Zulu Queen,30414,Valid,L3.7,200.0,Found,1976.0,33.98333,-115.68333,"(33.983330, -115.683330)"


In [22]:
meteorite_landings.head() # показать первые n строк, 5 - по умолчанию

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.775,6.08333,"(50.775000, 6.083330)"
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.183330, 10.233330)"
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.0,"(54.216670, -113.000000)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.9,"(16.883330, -99.900000)"
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95,"(-33.166670, -64.950000)"


In [24]:
chunk_size = 10000

for gm_chunk in pd.read_csv('meteorite-landings.csv', sep=',', chunksize=chunk_size):
    print(gm_chunk.shape) # разибивка данных на части (чанки)

(10000, 10)
(10000, 10)
(10000, 10)
(10000, 10)
(5716, 10)


In [27]:
df.index = ['KZ', 'RU', 'BY', 'UA']
df

Unnamed: 0,country,population,square
KZ,Kazakhstan,17.04,2724902
RU,Russia,143.5,17125191
BY,Belarus,9.5,207600
UA,Ukraine,45.5,603628


In [30]:
df.loc[['KZ', 'RU'], 'population'] # по строковой метке

KZ     17.04
RU    143.50
Name: population, dtype: float64

In [31]:
df.iloc[[0,1], [1]] # по числовому индексу

Unnamed: 0,population
KZ,17.04
RU,143.5


In [33]:
df[df.square > 300000][['country', 'square']] # маска

Unnamed: 0,country,square
KZ,Kazakhstan,2724902
RU,Russia,17125191
UA,Ukraine,603628


In [34]:
df[df.country == 'Russia'][['country', 'square']]

Unnamed: 0,country,square
RU,Russia,17125191


In [35]:
df.population > 10

KZ     True
RU     True
BY    False
UA     True
Name: population, dtype: bool

In [36]:
filters = (df.country == 'Russia') # объявление фильтра
df[filters]

Unnamed: 0,country,population,square
RU,Russia,143.5,17125191


In [37]:
df['density'] = df['population'] / df['square'] * 1000000 # быстрая вставка колонки
df

Unnamed: 0,country,population,square,density
KZ,Kazakhstan,17.04,2724902,6.253436
RU,Russia,143.5,17125191,8.379469
BY,Belarus,9.5,207600,45.761079
UA,Ukraine,45.5,603628,75.37755


In [40]:
df.drop(['density'], axis=1) # исходный df не перезаписывается, если inplace = False

Unnamed: 0,country,population,square
KZ,Kazakhstan,17.04,2724902
RU,Russia,143.5,17125191
BY,Belarus,9.5,207600
UA,Ukraine,45.5,603628


In [42]:
df.rename(columns={'population': 'country_population'}, inplace=True)
df

Unnamed: 0,country,country_population,square,density
KZ,Kazakhstan,17.04,2724902,6.253436
RU,Russia,143.5,17125191,8.379469
BY,Belarus,9.5,207600,45.761079
UA,Ukraine,45.5,603628,75.37755


In [45]:
df.nlargest(3, 'square') # nsmallest

Unnamed: 0,country,country_population,square,density
RU,Russia,143.5,17125191,8.379469
KZ,Kazakhstan,17.04,2724902,6.253436
UA,Ukraine,45.5,603628,75.37755


In [47]:
# изменение макс. кол-ва отображаемых столбцов, по умолчанию 20
pd.set_option('display.max_columns', 100)
# изменение макс. кол-ва отображаемых строк, по умолчанию 60
pd.set_option('display.max_rows', 100)
# изменение кол-ва отображаемых знаков после запятой
pd.set_option('display.precision', 3)

In [48]:
meteorite_landings.shape # размер массива данных, строки х столбцы

(45716, 10)

In [51]:
meteorite_landings.columns

Index(['name', 'id', 'nametype', 'recclass', 'mass', 'fall', 'year', 'reclat',
       'reclong', 'GeoLocation'],
      dtype='object')

In [52]:
meteorite_landings.describe() # числовые статистики

Unnamed: 0,id,mass,year,reclat,reclong
count,45716.0,45580.0,45428.0,38401.0,38401.0
mean,26889.735,13280.0,1991.772,-39.123,61.074
std,16860.683,575000.0,27.181,46.379,80.647
min,1.0,0.0,301.0,-87.367,-165.433
25%,12688.75,7.2,1987.0,-76.714,0.0
50%,24261.5,32.6,1998.0,-71.5,35.667
75%,40656.75,202.6,2003.0,0.0,157.167
max,57458.0,60000000.0,2501.0,81.167,354.473


In [53]:
meteorite_landings.describe(include='object')

Unnamed: 0,name,nametype,recclass,fall,GeoLocation
count,45716,45716,45716,45716,38401
unique,45716,2,466,2,17100
top,Northwest Africa 2858,Valid,L6,Found,"(0.000000, 0.000000)"
freq,1,45641,8285,44609,6214


In [57]:
meteorite_landings.recclass.value_counts()[:5]

L6    8285
H5    7142
L5    4796
H6    4528
H4    4211
Name: recclass, dtype: int64

In [59]:
meteorite_landings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         45716 non-null  object 
 1   id           45716 non-null  int64  
 2   nametype     45716 non-null  object 
 3   recclass     45716 non-null  object 
 4   mass         45585 non-null  float64
 5   fall         45716 non-null  object 
 6   year         45428 non-null  float64
 7   reclat       38401 non-null  float64
 8   reclong      38401 non-null  float64
 9   GeoLocation  38401 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 3.5+ MB


In [60]:
for dtype in ['float', 'int', 'object']:
    selected_dtype = meteorite_landings.select_dtypes(include=[dtype])
    mean_usage_bytes = selected_dtype.memory_usage(deep = True).mean()
    mean_usage_megabytes = mean_usage_bytes / 1024 ** 2
    print('Average memory usage for {} columns: {:03.2f} MB'.format(dtype, mean_usage_megabytes))

Average memory usage for float columns: 0.28 MB
Average memory usage for int columns: 0.00 MB
Average memory usage for object columns: 2.41 MB


In [61]:
meteorite_landings['mass'] = meteorite_landings['mass'].astype(np.float32)
meteorite_landings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         45716 non-null  object 
 1   id           45716 non-null  int64  
 2   nametype     45716 non-null  object 
 3   recclass     45716 non-null  object 
 4   mass         45585 non-null  float32
 5   fall         45716 non-null  object 
 6   year         45428 non-null  float64
 7   reclat       38401 non-null  float64
 8   reclong      38401 non-null  float64
 9   GeoLocation  38401 non-null  object 
dtypes: float32(1), float64(3), int64(1), object(5)
memory usage: 3.3+ MB


In [63]:
#вывод кол-ва уникальных записей для каждой колонки
[(column, meteorite_landings[column].nunique()) for column in meteorite_landings.columns]

[('name', 45716),
 ('id', 45716),
 ('nametype', 2),
 ('recclass', 466),
 ('mass', 12576),
 ('fall', 2),
 ('year', 268),
 ('reclat', 12738),
 ('reclong', 14640),
 ('GeoLocation', 17100)]

In [64]:
meteorite_landings_category = meteorite_landings.copy()
# меняем тип колонки recclass на category
meteorite_landings_category['recclass'] = meteorite_landings['recclass'].astype('category')
meteorite_landings_category.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   name         45716 non-null  object  
 1   id           45716 non-null  int64   
 2   nametype     45716 non-null  object  
 3   recclass     45716 non-null  category
 4   mass         45585 non-null  float32 
 5   fall         45716 non-null  object  
 6   year         45428 non-null  float64 
 7   reclat       38401 non-null  float64 
 8   reclong      38401 non-null  float64 
 9   GeoLocation  38401 non-null  object  
dtypes: category(1), float32(1), float64(3), int64(1), object(4)
memory usage: 3.1+ MB


In [66]:
# сортировка, по убыванию для recclass + по возрастанию для mass
meteorite_landings.sort_values(by=['recclass', 'mass'],ascending=[True, False]).head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation
27673,Northwest Africa 2656,32485,Valid,Acapulcoite,7500.0,Found,2003.0,,,
30353,Northwest Africa 725,17807,Valid,Acapulcoite,3824.0,Found,,30.6,-5.05,"(30.600000, -5.050000)"
27845,Northwest Africa 2871,33345,Valid,Acapulcoite,3467.0,Found,2005.0,,,
7131,Dhofar 125,6910,Valid,Acapulcoite,2697.0,Found,2000.0,18.987,54.6,"(18.986670, 54.600500)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.883,-99.9,"(16.883330, -99.900000)"


In [67]:
meteorite_landings[['mass', 'id']].apply(np.max)

mass    6.000e+07
id      5.746e+04
dtype: float64

In [69]:
meteorite_landings.groupby(by='fall')['mass'].agg([np.mean, np.max, np.min, np.std])

Unnamed: 0_level_0,mean,amax,amin,std
fall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fell,47070.715,23000000.0,0.1,717067.126
Found,12461.923,60000000.0,0.0,571105.752


In [70]:
# sql - like операции для таблиц
# https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html