In [9]:
import pandas as pd
import numpy as np
import pandas.util.testing as testing

import os


In [24]:
# С помощь данной опции на консоль будут выводится все несколько команд в одной ячейке
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Генерация тестовых DataFrame

## Ручной сбор DataFrame (собственный метод)

In [27]:
def generate_my_sample_data():
    rows = 10
    int_col = np.random.randint(0, rows, size=rows)
    num_col = np.random.random_sample(rows)
    chr_col = [''.join([chr(i) for i in np.random.randint(ord('а'), ord('я'), 5)]) for _ in range(rows)]
    word_col = ['кошка', 'собака', 'ЯБЛОКО', 'АПЕЛЬСИН', 'Дерево', 'Машина', 'Car', 'House', 'Дом', 'кошка']
    lists = [int_col, num_col, chr_col, word_col]
    mix_col = [lists[np.random.randint(len(lists))][np.random.randint(rows)] for _ in range(rows)]
    
    return pd.DataFrame(
        {'int_col': int_col,
         'num_col': num_col,
         'chr_col': chr_col, 
         'word_col': word_col,
         'mix_col': mix_col,
        })

df = generate_my_sample_data()
df

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,9,0.006785,гичеб,кошка,ЯБЛОКО
1,2,0.928071,ьцотъ,собака,9
2,8,0.52864,цащбл,ЯБЛОКО,0.704209
3,5,0.339944,юцоук,АПЕЛЬСИН,Дерево
4,6,0.059825,буежй,Дерево,9
5,4,0.748937,ыпмоп,Машина,ЯБЛОКО
6,9,0.665886,рзежо,Car,0.339944
7,8,0.704209,удбоа,House,0.956846
8,0,0.956846,ныфгй,Дом,рзежо
9,5,0.801357,уййла,кошка,0.52864


## Авто генерация 

In [13]:
# check Trick 91 for an example
def generate_sample_data(): # creates a fake df for testing
    number_or_rows = 20
    num_cols = 7
    cols = list("ABCDEFG")
    df = pd.DataFrame(np.random.randint(1, 20, size = (number_or_rows, num_cols)), columns=cols)
    df.index = testing.makeIntIndex(number_or_rows)
    return df

# check Trick 91 for an example
def generate_sample_data_datetime(): # creates a fake df for testing
    number_or_rows = 365*24
    num_cols = 2
    cols = ["sales", "customers"]
    df = pd.DataFrame(np.random.randint(1, 20, size = (number_or_rows, num_cols)), columns=cols)
    df.index = testing.makeDateIndex(number_or_rows, freq="H")
    return df

In [20]:
def print_files(init_dir = './'):
    for dirname, _, filenames in os.walk(init_dir):
        for filename in filenames:
            print(os.path.join(dirname, filename))

# Загрузка / выгрузка данных

## Показать версию и зависимости Pandas

In [207]:
pd.__version__
pd.show_versions()

'1.1.0'

  "Distutils was imported before Setuptools. This usage is discouraged "



INSTALLED VERSIONS
------------------
commit           : d9fff2792bf16178d4e450fe7384244e50635733
python           : 3.7.7.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
Version          : 10.0.17763
machine          : AMD64
processor        : Intel64 Family 6 Model 37 Stepping 5, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : None.None

pandas           : 1.1.0
numpy            : 1.18.5
pytz             : 2020.1
dateutil         : 2.8.1
pip              : 20.2
setuptools       : 49.2.0.post20200712
Cython           : 0.29.21
pytest           : 6.0.0
hypothesis       : None
sphinx           : 3.1.2
blosc            : None
feather          : None
xlsxwriter       : 1.3.0
lxml.etree       : 4.5.2
html5lib         : 1.1
pymysql          : None
psycopg2         : None
jinja2           : 2.11.2
IPython          : 7.16.1
pandas_datareader: None
bs4              : 4.9.1
bottleneck       : 1.3.2
fsspec 

## Избавление от столбцов вида Unnamed: 0 при заагрузке данных из csv

In [23]:
d = {\
"zip_code": [12345, 56789, 101112, 131415],
"factory": [100, 400, 500, 600],
"warehouse": [200, 300, 400, 500],
"retail": [1, 2, 3, 4]
}

df = pd.DataFrame(d)
df

# save to csv
df.to_csv("trick99data.csv")

df = pd.read_csv("trick99data.csv")
df
# To avoid Unnamed: 0

df = pd.read_csv("trick99data.csv", index_col=0)
# or when saving df = pd.read_csv("trick99data.csv", index = False)
df

Unnamed: 0,zip_code,factory,warehouse,retail
0,12345,100,200,1
1,56789,400,300,2
2,101112,500,400,3
3,131415,600,500,4


Unnamed: 0.1,Unnamed: 0,zip_code,factory,warehouse,retail
0,0,12345,100,200,1
1,1,56789,400,300,2
2,2,101112,500,400,3
3,3,131415,600,500,4


Unnamed: 0,zip_code,factory,warehouse,retail
0,12345,100,200,1
1,56789,400,300,2
2,101112,500,400,3
3,131415,600,500,4


## Сохранение памяти с помощью изменения типа данных полей

In [50]:
df = generate_my_sample_data()
df

# let's see how much our df occupies in memory
df.memory_usage(deep = True)

# convert to smaller datatypes
# "Sparse[int]" - для большинства 0
# "Sparse[str]" - для большинства значений Na

df = df.astype({"int_col": "int8",  
#                 "num_col":"category", 
                "chr_col": "Sparse[str]",
                "word_col": "Sparse[str]", 
                "mix_col": "Sparse[str]"}) 

df.memory_usage(deep = True)

df = generate_sample_data()
df.head()

df.memory_usage(deep = True)

df = df.astype({key:"int8" if i%2 ==0 else "Sparse[int]" for i,key in enumerate(df.columns)})

df.memory_usage(deep = True)

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,5,0.322222,ьшсьр,кошка,0.0741137
1,7,0.665898,тощез,собака,0.592305
2,7,0.93746,жщмфю,ЯБЛОКО,Дерево
3,9,0.074114,иошшс,АПЕЛЬСИН,АПЕЛЬСИН
4,1,0.934209,унщыр,Дерево,жщмфю
5,2,0.852008,ьзнкк,Машина,0.934209
6,6,0.802299,дплмч,Car,йъйир
7,9,0.976159,йъйир,House,0.749462
8,1,0.749462,хакеф,Дом,кошка
9,4,0.592305,кмьщп,кошка,жщмфю


Index       128
int_col      40
num_col      80
chr_col     920
word_col    868
mix_col     720
dtype: int64

Index       128
int_col      10
num_col      80
chr_col     960
word_col    908
mix_col     760
dtype: int64

Unnamed: 0,A,B,C,D,E,F,G
0,19,4,16,11,15,18,6
1,2,1,10,2,14,9,1
2,17,7,10,8,7,17,13
3,2,14,2,11,13,13,14
4,6,19,13,9,5,12,13


Index    160
A         80
B         80
C         80
D         80
E         80
F         80
G         80
dtype: int64

Index    160
A         20
B        160
C         20
D        160
E         20
F        160
G         20
dtype: int64

## Исправьте типы данных при импорте df

In [None]:
# df = pd.read_csv("/kaggle/input/drinks-by-country/drinksbycountry.csv")

# # Step 1: Let's the datetype of the columns
# col_types = df.dtypes.to_frame()
# col_types.rename({0:"type"}, inplace = True, axis = 1)
# col_types
# col_types.to_csv("trick83data.csv")

# # Step 2: Let's import the previous data and convert it to a dictionary
# col_dict = pd.read_csv("trick83data.csv", index_col = 0)["type"].to_dict()

# # Step 3: Edit the dictionary with the correct data types
# print("Original dictionary")
# col_dict
# col_dict["country"] = "category"
# col_dict["continent"] = "category"
# print("Modified dictionary")
# col_dict

# # Step 4: Use the dictionary to import the data
# df = pd.read_csv("/kaggle/input/drinks-by-country/drinksbycountry.csv", dtype=col_dict)
# df.dtypes

# # Note: please note that you can use the dict from step1 and paste in like this
# df = pd.read_csv("/kaggle/input/drinks-by-country/drinksbycountry.csv", \
# dtype=
# {'country': 'category',
#  'beer_servings': 'int64',
#  'spirit_servings': 'int64',
#  'wine_servings': 'int64',
#  'total_litres_of_pure_alcohol': 'float64',
#  'continent': 'category'})
# # However, if you have many colums, this can be confusing
# df.dtypes

## Отслеживание источника загрузки данных при массовой загрузки данных в DF

In [121]:
fn_template = 'multi_import_test_{}.csv'

for i in range(3):
    df = generate_sample_data()
    df.to_csv(fn_template.format(i))

df = pd.concat(
    (pd.read_csv(fn_template.format(i), index_col=0).assign(filename=fn_template.format(i)) for i in range(3)),
    ignore_index=True,    
)

df.sample(10)

Unnamed: 0,A,B,C,D,E,F,G,filename
56,19,11,2,16,6,13,19,multi_import_test_2.csv
46,6,15,8,1,16,13,12,multi_import_test_2.csv
0,3,9,16,14,7,8,9,multi_import_test_0.csv
50,6,17,15,3,13,13,6,multi_import_test_2.csv
25,4,6,10,7,17,13,19,multi_import_test_1.csv
7,1,13,15,4,6,9,14,multi_import_test_0.csv
8,2,4,11,19,8,2,16,multi_import_test_0.csv
32,16,5,15,4,8,17,17,multi_import_test_1.csv
45,10,19,8,13,12,11,12,multi_import_test_2.csv
11,4,6,4,13,2,16,11,multi_import_test_0.csv


'multi_import_test_3.csv'

## Вебскрапинг используя read_html() и сопоставление параметров поиска

In [195]:
# Run this on you local machine
url = "https://en.wikipedia.org/wiki/Twitter"
tables = pd.read_html(url)
len(tables)

matching_tables = pd.read_html(url, match = "Followers")
matching_tables[0]

15

Unnamed: 0,Rank,Change (monthly),Account name,Owner,Followers (millions),Activity,Country
0,1,,@BarackObama,Barack Obama,120,Former U.S. president,United States
1,2,,@justinbieber,Justin Bieber,112,Musician,Canada
2,3,,@katyperry,Katy Perry,108,Musician,United States
3,4,,@rihanna,Rihanna,97,Musician and businesswoman,Barbados
4,5,,@taylorswift13,Taylor Swift,87,Musician,United States
5,6,,@Cristiano,Cristiano Ronaldo,86,Footballer,Portugal
6,7,,@realDonaldTrump,Donald Trump,82,Current U.S. president,United States
7,8,,@ladygaga,Lady Gaga,82,Musician and actress,United States
8,9,,@TheEllenShow,Ellen DeGeneres,80,Comedian and television hostess,United States
9,10,,@ArianaGrande,Ariana Grande,75,Musician and actress,United States


# Конвертация и иследование DataFrame

## Просмотр тип значений для всех столбцов и выборка только определеных типов данных

In [39]:
df = generate_my_sample_data()

print('\n>>\tdtypes')
df.dtypes
pass

print('\n>>\tdtypes.value_counts()')
df.dtypes.value_counts()

print('\n>>\tselect_dtypes(include=[\'number\'])')
# Notes
# -----
# * To select all *numeric* types, use ``np.number`` or ``'number'``
# * To select strings you must use the ``object`` dtype, but note that
#   this will return *all* object dtype columns
# * See the `numpy dtype hierarchy
#   <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
# * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
#   ``'datetime64'``
# * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
#   ``'timedelta64'``
# * To select Pandas categorical dtypes, use ``'category'``
# * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
#   0.20.0) or ``'datetime64[ns, tz]'``
df.select_dtypes(include=['number'])


>>	dtypes


int_col       int32
num_col     float64
chr_col      object
word_col     object
mix_col      object
dtype: object


>>	dtypes.value_counts()


object     3
float64    1
int32      1
dtype: int64


>>	select_dtypes(include=['number'])


Unnamed: 0,int_col,num_col
0,2,0.1037
1,2,0.395167
2,1,0.326898
3,6,0.196408
4,2,0.060777
5,4,0.753293
6,0,0.303177
7,7,0.909531
8,6,0.218997
9,4,0.503039


## переводит Series в численный тип данных. 

### Будут ошибки, если в наборе данных строки содержат не числовые значения

In [126]:
# все значения цифровые
pd.to_numeric(df['int_col'])


0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
Name: int_col, dtype: int64

In [127]:
# смешанные поля цифры и строки - ошибка
pd.to_numeric(df['mix_col'])

ValueError: Unable to parse string "иккзз" at position 0

### Переводит Series в числовой тип, при этом заменяет не числовые значения на NaN

In [139]:
# pd.to_numeric(.., errors - аргумент для управления поведением при встрече не числовых значений)
#     'raise' - по умолчанию вызывает ошибку
#     'coerce' - заменяет не числовые на NaN
#     'ignore' - оставляет исходное значение
pd.DataFrame(
    {
        'original': df['mix_col'],
        'errors=coerce': pd.to_numeric(df['mix_col'], errors='coerce'), 
        'errors=ignore': pd.to_numeric(df['mix_col'], errors='ignore')
    }
)

Unnamed: 0,original,errors=coerce,errors=ignore
0,иккзз,,иккзз
1,7,7.0,7
2,уздул,,уздул
3,ъфииф,,ъфииф
4,8,8.0,8
5,рттчъ,,рттчъ
6,0.400321,0.400321,0.400321
7,9,9.0,9
8,0.436953,0.436953,0.436953
9,иккзз,,иккзз


## Смена типа данных у колонки DataFrame

In [159]:
pd.DataFrame(
    {
        'original': df['mix_col'],
        'errors=ignore': df.mix_col.astype(np.int64, errors='ignore'),
    }
)


Unnamed: 0,original,errors=ignore
0,0,0
1,0,0
2,5,5
3,0,0
4,9,9
5,0.647119,0.647119
6,лфыче,лфыче
7,0.851594,0.851594
8,0.622595,0.622595
9,0.886704,0.886704


## Unpivote DataFrame. Превращает данные из широкого в длинный формат, при желании оставляя установленные идентификаторы.

In [26]:
d = {\
"zip_code": [12345, 56789, 101112, 131415],
"factory": [100, 400, 500, 600],
"warehouse": [200, 300, 400, 500],
"retail": [1, 2, 3, 4]
}

df = pd.DataFrame(d)
df

# we have to reassing

# location_type is generated automatically from the columns left after specifying id_vars (you can pass a list also)
df = df.melt(id_vars = ["zip_code", 'factory'], var_name = "location_type", value_name = "distance")
df

Unnamed: 0,zip_code,factory,warehouse,retail
0,12345,100,200,1
1,56789,400,300,2
2,101112,500,400,3
3,131415,600,500,4


Unnamed: 0,zip_code,factory,location_type,distance
0,12345,100,warehouse,200
1,56789,400,warehouse,300
2,101112,500,warehouse,400
3,131415,600,warehouse,500
4,12345,100,retail,1
5,56789,400,retail,2
6,101112,500,retail,3
7,131415,600,retail,4


## Конвертирование значений года и дня года из разных колонок в колонку с типом datetime

In [41]:
# Convert
d = {\
"year": [2019, 2019, 2020],
"day_of_year": [350, 365, 1]
}

df = pd.DataFrame(d)
df

# Step 1: create a combined column
df["combined"] = df["year"]*1000 + df["day_of_year"]
df

# Step 2: convert to datetime
df["date"] = pd.to_datetime(df["combined"], format = "%Y%j")
df

df.dtypes

Unnamed: 0,year,day_of_year
0,2019,350
1,2019,365
2,2020,1


Unnamed: 0,year,day_of_year,combined
0,2019,350,2019350
1,2019,365,2019365
2,2020,1,2020001


Unnamed: 0,year,day_of_year,combined,date
0,2019,350,2019350,2019-12-16
1,2019,365,2019365,2019-12-31
2,2020,1,2020001,2020-01-01


year                    int64
day_of_year             int64
combined                int64
date           datetime64[ns]
dtype: object

## Проверка на схожесть двух Series

In [216]:
d = {"A":[1, 2, 3, 4,], "B":[1.0, 2.0, 3.0, 4.0], "C":[1.00000, 2.00000, 3.00000, 4.000003], "D":[1.0, 2.0, 3.0, 4.0], "E":[4.0, 2.0, 3.0, 1.0]}
df = pd.DataFrame(d)
df

df["A"].equals(df["B"]) # they requiere identical datatypes
df["B"].equals(df["C"])
df["B"].equals(df["D"])
df["B"].equals(df["E"]) # and the same order

print(testing.assert_series_equal(df["A"], df["B"], check_names=False, check_dtype=False)) # assertion passes

Unnamed: 0,A,B,C,D,E
0,1,1.0,1.0,1.0,4.0
1,2,2.0,2.0,2.0,2.0
2,3,3.0,3.0,3.0,3.0
3,4,4.0,4.000003,4.0,1.0


False

False

True

False

None


# Exploring anf Finding Data

## Подсчет пропущенных значений

In [46]:
d = {\
"col1": [2019, 2019, 2020],
"col2": [350, 365, 1],
"col3": [np.nan, 365, None]
}

df = pd.DataFrame(d)
df

print('\n>>\tSolution 1')
df.isnull().sum().sum()

print('\n>>\tSolution 2')
df.isna().sum()

print('\n>>\tSolution 3')
df.isna().any()

print('\n>>\tSolution 4:')
df.isna().any(axis = None)

Unnamed: 0,col1,col2,col3
0,2019,350,
1,2019,365,365.0
2,2020,1,



>>	Solution 1


2


>>	Solution 2


col1    0
col2    0
col3    2
dtype: int64


>>	Solution 3


col1    False
col2    False
col3     True
dtype: bool


>>	Solution 4:


True

## Получение данных обо всех дублей в DataFrame по специфичным полям


In [163]:
df = generate_my_sample_data()
df

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,7,0.581952,чиьфс,кошка,чиьфс
1,5,0.401379,ьюжыз,собака,кошка
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301
3,5,0.515563,хцпхг,АПЕЛЬСИН,5
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО
5,2,0.577025,жеикф,Машина,7
6,1,0.224206,шмимо,Car,АПЕЛЬСИН
7,2,0.221537,елчжз,House,4
8,4,0.652998,пнфкц,Дом,АПЕЛЬСИН
9,6,0.971932,ытлфь,кошка,ьюжыз


In [182]:
# keep : {'first', 'last', False}, default 'first'
#     Determines which duplicates (if any) to mark.
#     - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
#     - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
#     - False : Mark all duplicates as ``True``.

df[df.duplicated(['int_col',], keep=False)]

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
1,5,0.401379,ьюжыз,собака,кошка
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301
3,5,0.515563,хцпхг,АПЕЛЬСИН,5
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО
5,2,0.577025,жеикф,Машина,7
7,2,0.221537,елчжз,House,4
8,4,0.652998,пнфкц,Дом,АПЕЛЬСИН


In [179]:
df2 = pd.DataFrame(
    {
        'col1': [1, 2, 3, 1, 2, 3],
        'col2': [2, 2, 3, 1, 3, 3]
    }    
    
)
df2[df2.duplicated(['col1', 'col2'], keep=False)]

Unnamed: 0,col1,col2
2,3,3
5,3,3


## Список уникальных значений в колонке DataFrame

In [181]:
df['int_col'].unique()

array([7, 5, 4, 2, 1, 6])

## Для каждого уникального значения вывести частоту кол-ва использования

In [185]:
df['int_col'].value_counts()

5    3
4    2
2    2
7    1
6    1
1    1
Name: int_col, dtype: int64

## Вывод всех записей DataFrame, где в определенном столбце встречается указанное значение

In [193]:
df.loc[df.word_col == 'кошка']

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,7,0.581952,чиьфс,кошка,чиьфс
9,6,0.971932,ытлфь,кошка,ьюжыз


## Вывод всех записей DataFrame, где в определенном столбце встречаются перечень значений

In [381]:
valuelist = ['кошка', 'Дом', 'House']
df[df.word_col.isin(valuelist)]

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,7,0.58,чиьфс,кошка,чиьфс
7,2,0.22,елчжз,House,4
8,4,0.65,пнфкц,Дом,АПЕЛЬСИН
9,6,0.97,ытлфь,кошка,ьюжыз


## Вывод всех записей DataFrame, где в определенном столбце нет значений из переданного списка

In [196]:
valuelist = ['кошка', 'Дом', 'House']
df[~df.word_col.isin(valuelist)]

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
1,5,0.401379,ьюжыз,собака,кошка
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301
3,5,0.515563,хцпхг,АПЕЛЬСИН,5
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО
5,2,0.577025,жеикф,Машина,7
6,1,0.224206,шмимо,Car,АПЕЛЬСИН


## Выборка значений из DataFrame по различным критериям 
(в примере ниже использовать | вместо & что бы использовать ИЛИ)

In [201]:
df[(df['word_col'] == 'кошка') & (df.num_col > 0.6)]

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
9,6,0.971932,ытлфь,кошка,ьюжыз


## Поиск по сложным критериям вхождения списка ключевых слов

In [393]:
# Генерация шаблона регулярного выражения для поиска

def gen_pattern(words):
    if isinstance(words, str):
        words = list(words)

    base = r'^{}'
    expr = '(?=.*{})'
    return base.format(''.join(expr.format(w) for w in words))


In [408]:
# отбираем записи, где в тексте встречаются ['сложного', 'поиска'] и где отсутствует слово 'строка'
words = ['сложного', 'поиска']  # example

tmp_df = pd.DataFrame([
    ['Это строка для сложного поиска'],
    ['Какой-то еще текст и набор букв'],
    ['Нам надо много примеров для сложного поиска'],
    ['Поэтому мы создали еще одну строку для сложного поиска']
], columns = ['text'])
tmp_df[
    (tmp_df['text'].astype('str').str.contains(gen_pattern(words), case=False)) & 
    (tmp_df['text'].astype('str').str.contains(gen_pattern('поэтому'), case=False) == False)
]

Unnamed: 0,text
0,Это строка для сложного поиска
2,Нам надо много примеров для сложного поиска


In [409]:
tmp_df[
    (tmp_df['text'].astype('str').str.contains(gen_pattern(['поис', 'слож']), case=False)) &
    ~(tmp_df['text'].astype('str').str.contains(gen_pattern(['нам']), case=False))    
]

Unnamed: 0,text
0,Это строка для сложного поиска
3,Поэтому мы создали еще одну строку для сложног...


## Цикл по записям DataFrame

In [213]:
loops = 1
for index, row in df.iterrows():
    print(index, row['num_col'])
    print(row[['int_col', 'word_col']])
    print('='*20)
    loops += 1
    if loops > 3:
        break

0 0.5819520280440198
int_col         7
word_col    кошка
Name: 0, dtype: object
1 0.4013787731727625
int_col          5
word_col    собака
Name: 1, dtype: object
2 0.9130104408214264
int_col          4
word_col    ЯБЛОКО
Name: 2, dtype: object


## Намного более быстрый способ перебирать строки DataFrame, если вы можете работать с кортежами

In [225]:
loops = 1
for row in df.itertuples():
    print(row[1:-1])   
    loops += 1
    if loops > 3:
        break
row

(7, 0.5819520280440198, 'чиьфс', 'кошка')
(5, 0.4013787731727625, 'ьюжыз', 'собака')
(4, 0.9130104408214264, 'шцйжы', 'ЯБЛОКО')


Pandas(Index=2, int_col=4, num_col=0.9130104408214264, chr_col='шцйжы', word_col='ЯБЛОКО', mix_col=0.9130104408214264)

## Получить первые n для каждой группы столбцов в отсортированном DataFrame 
(убедитесь, что DataFrame отсортирован первым)

In [234]:
n = 1
df.groupby(['int_col', 'word_col']).head(n)

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,7,0.581952,чиьфс,кошка,чиьфс
1,5,0.401379,ьюжыз,собака,кошка
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301
3,5,0.515563,хцпхг,АПЕЛЬСИН,5
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО
5,2,0.577025,жеикф,Машина,7
6,1,0.224206,шмимо,Car,АПЕЛЬСИН
7,2,0.221537,елчжз,House,4
8,4,0.652998,пнфкц,Дом,АПЕЛЬСИН
9,6,0.971932,ытлфь,кошка,ьюжыз


## Получить из DataFrame записи, где выбранные поля пустые/не пустые 

In [238]:
df[~df['mix_col'].isnull()]

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,7,0.581952,чиьфс,кошка,чиьфс
1,5,0.401379,ьюжыз,собака,кошка
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301
3,5,0.515563,хцпхг,АПЕЛЬСИН,5
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО
5,2,0.577025,жеикф,Машина,7
6,1,0.224206,шмимо,Car,АПЕЛЬСИН
7,2,0.221537,елчжз,House,4
8,4,0.652998,пнфкц,Дом,АПЕЛЬСИН
9,6,0.971932,ытлфь,кошка,ьюжыз


## Срез значений в колонке DataFrame

In [243]:
pd.DataFrame(
    {
        'original': df.word_col,
        'slicce': df.word_col.str[1:-1]
    }
)

Unnamed: 0,original,slicce
0,кошка,ошк
1,собака,обак
2,ЯБЛОКО,БЛОК
3,АПЕЛЬСИН,ПЕЛЬСИ
4,Дерево,ерев
5,Машина,ашин
6,Car,a
7,House,ous
8,Дом,о
9,кошка,ошк


## Быстрый подсчет количества строк в DataFrame


In [244]:
len(df.index)

10

## Получить длину данных в колонке DataFrame

In [246]:
pd.DataFrame(
    {
        'original': df.word_col,
        'len': df.word_col.str.len()
    }
)

Unnamed: 0,original,len
0,кошка,5
1,собака,6
2,ЯБЛОКО,6
3,АПЕЛЬСИН,8
4,Дерево,6
5,Машина,6
6,Car,3
7,House,5
8,Дом,3
9,кошка,5


## Агрегирование данных, где индекс в формате даты по дням и фильтрация выходных

In [56]:
df = generate_sample_data_datetime()
df.shape
df.head()

# Step 1: resample by D. Basically agregate by day and use to_frame() to convert it to frame
daily_sales = df.resample("D")["sales"].sum().to_frame()
daily_sales

# Step 2: filter weekends
weekends_sales = daily_sales[daily_sales.index.dayofweek.isin([5, 6])]
weekends_sales

'''
dayofweek day
0         Monday
1         Tuesday
2         Wednesday
3         Thursday
4         Friday
5         Saturday
6         Sunday
'''

(8760, 2)

Unnamed: 0,sales,customers
2000-01-01 00:00:00,8,6
2000-01-01 01:00:00,10,1
2000-01-01 02:00:00,7,1
2000-01-01 03:00:00,9,6
2000-01-01 04:00:00,3,18


Unnamed: 0,sales
2000-01-01,206
2000-01-02,255
2000-01-03,261
2000-01-04,264
2000-01-05,253
...,...
2000-12-26,196
2000-12-27,229
2000-12-28,288
2000-12-29,261


Unnamed: 0,sales
2000-01-01,206
2000-01-02,255
2000-01-08,250
2000-01-09,209
2000-01-15,245
...,...
2000-12-16,243
2000-12-17,258
2000-12-23,293
2000-12-24,246


'\ndayofweek day\n0         Monday\n1         Tuesday\n2         Wednesday\n3         Thursday\n4         Friday\n5         Saturday\n6         Sunday\n'

## Именнованные агрегации в GroupBy - исключая мультииндекс

In [60]:
df = generate_my_sample_data()
df


# Problem 1
print("The Problem relies on that we don't know the column name")
df.groupby('word_col')['int_col'].agg(['mean', 'max'])

# Problem 2
print("The Problem relies on that we have multiindex")
df.groupby('word_col').agg({"int_col":["mean", "max"]})


# Solution new in pandas 0.25 and higher
print("Now we have solved the previous problems by specifyig the column final name we want.")
print("BUT IT ONLY WORKS WITH A COLUMN. TO THIS KIND OF OPERATIONS ON MULTIPLE COLUMNS CHECK THE NEXT CELL")
df.groupby("word_col")["int_col"].agg(int_col_mean = "mean", int_col_max = "max", int_col_count='count')

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,3,0.44398,ййфчн,кошка,юфйюч
1,6,0.621947,уацдо,собака,ищптр
2,0,0.723471,ищптр,ЯБЛОКО,кошка
3,4,0.860328,юъоак,АПЕЛЬСИН,7
4,5,0.480781,чофсч,Дерево,щуалш
5,3,0.245399,щуалш,Машина,0.245399
6,7,0.504072,жътюф,Car,кошка
7,6,0.034268,чжыьи,House,5
8,1,0.18344,обухл,Дом,Дом
9,4,0.685419,юфйюч,кошка,щуалш


The Problem relies on that we don't know the column name


Unnamed: 0_level_0,mean,max
word_col,Unnamed: 1_level_1,Unnamed: 2_level_1
Car,7.0,7
House,6.0,6
АПЕЛЬСИН,4.0,4
Дерево,5.0,5
Дом,1.0,1
Машина,3.0,3
ЯБЛОКО,0.0,0
кошка,3.5,4
собака,6.0,6


The Problem relies on that we have multiindex


Unnamed: 0_level_0,int_col,int_col
Unnamed: 0_level_1,mean,max
word_col,Unnamed: 1_level_2,Unnamed: 2_level_2
Car,7.0,7
House,6.0,6
АПЕЛЬСИН,4.0,4
Дерево,5.0,5
Дом,1.0,1
Машина,3.0,3
ЯБЛОКО,0.0,0
кошка,3.5,4
собака,6.0,6


Now we have solved the previous problems by specifyig the column final name we want.
BUT IT ONLY WORKS WITH A COLUMN. TO THIS KIND OF OPERATIONS ON MULTIPLE COLUMNS CHECK THE NEXT CELL


Unnamed: 0_level_0,int_col_mean,int_col_max,int_col_count
word_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Car,7.0,7,1
House,6.0,6,1
АПЕЛЬСИН,4.0,4,1
Дерево,5.0,5,1
Дом,1.0,1,1
Машина,3.0,3,1
ЯБЛОКО,0.0,0,1
кошка,3.5,4,2
собака,6.0,6,1


## Именнованные агрегаты в GroupBy (несколько столбцов) исключая мультииндекс)

In [61]:
def my_agg(x):
    names = {
        'int_col_mean': x['int_col'].mean(),
        'int_col_max':  x['int_col'].max(),
        'num_col_mean': x['num_col'].mean(),
        'num_col_max': x['num_col'].max()
    } # define you custom colum names and operations

    return pd.Series(names, index=[ key for key in names.keys()]) # all the columns you create in the previous dictionary will be in this list comprehension

df.groupby('word_col').apply(my_agg)

# reference
# https://stackoverflow.com/questions/44635626/rename-result-columns-from-pandas-aggregation-futurewarning-using-a-dict-with

Unnamed: 0_level_0,int_col_mean,int_col_max,num_col_mean,num_col_max
word_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Car,7.0,7.0,0.504072,0.504072
House,6.0,6.0,0.034268,0.034268
АПЕЛЬСИН,4.0,4.0,0.860328,0.860328
Дерево,5.0,5.0,0.480781,0.480781
Дом,1.0,1.0,0.18344,0.18344
Машина,3.0,3.0,0.245399,0.245399
ЯБЛОКО,0.0,0.0,0.723471,0.723471
кошка,3.5,4.0,0.5647,0.685419
собака,6.0,6.0,0.621947,0.621947


## Выбор данных по позициям строк и названию столбцов(цепочка из iloc & loc)

In [73]:
df = generate_sample_data()
df.iloc[5:10, :].loc[:, "B":"E"]

Unnamed: 0,B,C,D,E
5,10,15,7,7
6,18,8,6,10
7,3,13,4,12
8,19,4,4,18
9,2,12,11,5


## Используйте apply (тип), чтобы увидеть, есть ли у вас смешанные типы данных

In [80]:
d = {"customer":["A", "B", "C", "D", "E"], "sales":[100, "100", 50, 550.20, "375.25"]}
df = pd.DataFrame(d)
# everything seems  but this operation crashes df["sales"].sum(). We have mixed data types
df.dtypes
df["sales"].apply(type) # Wow we can see that we have int, str, floats inn one column
df["sales"].apply(type).value_counts() # See the number of each value

df["sales"] = df["sales"].astype(float) # convert the data to float
df["sales"].sum()
df["sales"].apply(type).value_counts()

customer    object
sales       object
dtype: object

0      <class 'int'>
1      <class 'str'>
2      <class 'int'>
3    <class 'float'>
4      <class 'str'>
Name: sales, dtype: object

<class 'int'>      2
<class 'str'>      2
<class 'float'>    1
Name: sales, dtype: int64

1175.45

<class 'float'>    5
Name: sales, dtype: int64

## Подсчет количества строк в DF подходящих под условие

In [113]:
df = generate_sample_data()
df.head()
df.shape

# absolute values
(df["A"] < 5).sum()
print("In the columns A we have {} of rows that are below 5".format((df["A"] < 5).sum()))

# percentage
(df["A"] < 5).mean()
print("In the columns A the values that are below 5 represent {}%".format((df["A"] < 5).mean()))

Unnamed: 0,A,B,C,D,E,F,G
0,15,10,15,16,1,12,4
1,19,8,10,17,16,6,11
2,12,17,13,4,7,3,5
3,16,18,6,3,3,11,3
4,12,8,16,4,7,13,10


(20, 7)

2

In the columns A we have 2 of rows that are below 5


0.1

In the columns A the values that are below 5 represent 0.1%


## Выбрать данные из DF по топовым категориям

In [158]:
df = pd.read_csv('IMDB-Movie-Data.csv')
df.columns = df.columns.str.lower()
# list(map(str.lower, df.columns))
# df.columns.str.lower()
# df.columns.str.lower().values
# df.head()
top_genre = df['genre'].value_counts()[:3].index
top_genre
df[df['genre'].isin(top_genre)]
df[df['genre'].isin(top_genre)]['genre'].unique()

Index(['Action,Adventure,Sci-Fi', 'Drama', 'Comedy,Drama,Romance'], dtype='object')

Unnamed: 0,rank,title,genre,description,director,actors,year,runtime (minutes),rating,votes,revenue (millions),metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
12,13,Rogue One,"Action,Adventure,Sci-Fi",The Rebel Alliance makes a risky move to steal...,Gareth Edwards,"Felicity Jones, Diego Luna, Alan Tudyk, Donnie...",2016,133,7.9,323118,532.17,65.0
21,22,Manchester by the Sea,Drama,A depressed uncle is asked to take care of his...,Kenneth Lonergan,"Casey Affleck, Michelle Williams, Kyle Chandle...",2016,137,7.9,134213,47.70,96.0
24,25,Independence Day: Resurgence,"Action,Adventure,Sci-Fi",Two decades after the first Independence Day i...,Roland Emmerich,"Liam Hemsworth, Jeff Goldblum, Bill Pullman,Ma...",2016,120,5.3,127553,103.14,32.0
32,33,X-Men: Apocalypse,"Action,Adventure,Sci-Fi",After the re-emergence of the world's first mu...,Bryan Singer,"James McAvoy, Michael Fassbender, Jennifer Law...",2016,144,7.1,275510,155.33,52.0
...,...,...,...,...,...,...,...,...,...,...,...,...
948,949,After Earth,"Action,Adventure,Sci-Fi",A crash landing leaves Kitai Raige and his fat...,M. Night Shyamalan,"Jaden Smith, David Denman, Will Smith,Sophie O...",2013,100,4.9,166512,60.52,33.0
952,953,Sex and the City 2,"Comedy,Drama,Romance","While wrestling with the pressures of life, lo...",Michael Patrick King,"Sarah Jessica Parker, Kim Cattrall, Kristin Da...",2010,146,4.3,62403,95.33,27.0
964,965,Custody,Drama,The lives of three women are unexpectedly chan...,James Lapine,"Viola Davis, Hayden Panettiere, Catalina Sandi...",2016,104,6.9,280,,72.0
978,979,It's Only the End of the World,Drama,"Louis (Gaspard Ulliel), a terminally ill write...",Xavier Dolan,"Nathalie Baye, Vincent Cassel, Marion Cotillar...",2016,97,7.0,10658,,48.0


array(['Action,Adventure,Sci-Fi', 'Drama', 'Comedy,Drama,Romance'],
      dtype=object)

## Подсчет количества слов в конкретном столбце DF

In [171]:
df = pd.read_csv('IMDB-Movie-Data.csv', usecols=['Title'])
# pd.concat([df['Title'], df['Title'].str.count(" ")+1], axis=1)
df['Words'] = df['Title'].str.count(' ') + 1
df

Unnamed: 0,Title,Words
0,Guardians of the Galaxy,4
1,Prometheus,1
2,Split,1
3,Sing,1
4,Suicide Squad,2
...,...,...
995,Secret in Their Eyes,4
996,Hostel: Part II,3
997,Step Up 2: The Streets,5
998,Search Party,2


# Обновление и очистка данных

## Удаление столбца из DataFrame

In [254]:
df = df.assign(len_word_col = df.word_col.str.len())

In [255]:
df

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col,len_word_col
0,7,0.581952,чиьфс,кошка,чиьфс,5
1,5,0.401379,ьюжыз,собака,кошка,6
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301,6
3,5,0.515563,хцпхг,АПЕЛЬСИН,5,8
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО,6
5,2,0.577025,жеикф,Машина,7,6
6,1,0.224206,шмимо,Car,АПЕЛЬСИН,3
7,2,0.221537,елчжз,House,4,5
8,4,0.652998,пнфкц,Дом,АПЕЛЬСИН,3
9,6,0.971932,ытлфь,кошка,ьюжыз,5


In [256]:
del df['len_word_col']

In [257]:
df

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,7,0.581952,чиьфс,кошка,чиьфс
1,5,0.401379,ьюжыз,собака,кошка
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301
3,5,0.515563,хцпхг,АПЕЛЬСИН,5
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО
5,2,0.577025,жеикф,Машина,7
6,1,0.224206,шмимо,Car,АПЕЛЬСИН
7,2,0.221537,елчжз,House,4
8,4,0.652998,пнфкц,Дом,АПЕЛЬСИН
9,6,0.971932,ытлфь,кошка,ьюжыз


## Изменение названия нескольких столюцов DataFrame

In [259]:
df = df.rename(columns= {
    'mix_col': 'MIX_col',
    'int_col': 'int_COL'
})
df

Unnamed: 0,int_COL,num_col,chr_col,word_col,MIX_col
0,7,0.581952,чиьфс,кошка,чиьфс
1,5,0.401379,ьюжыз,собака,кошка
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301
3,5,0.515563,хцпхг,АПЕЛЬСИН,5
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО
5,2,0.577025,жеикф,Машина,7
6,1,0.224206,шмимо,Car,АПЕЛЬСИН
7,2,0.221537,елчжз,House,4
8,4,0.652998,пнфкц,Дом,АПЕЛЬСИН
9,6,0.971932,ытлфь,кошка,ьюжыз


## Нижний регистр всех названий колонок DataFrame

In [260]:
df.columns = map(str.lower, df.columns)
df

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,7,0.581952,чиьфс,кошка,чиьфс
1,5,0.401379,ьюжыз,собака,кошка
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301
3,5,0.515563,хцпхг,АПЕЛЬСИН,5
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО
5,2,0.577025,жеикф,Машина,7
6,1,0.224206,шмимо,Car,АПЕЛЬСИН
7,2,0.221537,елчжз,House,4
8,4,0.652998,пнфкц,Дом,АПЕЛЬСИН
9,6,0.971932,ытлфь,кошка,ьюжыз


In [263]:
# inplace=True применит изменения в датасете
df.rename(columns=lambda x: x.split('_')[0], inplace=False)

Unnamed: 0,int,num,chr,word,mix
0,7,0.581952,чиьфс,кошка,чиьфс
1,5,0.401379,ьюжыз,собака,кошка
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301
3,5,0.515563,хцпхг,АПЕЛЬСИН,5
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО
5,2,0.577025,жеикф,Машина,7
6,1,0.224206,шмимо,Car,АПЕЛЬСИН
7,2,0.221537,елчжз,House,4
8,4,0.652998,пнфкц,Дом,АПЕЛЬСИН
9,6,0.971932,ытлфь,кошка,ьюжыз


## Сортировка DataFrame по нескольким колонкам

In [268]:
df.sort_values(['mix_col', 'num_col'], ascending=[0, 1])

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
9,6,0.971932,ытлфь,кошка,ьюжыз
0,7,0.581952,чиьфс,кошка,чиьфс
1,5,0.401379,ьюжыз,собака,кошка
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО
6,1,0.224206,шмимо,Car,АПЕЛЬСИН
8,4,0.652998,пнфкц,Дом,АПЕЛЬСИН
5,2,0.577025,жеикф,Машина,7
3,5,0.515563,хцпхг,АПЕЛЬСИН,5
7,2,0.221537,елчжз,House,4
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301


## Замена всех NaNs на None (полезно перед загрузкой в БД)

In [282]:
tmp_df = pd.DataFrame(
    [
        [1, '    2  ', np.nan], 
        [np.nan, ' data ', np.nan]
    ])
tmp_df.where((pd.notnull(tmp_df)), None)

Unnamed: 0,0,1,2
0,1.0,2,
1,,data,


## Больше предварительной очистки вставки db ... сделайте проход через DataFrame, убрав пробелы из строк и изменив все пустые значения на None


In [288]:
tmp_df.iloc[0][1]

'    2  '

In [290]:
tmp_df.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None).iloc[0][1]

'2'

## Преобразование одного типа значений в другие

In [62]:
# Do some fast feature eng on the DF
d = {"gender":["male", "female", "male"], "color":["red", "green", "blue"], "age":[25, 30, 15]}
df = pd.DataFrame(d)
df

# Solution
map_dict = {"male":"M", "female":"F"}
df["gender_mapped"] = df["gender"].map(map_dict) # using dictionaries to map values
df["color_factorized"] = df["color"].factorize()[0] # using factorize: returns a tuple of arrays (array([0, 1, 2]), Index(['red', 'green', 'blue'], dtype='object')) that's why we select [0]
df["age_compared_boolean"] = df["age"] < 18 # return a True False boolean value

df

Unnamed: 0,gender,color,age
0,male,red,25
1,female,green,30
2,male,blue,15


Unnamed: 0,gender,color,age,gender_mapped,color_factorized,age_compared_boolean
0,male,red,25,M,0,False
1,female,green,30,F,1,False
2,male,blue,15,M,2,True


## Замена значений в столбце из словаря

In [420]:
lvl = {'АПЕЛЬСИН': 'фрукт', 'ЯБЛОКО': 'фрукт', 'кошка': 'животное'}

pd.DataFrame(
    {
        'original': df.mix_col,
        'change': df.mix_col.map(lvl).fillna('-')
    }
)

Unnamed: 0,original,change
0,чиьфс,-
1,кошка,животное
2,0.91,-
3,5,-
4,ЯБЛОКО,фрукт
5,7,-
6,АПЕЛЬСИН,фрукт
7,4,-
8,АПЕЛЬСИН,фрукт
9,ьюжыз,-


## Замена всех не числовых значений на прочерк '-'

In [314]:
df2 = df.copy()

for col in df2.columns.values:
    df2[col] = df2[col].replace(r"^\w*$", '-', regex=True)

df2

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,7,0.581952,-,-,-
1,5,0.401379,-,-,-
2,4,0.91301,-,-,0.91301
3,5,0.515563,-,-,5
4,5,0.418918,-,-,-
5,2,0.577025,-,-,7
6,1,0.224206,-,-,-
7,2,0.221537,-,-,4
8,4,0.652998,-,-,-
9,6,0.971932,-,-,-


## Присвоение колонке DataFrame значения в зависимости от значений остальных колонок

In [329]:
df2 = df.copy()
my_filter = (df2['int_col'] >=5) & (df2['word_col'] == 'кошка') 
column_to_change = 'mix_col'

df2.loc[my_filter, [column_to_change]] = 'сфинкс'
df2

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,7,0.581952,чиьфс,кошка,сфинкс
1,5,0.401379,ьюжыз,собака,кошка
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301
3,5,0.515563,хцпхг,АПЕЛЬСИН,5
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО
5,2,0.577025,жеикф,Машина,7
6,1,0.224206,шмимо,Car,АПЕЛЬСИН
7,2,0.221537,елчжз,House,4
8,4,0.652998,пнфкц,Дом,АПЕЛЬСИН
9,6,0.971932,ытлфь,кошка,сфинкс


## Заполнение пропущенных значений в нескольких столбцах специфическими значениями

In [336]:
tmp_df = pd.DataFrame(
    [
        [1, '    2  ', np.nan], 
        [np.nan, ' data ', np.nan],
        [' data ', np.nan, np.nan],
        [np.nan, ' data ', '231'],
    ], columns=['col1', 'col2', 'col3'])
tmp_df

Unnamed: 0,col1,col2,col3
0,1,2,
1,,data,
2,data,,
3,,data,231.0


In [338]:
tmp_df.fillna({
    'col1': 'пусто',
    'col3': '999'
})

Unnamed: 0,col1,col2,col3
0,1,2,999
1,пусто,data,999
2,data,,999
3,пусто,data,231


## Выполнение расчетов со столбцами DataFrame, в которых отсутствуют значения

In [348]:
tmp_df['new_col'] = np.where(
    pd.isnull(tmp_df['col3']), 0, tmp_df['col3']
).astype('int32') * 2
tmp_df

Unnamed: 0,col1,col2,col3,new_col
0,1,2,,0
1,,data,,0
2,data,,,0
3,,data,231.0,462


## Разделить значения в столбце DataFrame на два новых столбца

In [351]:
df2 = df.copy()
df2['new_col1'], df2['new_col2'] = zip(
    *df2['num_col'].apply(
        lambda x: str(x).split('.', 1)
    )
)
df2

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col,new_col1,new_col2
0,7,0.581952,чиьфс,кошка,чиьфс,0,5819520280440198
1,5,0.401379,ьюжыз,собака,кошка,0,4013787731727625
2,4,0.91301,шцйжы,ЯБЛОКО,0.91301,0,9130104408214264
3,5,0.515563,хцпхг,АПЕЛЬСИН,5,0,515563406478504
4,5,0.418918,иьчщд,Дерево,ЯБЛОКО,0,4189177063087276
5,2,0.577025,жеикф,Машина,7,0,5770247675512845
6,1,0.224206,шмимо,Car,АПЕЛЬСИН,0,22420620475925346
7,2,0.221537,елчжз,House,4,0,22153708323941823
8,4,0.652998,пнфкц,Дом,АПЕЛЬСИН,0,6529981416907323
9,6,0.971932,ытлфь,кошка,ьюжыз,0,971932217361423


In [368]:
df2 = df.copy()
df2[['new_col1', 'new_col2']] = df2['num_col'].astype('str').str.split('.', 1, True)

df2[['num_col','new_col1', 'new_col2']]

Unnamed: 0,num_col,new_col1,new_col2
0,0.582,0,5819520280440198
1,0.401,0,4013787731727625
2,0.913,0,9130104408214264
3,0.516,0,515563406478504
4,0.419,0,4189177063087276
5,0.577,0,5770247675512845
6,0.224,0,22420620475925346
7,0.222,0,22153708323941823
8,0.653,0,6529981416907323
9,0.972,0,971932217361423


## Объедините небольшие категории в одну категорию под названием «Другие»

### используя частоту упоминания и метод replace

In [52]:
d = {"genre": ["A", "A", "A", "A", "A", "B", "B", "C", "D", "E", "F"]}
df = pd.DataFrame(d)
df

# Step 1: count the frequencies
frequencies = df["genre"].value_counts(normalize = True)
frequencies

# Step 2: establish your threshold and filter the smaller categories
threshold = 0.1
small_categories = frequencies[frequencies < threshold].index
small_categories

# Step 3: replace the values
df["genre_cat"] = df["genre"].replace(small_categories, "Other")
df
df["genre_cat"].value_counts(normalize = True)

Unnamed: 0,genre
0,A
1,A
2,A
3,A
4,A
5,B
6,B
7,C
8,D
9,E


A    0.454545
B    0.181818
E    0.090909
C    0.090909
D    0.090909
F    0.090909
Name: genre, dtype: float64

Index(['E', 'C', 'D', 'F'], dtype='object')

Unnamed: 0,genre,genre_cat
0,A,A
1,A,A
2,A,A
3,A,A
4,A,A
5,B,B
6,B,B
7,C,Other
8,D,Other
9,E,Other


A        0.454545
Other    0.363636
B        0.181818
Name: genre_cat, dtype: float64

### Используя выборку наибольших значений и метод where

In [122]:
d = {"genre": ["A", "A", "A", "A", "A", "B", "B", "C", "D", "E", "F"]}
df = pd.DataFrame(d)
df["genre"].value_counts()

# Step 1: count the frequencies
top_four = df["genre"].value_counts().nlargest(4).index
top_four

# Step 2: update the df
df_updated = df.where(df["genre"].isin(top_four), other = "Other")
df_updated["genre"].value_counts()

A    5
B    2
E    1
C    1
D    1
F    1
Name: genre, dtype: int64

Index(['A', 'B', 'E', 'C'], dtype='object')

A        5
Other    2
B        2
C        1
E        1
Name: genre, dtype: int64

## Вставка нового столбца в определенную позицию DataFrame

In [54]:
d = {"A":[15, 20], "B":[20, 25], "C":[30 ,40], "D":[50, 60]}
df = pd.DataFrame(d)
df

# Using insert
df.insert(3, "C2", df["C"]*2)
df

# Other solution
df["C3"] = df["C"]*3 # create a new columns, it will be at the end
columns = df.columns.to_list() # create a list with all columns
location = 4 # specify the location where you want your new column
columns = columns[:location] + ["C3"] + columns[location:-1] # reaarange the list
df = df[columns] # create te dataframe in with the order of columns you like
df

Unnamed: 0,A,B,C,D
0,15,20,30,50
1,20,25,40,60


Unnamed: 0,A,B,C,C2,D
0,15,20,30,60,50
1,20,25,40,80,60


Unnamed: 0,A,B,C,C2,C3,D
0,15,20,30,60,90,50
1,20,25,40,80,120,60


## Перестановка столбцов в DataFrame

In [55]:
df = generate_sample_data()
df.head()

# Solution 1
df[["A", "C", "D", "F", "E", "G", "B"]].head() # doesn't modify in place

# Solution 2
cols_to_move = ["A", "G", "B"]

new_order = cols_to_move + [c for c in df.columns if c not in cols_to_move] # generate your new order
df[new_order].head()

# Solutin 3: using index
cols = df.columns[[0, 5 , 3, 4, 2, 1, 6]] # df.columns returns a series with index, we use the list to iorder the index as we like --> this way we order the columns
df[cols].head()

Unnamed: 0,A,B,C,D,E,F,G
0,18,18,7,18,15,15,16
1,15,4,16,3,10,15,5
2,12,12,14,17,4,14,11
3,15,10,11,8,8,12,16
4,4,16,6,4,3,13,3


Unnamed: 0,A,C,D,F,E,G,B
0,18,7,18,15,15,16,18
1,15,16,3,15,10,5,4
2,12,14,17,14,4,11,12
3,15,11,8,12,8,16,10
4,4,6,4,13,3,3,16


Unnamed: 0,A,G,B,C,D,E,F
0,18,16,18,7,18,15,15
1,15,5,4,16,3,10,15
2,12,11,12,14,17,4,14
3,15,16,10,11,8,8,12
4,4,3,16,6,4,3,13


Unnamed: 0,A,F,D,E,C,B,G
0,18,15,18,15,7,18,16
1,15,15,3,10,16,4,5
2,12,14,17,4,14,12,11
3,15,12,8,8,11,10,16
4,4,13,4,3,6,16,3


## Очистить Строковый столбец  со смешанными данными с помощью регулярных выражений

In [53]:
d = {"customer": ["A", "B", "C", "D"], "sales":[1100, 950.75, "$400", "$1250.35"]}
df = pd.DataFrame(d)
df

# Step 1: check the data types
df["sales"].apply(type)

# Step 2: use regex
df["sales"] = df["sales"].replace("[$,]", "", regex = True).astype("float")
df
df["sales"].apply(type)

Unnamed: 0,customer,sales
0,A,1100
1,B,950.75
2,C,$400
3,D,$1250.35


0      <class 'int'>
1    <class 'float'>
2      <class 'str'>
3      <class 'str'>
Name: sales, dtype: object

Unnamed: 0,customer,sales
0,A,1100.0
1,B,950.75
2,C,400.0
3,D,1250.35


0    <class 'float'>
1    <class 'float'>
2    <class 'float'>
3    <class 'float'>
Name: sales, dtype: object

## Выбрать срез столбцов из DataFrame

In [81]:
df = generate_sample_data().T
cols_str = list(map(str, list(df.columns))) # so that we can do df["0"] as string for the example
df.columns = cols_str

# Using pandas concatenation
# if you are ever confused about axis = 1 or axis = 0, just put axis = "columns" or axis = "rows"
pd.concat([df.loc[:, "0":"2"], df.loc[:, "6":"10"], df.loc[:, "16":"19"]], axis = "columns") # ------------------> here we are selecting columns converted to strings

# Using lists
# please ntoe that df.columns is a series with index, so we are using index to filter # -------------------------> here we are selecting the index of columns
df[list(df.columns[0:3]) + list(df.columns[6:11]) + list(df.columns[16:20])]

# Using numpy
df.iloc[:, np.r_[0:3, 6:11, 16:20]] # probably the most beautiful solution

Unnamed: 0,0,1,2,6,7,8,9,10,16,17,18,19
A,16,12,2,12,19,12,13,11,7,8,6,16
B,5,18,5,2,17,17,17,11,1,3,2,12
C,2,8,12,17,6,12,5,5,15,16,1,3
D,1,11,16,15,8,10,6,16,5,17,10,11
E,13,9,17,16,6,6,11,4,3,6,16,13
F,19,2,18,12,13,5,7,16,10,12,10,3
G,1,4,12,6,7,14,19,19,8,2,10,8


Unnamed: 0,0,1,2,6,7,8,9,10,16,17,18,19
A,16,12,2,12,19,12,13,11,7,8,6,16
B,5,18,5,2,17,17,17,11,1,3,2,12
C,2,8,12,17,6,12,5,5,15,16,1,3
D,1,11,16,15,8,10,6,16,5,17,10,11
E,13,9,17,16,6,6,11,4,3,6,16,13
F,19,2,18,12,13,5,7,16,10,12,10,3
G,1,4,12,6,7,14,19,19,8,2,10,8


Unnamed: 0,0,1,2,6,7,8,9,10,16,17,18,19
A,16,12,2,12,19,12,13,11,7,8,6,16
B,5,18,5,2,17,17,17,11,1,3,2,12
C,2,8,12,17,6,12,5,5,15,16,1,3
D,1,11,16,15,8,10,6,16,5,17,10,11
E,13,9,17,16,6,6,11,4,3,6,16,13
F,19,2,18,12,13,5,7,16,10,12,10,3
G,1,4,12,6,7,14,19,19,8,2,10,8


## Извлечение столбца из DF и сохранение его в отдельную Series

In [201]:
df = pd.read_csv("IMDB-Movie-Data.csv")
df.head()

meta = df.pop("Metascore").to_frame()
df.head()
meta.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions)
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02


Unnamed: 0,Metascore
0,76.0
1,65.0
2,62.0
3,59.0
4,40.0


## Преобразование столбца с непрерывной последовательностью в категорию (использование cut(), qcut())

In [205]:
df = pd.read_csv("IMDB-Movie-Data.csv")
df.head()

df['cut_hend_bins'] = pd.cut(df['Metascore'], bins=[0, 25, 50, 75, 99])
df['qcut_auto_3bins'] = pd.qcut(df['Metascore'], q=3)
df['cut_group_bins'] = pd.qcut(df['Metascore'], q=4, labels=['awful', 'bad', 'average', 'good'])

df.sample(10)

df['cut_hend_bins'].value_counts()
df['qcut_auto_3bins'].value_counts()
df['cut_group_bins'].value_counts()


Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,cut_hend_bins,qcut_auto_3bins,cut_group_bins
212,213,Transformers,"Action,Adventure,Sci-Fi",An ancient struggle between two Cybertronian r...,Michael Bay,"Shia LaBeouf, Megan Fox, Josh Duhamel, Tyrese ...",2007,144,7.1,531112,318.76,61.0,"(50.0, 75.0]","(51.0, 68.0]",average
370,371,Sleight,"Action,Drama,Sci-Fi",A young street magician (Jacob Latimore) is le...,J.D. Dillard,"Jacob Latimore, Seychelle Gabriel, Dulé Hill, ...",2016,89,6.0,702,3.85,62.0,"(50.0, 75.0]","(51.0, 68.0]",average
857,858,Freedom Writers,"Biography,Crime,Drama",A young teacher inspires her class of at-risk ...,Richard LaGravenese,"Hilary Swank, Imelda Staunton, Patrick Dempsey...",2007,123,7.5,55648,36.58,64.0,"(50.0, 75.0]","(51.0, 68.0]",average
905,906,In Dubious Battle,Drama,An activist gets caught up in the labor moveme...,James Franco,"Nat Wolff, James Franco, Vincent D'Onofrio, Se...",2016,110,6.2,1455,,43.0,"(25.0, 50.0]","(10.999, 51.0]",awful
129,130,The Revenant,"Adventure,Drama,Thriller",A frontiersman on a fur trading expedition in ...,Alejandro González Iñárritu,"Leonardo DiCaprio, Tom Hardy, Will Poulter, Do...",2015,156,8.0,499424,183.64,76.0,"(75.0, 99.0]","(68.0, 100.0]",good
654,655,The Levelling,Drama,"Somerset, October 2014. When Clover Catto (Ell...",Hope Dickson Leach,"Ellie Kendrick, David Troughton, Jack Holden,J...",2016,83,6.4,482,,82.0,"(75.0, 99.0]","(68.0, 100.0]",good
456,457,The Proposal,"Comedy,Drama,Romance",A pushy boss forces her young assistant to mar...,Anne Fletcher,"Sandra Bullock, Ryan Reynolds, Mary Steenburge...",2009,108,6.7,241709,163.95,48.0,"(25.0, 50.0]","(10.999, 51.0]",bad
401,402,The Black Room,Horror,PAUL and JENNIFER HEMDALE have just moved into...,Rolfe Kanefsky,"Natasha Henstridge, Lukas Hassel, Lin Shaye,Do...",2016,91,3.9,240,,71.0,"(50.0, 75.0]","(68.0, 100.0]",average
143,144,Room,Drama,A young boy is raised within the confines of a...,Lenny Abrahamson,"Brie Larson, Jacob Tremblay, Sean Bridgers,Wen...",2015,118,8.2,224132,14.68,86.0,"(75.0, 99.0]","(68.0, 100.0]",good
42,43,Don't Fuck in the Woods,Horror,A group of friends are going on a camping trip...,Shawn Burkett,"Brittany Blanton, Ayse Howard, Roman Jossart,N...",2016,73,2.7,496,,,,,


(50, 75]    453
(25, 50]    278
(75, 99]    182
(0, 25]      22
Name: cut_hend_bins, dtype: int64

(51.0, 68.0]      325
(10.999, 51.0]    322
(68.0, 100.0]     289
Name: qcut_auto_3bins, dtype: int64

awful      250
average    246
good       222
bad        218
Name: cut_group_bins, dtype: int64

# Отображение и форматирование

## Настройте форматирование так, чтобы большие числа не отображались в научной нотации

In [371]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [367]:
df

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,7,0.582,чиьфс,кошка,чиьфс
1,5,0.401,ьюжыз,собака,кошка
2,4,0.913,шцйжы,ЯБЛОКО,0.913
3,5,0.516,хцпхг,АПЕЛЬСИН,5
4,5,0.419,иьчщд,Дерево,ЯБЛОКО
5,2,0.577,жеикф,Машина,7
6,1,0.224,шмимо,Car,АПЕЛЬСИН
7,2,0.222,елчжз,House,4
8,4,0.653,пнфкц,Дом,АПЕЛЬСИН
9,6,0.972,ытлфь,кошка,ьюжыз


In [379]:
pd.options.display.float_format = '{:,.2f}'.format
# pd.options.display.float_format = None

In [380]:
df

Unnamed: 0,int_col,num_col,chr_col,word_col,mix_col
0,7,0.58,чиьфс,кошка,чиьфс
1,5,0.4,ьюжыз,собака,кошка
2,4,0.91,шцйжы,ЯБЛОКО,0.91
3,5,0.52,хцпхг,АПЕЛЬСИН,5
4,5,0.42,иьчщд,Дерево,ЯБЛОКО
5,2,0.58,жеикф,Машина,7
6,1,0.22,шмимо,Car,АПЕЛЬСИН
7,2,0.22,елчжз,House,4
8,4,0.65,пнфкц,Дом,АПЕЛЬСИН
9,6,0.97,ытлфь,кошка,ьюжыз


## Отображение только нескольких строк в DataFrame

In [71]:
print("This df occupies way too much space")
df = generate_sample_data()
df

print("using set_option to save some screen space")
pd.set_option("display.max_rows", 6)
df

print("use reset_option all to reset to default")
pd.reset_option("all")

This df occupies way too much space


Unnamed: 0,A,B,C,D,E,F,G
0,16,3,16,16,14,14,4
1,15,9,6,6,9,18,1
2,14,4,6,16,19,6,13
3,13,14,10,1,19,14,16
4,2,14,16,6,14,10,15
5,14,7,17,2,8,7,17
6,4,5,16,16,2,6,10
7,3,13,5,6,2,6,13
8,4,16,7,12,14,10,7
9,9,14,10,17,7,10,5


using set_option to save some screen space


Unnamed: 0,A,B,C,D,E,F,G
0,16,3,16,16,14,14,4
1,15,9,6,6,9,18,1
2,14,4,6,16,19,6,13
...,...,...,...,...,...,...,...
17,1,12,18,18,18,8,19
18,7,17,9,1,2,14,5
19,18,4,8,11,4,2,10


use reset_option all to reset to default

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.

Matplotlib is building the font cache; this may take a moment.
