In [1]:
import numpy as np
import pandas as pd

In [2]:
### Object Creation

## Create a Series by passing a list of values; pandas assigns a default integer index.

In [3]:
 # The NaN entry makes the whole Series float64; the index defaults to 0..5.
 s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [4]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
## Create a DataFrame by passing a NumPy array, with a datetime index and labeled columns.

In [6]:
# Six consecutive calendar days, starting on 2013-01-01 (daily frequency).
dates = pd.date_range(start='20130101', periods=6, freq='D')

In [7]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# Draw the 6x4 table of standard-normal samples from a seeded local generator so
# that "Restart Kernel -> Run All" reproduces the same frame every time; the
# previous unseeded np.random.randn call produced different numbers on each run.
# The saved outputs in this notebook came from an unseeded run and will be
# refreshed the next time the notebook is executed.
rng = np.random.default_rng(seed=0)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list('ABCD'))

In [9]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.925085,-0.62571,0.590613,0.566678
2013-01-02,0.234304,-0.039567,0.074545,-0.957846
2013-01-03,-1.133536,-0.35996,0.534297,0.875705
2013-01-04,0.293298,0.355639,-0.225137,1.242969
2013-01-05,1.963148,1.757177,1.447698,0.453496
2013-01-06,0.092451,1.280601,0.446767,-0.052941


In [10]:
## Create a DataFrame by passing a dict whose keys are the column labels and whose values are the column contents.

In [11]:
# Scalar values ('A', 'B', 'F') are broadcast against the four-element values
# ('C', 'D', 'E'), so the resulting frame has four rows.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.array([3, 3, 3, 3], dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

In [12]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [13]:
## The columns of the resulting DataFrame have different dtypes.

In [14]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [15]:
## In Jupyter, tab completion for column names (as well as public attributes) is automatically enabled. 
## Here’s a subset of the attributes that can be completed:

In [16]:
# Typing `df2.` and pressing <TAB> in Jupyter opens an interactive completion
# menu. That keystroke is not Python syntax, so running it as code raised a
# SyntaxError and stopped "Run All". An excerpt of what the menu offers is kept
# here as a comment (the exact entries depend on the installed pandas version):
#
#   df2.A                  df2.bool
#   df2.abs                df2.boxplot
#   df2.add                df2.C
#   df2.add_prefix         df2.clip
#   df2.add_suffix         df2.clip_lower
#   df2.align              df2.clip_upper
#   df2.all                df2.columns
#   df2.any                df2.combine
#   df2.append             df2.combine_first
#   df2.apply              df2.consolidate
#   df2.applymap
#   df2.D
#
# A runnable way to list public names on df2 (only the first ten are shown):
[name for name in dir(df2) if not name.startswith('_')][:10]

In [17]:
### Viewing Data

## Here is how to view the top and bottom rows of the frame:

In [18]:
# First five rows; n=5 is the default, spelled out for the reader.
df.head(n=5)

Unnamed: 0,A,B,C,D
2013-01-01,-0.925085,-0.62571,0.590613,0.566678
2013-01-02,0.234304,-0.039567,0.074545,-0.957846
2013-01-03,-1.133536,-0.35996,0.534297,0.875705
2013-01-04,0.293298,0.355639,-0.225137,1.242969
2013-01-05,1.963148,1.757177,1.447698,0.453496


In [19]:
# Last three rows of the frame.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2013-01-04,0.293298,0.355639,-0.225137,1.242969
2013-01-05,1.963148,1.757177,1.447698,0.453496
2013-01-06,0.092451,1.280601,0.446767,-0.052941


In [20]:
## Display the index and columns:

In [21]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [22]:
 df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [23]:
## Convert a DataFrame to a NumPy array with DataFrame.to_numpy(), e.g. df.to_numpy().

In [24]:
## NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column.
## When you call DataFrame.to_numpy(), pandas finds the NumPy dtype that can hold all of the dtypes in the DataFrame.
## This may end up being `object`, which requires casting every value to a Python object.
## For large or mixed-dtype frames (such as df2 above), this conversion can be very expensive in both time and memory.

In [25]:
## The describe() method shows a quick statistical summary of your data.

In [26]:
# Summary statistics per column; the quartiles listed are describe()'s defaults.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.08743,0.394696,0.478131,0.354677
std,1.104377,0.942553,0.568211,0.775127
min,-1.133536,-0.62571,-0.225137,-0.957846
25%,-0.670701,-0.279862,0.1676,0.073668
50%,0.163377,0.158036,0.490532,0.510087
75%,0.27855,1.04936,0.576534,0.798448
max,1.963148,1.757177,1.447698,1.242969


In [27]:
## Transpose your data

In [28]:
# Swap rows and columns: the dates become column labels, A-D become the index.
df.transpose()

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.925085,0.234304,-1.133536,0.293298,1.963148,0.092451
B,-0.62571,-0.039567,-0.35996,0.355639,1.757177,1.280601
C,0.590613,0.074545,0.534297,-0.225137,1.447698,0.446767
D,0.566678,-0.957846,0.875705,1.242969,0.453496,-0.052941


In [29]:
## We can also sort a DataFrame by the values in a specific column.

In [30]:
# Rows ordered by column 'B', smallest first; the result is displayed, not assigned.
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-01,-0.925085,-0.62571,0.590613,0.566678
2013-01-03,-1.133536,-0.35996,0.534297,0.875705
2013-01-02,0.234304,-0.039567,0.074545,-0.957846
2013-01-04,0.293298,0.355639,-0.225137,1.242969
2013-01-06,0.092451,1.280601,0.446767,-0.052941
2013-01-05,1.963148,1.757177,1.447698,0.453496
