<a href="https://colab.research.google.com/github/Rocude/LNN/blob/master/Pandas_basic_commands.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [3]:
# A Series built from a plain list of values; pandas supplies the default
# integer index. The np.nan entry stands in for a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [7]:
# First step of building a DataFrame from a NumPy array with a datetime index
# and labeled columns: date_range() produces the index, ten consecutive days
# starting on 2022-08-09.
dates = pd.date_range(start="2022-08-09", periods=10, freq="D")
dates

DatetimeIndex(['2022-08-09', '2022-08-10', '2022-08-11', '2022-08-12',
               '2022-08-13', '2022-08-14', '2022-08-15', '2022-08-16',
               '2022-08-17', '2022-08-18'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Fill a (len(dates) x 4) frame with standard-normal draws, indexed by `dates`
# and with columns A-D. A fixed-seed Generator replaces the unseeded global
# np.random.randn call so that Restart & Run All rebuilds the same values;
# re-run the notebook to refresh the saved outputs.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame(
    rng.standard_normal((len(dates), 4)),
    index=dates,
    columns=list("ABCD"),
)

In [10]:
# Display the whole frame.
# NOTE(review): this cell's execution count (10) is lower than that of the cell
# that builds df (12), and the values saved below differ from the ones shown by
# the later cells, so this output comes from an earlier df. Restart the kernel
# and Run All to refresh it.
df

Unnamed: 0,A,B,C,D
2022-08-09,-0.086284,0.194524,-1.277287,-1.784368
2022-08-10,0.830648,-0.248653,-1.230333,-0.536182
2022-08-11,-0.283224,1.043713,-0.418247,1.131099
2022-08-12,1.670535,0.558791,0.106229,-0.511696
2022-08-13,-0.870388,0.181151,0.944775,0.798712
2022-08-14,-1.121611,0.146802,0.451308,0.453372
2022-08-15,-0.147673,1.618519,0.756265,-0.671491
2022-08-16,0.044496,1.303921,0.107043,-0.627409
2022-08-17,-0.724745,-1.115668,-0.473726,-0.252882
2022-08-18,-0.313653,0.487235,0.333924,-1.605313


In [15]:
# A DataFrame can also be built from a dictionary of objects that can each be
# converted into a series-like structure; every key becomes one column.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [18]:
# The columns of the resulting DataFrame have different dtypes; .dtypes lists
# one entry per column.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [26]:
# DataFrame.head() and DataFrame.tail() show the top and bottom rows of the
# frame respectively. head() returns five rows by default; the count is
# spelled out here.
df.head(n=5)



Unnamed: 0,A,B,C,D
2022-08-09,1.022741,0.866192,0.620858,1.020391
2022-08-10,0.857151,0.437666,-0.201628,0.658945
2022-08-11,-1.047418,-0.730522,0.425552,-1.577962
2022-08-12,-0.04216,-0.623398,0.089711,-1.275799
2022-08-13,-0.794555,-0.144574,0.835999,0.300366


In [27]:
# Bottom of the frame: the last three rows.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2022-08-16,1.028098,-1.179174,-0.094274,1.85121
2022-08-17,0.715416,0.470018,-1.527618,-0.553225
2022-08-18,-0.163618,0.027532,0.186601,-1.008409


In [29]:
# DataFrame.index holds the row labels (here the DatetimeIndex passed at
# construction); DataFrame.columns, shown in the next cell, holds the column
# labels.
df.index


DatetimeIndex(['2022-08-09', '2022-08-10', '2022-08-11', '2022-08-12',
               '2022-08-13', '2022-08-14', '2022-08-15', '2022-08-16',
               '2022-08-17', '2022-08-18'],
              dtype='datetime64[ns]', freq='D')

In [31]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [36]:
# DataFrame.to_numpy() gives a NumPy representation of the underlying data.
# Note that this can be an expensive operation when your DataFrame has columns
# with different data types, which comes down to a fundamental difference
# between pandas and NumPy: NumPy arrays have one dtype for the entire array,
# while pandas DataFrames have one dtype per column. When you call
# DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of
# the dtypes in the DataFrame. This may end up being object, which requires
# casting every value to a Python object.
#
# For df, our DataFrame of all floating-point values, DataFrame.to_numpy()
# is fast and doesn't require copying data:
dfnp = df.to_numpy()
dfnp

array([[ 1.02274098e+00,  8.66191521e-01,  6.20857867e-01,
         1.02039130e+00],
       [ 8.57151193e-01,  4.37665867e-01, -2.01628201e-01,
         6.58944803e-01],
       [-1.04741812e+00, -7.30521541e-01,  4.25552121e-01,
        -1.57796162e+00],
       [-4.21600180e-02, -6.23397522e-01,  8.97114774e-02,
        -1.27579922e+00],
       [-7.94555346e-01, -1.44574160e-01,  8.35998916e-01,
         3.00365920e-01],
       [-1.23211692e-03,  7.16910175e-01,  8.53876905e-01,
         1.61521588e-01],
       [ 1.15429401e-01, -1.02836493e+00, -6.62582132e-01,
        -3.03413408e-01],
       [ 1.02809771e+00, -1.17917428e+00, -9.42741501e-02,
         1.85121010e+00],
       [ 7.15415823e-01,  4.70018206e-01, -1.52761755e+00,
        -5.53224567e-01],
       [-1.63617968e-01,  2.75319928e-02,  1.86601435e-01,
        -1.00840923e+00]])

In [38]:
# For df2, the DataFrame with multiple dtypes, DataFrame.to_numpy() is
# relatively expensive. Note that DataFrame.to_numpy() does not include the
# index or column labels in the output.
dfn2np = df2.to_numpy()
dfn2np

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [42]:
# describe() shows a quick statistical summary of your data (count, mean, std,
# min, quartiles and max per column):
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.168985,-0.118771,0.05265,-0.072637
std,0.732896,0.739939,0.732718,1.08005
min,-1.047418,-1.179174,-1.527618,-1.577962
25%,-0.133253,-0.703741,-0.17479,-0.894613
50%,0.057099,-0.058521,0.138156,-0.070946
75%,0.821717,0.46193,0.572031,0.5693
max,1.028098,0.866192,0.853877,1.85121


In [43]:
# Same summary for the mixed-dtype frame. In the saved output only columns
# A, C and D are summarised.
df2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [45]:
# Transposing your data: rows and columns swap places. The untransposed frame
# is printed first for comparison.
print(df)
df.transpose()

                   A         B         C         D
2022-08-09  1.022741  0.866192  0.620858  1.020391
2022-08-10  0.857151  0.437666 -0.201628  0.658945
2022-08-11 -1.047418 -0.730522  0.425552 -1.577962
2022-08-12 -0.042160 -0.623398  0.089711 -1.275799
2022-08-13 -0.794555 -0.144574  0.835999  0.300366
2022-08-14 -0.001232  0.716910  0.853877  0.161522
2022-08-15  0.115429 -1.028365 -0.662582 -0.303413
2022-08-16  1.028098 -1.179174 -0.094274  1.851210
2022-08-17  0.715416  0.470018 -1.527618 -0.553225
2022-08-18 -0.163618  0.027532  0.186601 -1.008409


Unnamed: 0,2022-08-09,2022-08-10,2022-08-11,2022-08-12,2022-08-13,2022-08-14,2022-08-15,2022-08-16,2022-08-17,2022-08-18
A,1.022741,0.857151,-1.047418,-0.04216,-0.794555,-0.001232,0.115429,1.028098,0.715416,-0.163618
B,0.866192,0.437666,-0.730522,-0.623398,-0.144574,0.71691,-1.028365,-1.179174,0.470018,0.027532
C,0.620858,-0.201628,0.425552,0.089711,0.835999,0.853877,-0.662582,-0.094274,-1.527618,0.186601
D,1.020391,0.658945,-1.577962,-1.275799,0.300366,0.161522,-0.303413,1.85121,-0.553225,-1.008409


In [48]:
# DataFrame.sort_index() sorts by the labels along an axis; here the column
# labels, in descending order.
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2022-08-09,1.020391,0.620858,0.866192,1.022741
2022-08-10,0.658945,-0.201628,0.437666,0.857151
2022-08-11,-1.577962,0.425552,-0.730522,-1.047418
2022-08-12,-1.275799,0.089711,-0.623398,-0.04216
2022-08-13,0.300366,0.835999,-0.144574,-0.794555
2022-08-14,0.161522,0.853877,0.71691,-0.001232
2022-08-15,-0.303413,-0.662582,-1.028365,0.115429
2022-08-16,1.85121,-0.094274,-1.179174,1.028098
2022-08-17,-0.553225,-1.527618,0.470018,0.715416
2022-08-18,-1.008409,0.186601,0.027532,-0.163618


In [51]:
# DataFrame.sort_values() sorts rows by the values of a column; here column B,
# ascending. The unsorted frame is printed first for comparison.
print(df)
df.sort_values(by="B", ascending=True)

                   A         B         C         D
2022-08-09  1.022741  0.866192  0.620858  1.020391
2022-08-10  0.857151  0.437666 -0.201628  0.658945
2022-08-11 -1.047418 -0.730522  0.425552 -1.577962
2022-08-12 -0.042160 -0.623398  0.089711 -1.275799
2022-08-13 -0.794555 -0.144574  0.835999  0.300366
2022-08-14 -0.001232  0.716910  0.853877  0.161522
2022-08-15  0.115429 -1.028365 -0.662582 -0.303413
2022-08-16  1.028098 -1.179174 -0.094274  1.851210
2022-08-17  0.715416  0.470018 -1.527618 -0.553225
2022-08-18 -0.163618  0.027532  0.186601 -1.008409


Unnamed: 0,A,B,C,D
2022-08-16,1.028098,-1.179174,-0.094274,1.85121
2022-08-15,0.115429,-1.028365,-0.662582,-0.303413
2022-08-11,-1.047418,-0.730522,0.425552,-1.577962
2022-08-12,-0.04216,-0.623398,0.089711,-1.275799
2022-08-13,-0.794555,-0.144574,0.835999,0.300366
2022-08-18,-0.163618,0.027532,0.186601,-1.008409
2022-08-10,0.857151,0.437666,-0.201628,0.658945
2022-08-17,0.715416,0.470018,-1.527618,-0.553225
2022-08-14,-0.001232,0.71691,0.853877,0.161522
2022-08-09,1.022741,0.866192,0.620858,1.020391
