# 10 minutes of pandas through library documentation


In [50]:
# Import the libraries used throughout this notebook: NumPy and pandas
import numpy as np
import pandas as pd
import seaborn as sns

Creating a Series by passing a list of values, letting pandas create a default integer index:

In [51]:
# `np` is NumPy ("numerical Python"); it has to be imported because the
# missing (empty) entry in the list below is written as np.nan.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Creating a DataFrame by passing a NumPy array, with a datetime index using date_range() and labeled columns:

In [52]:
# Ten consecutive calendar days starting on 2023-07-01 (daily frequency).
dates = pd.date_range(start="20230701", periods=10, freq="D")
dates


DatetimeIndex(['2023-07-01', '2023-07-02', '2023-07-03', '2023-07-04',
               '2023-07-05', '2023-07-06', '2023-07-07', '2023-07-08',
               '2023-07-09', '2023-07-10'],
              dtype='datetime64[ns]', freq='D')

In [53]:
# 10 x 4 frame of standard-normal samples, indexed by `dates`, columns A-D.
# A seeded Generator is used instead of the unseeded np.random.randn so that
# "Restart Kernel -> Run All" produces the same numbers on every run.
df = pd.DataFrame(
    np.random.default_rng(seed=42).standard_normal((10, 4)),
    index=dates,
    columns=list("ABCD"),
)

df

Unnamed: 0,A,B,C,D
2023-07-01,-2.121743,-1.739813,-1.206788,-0.498751
2023-07-02,-0.558504,0.628472,-0.116214,-0.199517
2023-07-03,1.125481,2.158547,1.300937,-0.999693
2023-07-04,-0.304673,-2.274526,1.303609,0.180047
2023-07-05,-0.000126,0.191834,0.807161,-0.882589
2023-07-06,0.24355,1.748937,1.922367,0.384036
2023-07-07,0.18905,0.332699,-0.650573,-0.305385
2023-07-08,-0.289029,-0.931905,0.220069,-1.184878
2023-07-09,0.799072,-1.534048,-0.255728,-0.735724
2023-07-10,1.279878,-1.681069,-0.171558,-2.411666


In [54]:
# Dictionary
# A dictionary literal uses curly braces; each key is separated from its value
# by a colon, and the key/value pairs are separated by commas.
# The variable is deliberately not called `dict`: that name would shadow the
# built-in `dict` type for every cell that runs afterwards.

sample_dict = {"Ammar":6.0,"Farman":6.2,"Uzair":5.9}
sample_dict

{'Ammar': 6.0, 'Farman': 6.2, 'Uzair': 5.9}

Creating a DataFrame by passing a dictionary of objects that can be converted into a series-like structure:



In [55]:
# Build a DataFrame from a dictionary with six key/value pairs (columns A-F).
# The scalar values ("A", "B", "F") are repeated for every row; the Series,
# the NumPy array and the Categorical each supply four values, so the
# resulting frame has four rows.
df2 = pd.DataFrame(
    data={
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train"] * 2),
        "F": "foo",
    }
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [56]:
# Show the dtype pandas assigned to each column of df2.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [57]:
# Print a summary of df2: index type, non-null count and dtype per column,
# and memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   A       4 non-null      float64       
 1   B       4 non-null      datetime64[ns]
 2   C       4 non-null      float32       
 3   D       4 non-null      int32         
 4   E       4 non-null      category      
 5   F       4 non-null      object        
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int32(1), object(1)
memory usage: 288.0+ bytes


The columns of the resulting DataFrame have different dtypes:



In [58]:
# Same dtype listing as the earlier `df2.dtypes` cell, repeated here next to
# the explanatory text: every column of df2 has a different dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [59]:
# head() shows the first rows (5 by default); df2 only has 4 rows, so all of
# them appear.
df2.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [60]:
# tail() shows the last rows (5 by default); with only 4 rows this is again
# the whole frame.
df2.tail()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [61]:
# Row labels of df2 (the integers 0 to 3).
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [62]:
# Column labels of df2.
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [63]:
# Convert df2 to a NumPy array. The columns have different dtypes, so the
# result is a single array with dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [64]:
# First 5 rows of the date-indexed frame df.
df.head()

Unnamed: 0,A,B,C,D
2023-07-01,-2.121743,-1.739813,-1.206788,-0.498751
2023-07-02,-0.558504,0.628472,-0.116214,-0.199517
2023-07-03,1.125481,2.158547,1.300937,-0.999693
2023-07-04,-0.304673,-2.274526,1.303609,0.180047
2023-07-05,-0.000126,0.191834,0.807161,-0.882589


In [65]:
# Last 5 rows of df.
df.tail()

Unnamed: 0,A,B,C,D
2023-07-06,0.24355,1.748937,1.922367,0.384036
2023-07-07,0.18905,0.332699,-0.650573,-0.305385
2023-07-08,-0.289029,-0.931905,0.220069,-1.184878
2023-07-09,0.799072,-1.534048,-0.255728,-0.735724
2023-07-10,1.279878,-1.681069,-0.171558,-2.411666


In [66]:
# Row labels of df: the DatetimeIndex created with pd.date_range.
df.index

DatetimeIndex(['2023-07-01', '2023-07-02', '2023-07-03', '2023-07-04',
               '2023-07-05', '2023-07-06', '2023-07-07', '2023-07-08',
               '2023-07-09', '2023-07-10'],
              dtype='datetime64[ns]', freq='D')

In [67]:
# Print a summary of df: index range, non-null count and dtype per column,
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10 entries, 2023-07-01 to 2023-07-10
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       10 non-null     float64
 1   B       10 non-null     float64
 2   C       10 non-null     float64
 3   D       10 non-null     float64
dtypes: float64(4)
memory usage: 400.0 bytes


In [68]:
# Summary statistics for each column: count, mean, std, min, quartiles, max.
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.036295,-0.310087,0.315328,-0.665412
std,0.978393,1.547298,0.987751,0.793984
min,-2.121743,-2.274526,-1.206788,-2.411666
25%,-0.300762,-1.644313,-0.234686,-0.970417
50%,0.094462,-0.370036,0.051928,-0.617238
75%,0.660191,0.554529,1.177493,-0.225984
max,1.279878,2.158547,1.922367,0.384036


In [69]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [70]:
# Convert df to a NumPy array. All columns are float64, so the result is a
# plain float array; the index and column labels are not part of it.
df.to_numpy()

array([[-2.12174329e+00, -1.73981333e+00, -1.20678752e+00,
        -4.98751500e-01],
       [-5.58503712e-01,  6.28471984e-01, -1.16214106e-01,
        -1.99517000e-01],
       [ 1.12548094e+00,  2.15854739e+00,  1.30093657e+00,
        -9.99692776e-01],
       [-3.04673462e-01, -2.27452627e+00,  1.30360924e+00,
         1.80047444e-01],
       [-1.25968885e-04,  1.91833576e-01,  8.07161245e-01,
        -8.82588876e-01],
       [ 2.43549548e-01,  1.74893671e+00,  1.92236677e+00,
         3.84036013e-01],
       [ 1.89049551e-01,  3.32699258e-01, -6.50572699e-01,
        -3.05385154e-01],
       [-2.89028875e-01, -9.31904642e-01,  2.20069240e-01,
        -1.18487846e+00],
       [ 7.99071644e-01, -1.53404765e+00, -2.55728483e-01,
        -7.35723702e-01],
       [ 1.27987789e+00, -1.68106853e+00, -1.71557884e-01,
        -2.41166585e+00]])

In [71]:
# Transpose: the columns A-D become rows and the dates become columns.
df.T

Unnamed: 0,2023-07-01,2023-07-02,2023-07-03,2023-07-04,2023-07-05,2023-07-06,2023-07-07,2023-07-08,2023-07-09,2023-07-10
A,-2.121743,-0.558504,1.125481,-0.304673,-0.000126,0.24355,0.18905,-0.289029,0.799072,1.279878
B,-1.739813,0.628472,2.158547,-2.274526,0.191834,1.748937,0.332699,-0.931905,-1.534048,-1.681069
C,-1.206788,-0.116214,1.300937,1.303609,0.807161,1.922367,-0.650573,0.220069,-0.255728,-0.171558
D,-0.498751,-0.199517,-0.999693,0.180047,-0.882589,0.384036,-0.305385,-1.184878,-0.735724,-2.411666


In [72]:
# Display the whole frame (10 rows x 4 columns).
df

Unnamed: 0,A,B,C,D
2023-07-01,-2.121743,-1.739813,-1.206788,-0.498751
2023-07-02,-0.558504,0.628472,-0.116214,-0.199517
2023-07-03,1.125481,2.158547,1.300937,-0.999693
2023-07-04,-0.304673,-2.274526,1.303609,0.180047
2023-07-05,-0.000126,0.191834,0.807161,-0.882589
2023-07-06,0.24355,1.748937,1.922367,0.384036
2023-07-07,0.18905,0.332699,-0.650573,-0.305385
2023-07-08,-0.289029,-0.931905,0.220069,-1.184878
2023-07-09,0.799072,-1.534048,-0.255728,-0.735724
2023-07-10,1.279878,-1.681069,-0.171558,-2.411666


In [73]:
# sort_index orders by labels, not by values: axis=1 ("columns") reorders the
# columns, axis=0 ("index") would reorder the rows.
# Here the column labels are sorted in descending order: D, C, B, A.
df.sort_index(axis="columns", ascending=False)


Unnamed: 0,D,C,B,A
2023-07-01,-0.498751,-1.206788,-1.739813,-2.121743
2023-07-02,-0.199517,-0.116214,0.628472,-0.558504
2023-07-03,-0.999693,1.300937,2.158547,1.125481
2023-07-04,0.180047,1.303609,-2.274526,-0.304673
2023-07-05,-0.882589,0.807161,0.191834,-0.000126
2023-07-06,0.384036,1.922367,1.748937,0.24355
2023-07-07,-0.305385,-0.650573,0.332699,0.18905
2023-07-08,-1.184878,0.220069,-0.931905,-0.289029
2023-07-09,-0.735724,-0.255728,-1.534048,0.799072
2023-07-10,-2.411666,-0.171558,-1.681069,1.279878


In [74]:
# Sort the rows by the values in column B, smallest first. A new frame is
# returned; df itself keeps its original order.
df.sort_values(by=["B"], ascending=True)

# To sort by two columns, pass them as a list:
#     df.sort_values(by=["B", "A"])
# Rows are ordered by the first column; the second column only decides the
# order among rows whose first-column values are equal.

Unnamed: 0,A,B,C,D
2023-07-04,-0.304673,-2.274526,1.303609,0.180047
2023-07-01,-2.121743,-1.739813,-1.206788,-0.498751
2023-07-10,1.279878,-1.681069,-0.171558,-2.411666
2023-07-09,0.799072,-1.534048,-0.255728,-0.735724
2023-07-08,-0.289029,-0.931905,0.220069,-1.184878
2023-07-05,-0.000126,0.191834,0.807161,-0.882589
2023-07-07,0.18905,0.332699,-0.650573,-0.305385
2023-07-02,-0.558504,0.628472,-0.116214,-0.199517
2023-07-06,0.24355,1.748937,1.922367,0.384036
2023-07-03,1.125481,2.158547,1.300937,-0.999693


In [75]:
# Display df again: the sort calls above returned new frames, so df itself
# is still in its original date order.
df

Unnamed: 0,A,B,C,D
2023-07-01,-2.121743,-1.739813,-1.206788,-0.498751
2023-07-02,-0.558504,0.628472,-0.116214,-0.199517
2023-07-03,1.125481,2.158547,1.300937,-0.999693
2023-07-04,-0.304673,-2.274526,1.303609,0.180047
2023-07-05,-0.000126,0.191834,0.807161,-0.882589
2023-07-06,0.24355,1.748937,1.922367,0.384036
2023-07-07,0.18905,0.332699,-0.650573,-0.305385
2023-07-08,-0.289029,-0.931905,0.220069,-1.184878
2023-07-09,0.799072,-1.534048,-0.255728,-0.735724
2023-07-10,1.279878,-1.681069,-0.171558,-2.411666


#### DATA SLICING
We use `loc`, `iloc`, and plain `[]` indexing to slice data.

In [106]:
# Passing a list of column names inside [] selects those columns (all rows).
df[["A","B"]]

Unnamed: 0,A,B
2023-07-01,-2.121743,-1.739813
2023-07-02,-0.558504,0.628472
2023-07-03,1.125481,2.158547
2023-07-04,-0.304673,-2.274526
2023-07-05,-0.000126,0.191834
2023-07-06,0.24355,1.748937
2023-07-07,0.18905,0.332699
2023-07-08,-0.289029,-0.931905
2023-07-09,0.799072,-1.534048
2023-07-10,1.279878,-1.681069


In [107]:
# (number of rows, number of columns) of df.
df.shape

(10, 4)

In [108]:
# A plain slice inside [] selects a range of rows by position:
# dataframe[start_row:end_row], with end_row excluded.
df[0:4]  # rows at positions 0, 1, 2 and 3

Unnamed: 0,A,B,C,D
2023-07-01,-2.121743,-1.739813,-1.206788,-0.498751
2023-07-02,-0.558504,0.628472,-0.116214,-0.199517
2023-07-03,1.125481,2.158547,1.300937,-0.999693
2023-07-04,-0.304673,-2.274526,1.303609,0.180047


In [109]:
# iloc locates data by integer position, for rows and columns at once:
#     dataframe.iloc[row_start:row_stop, col_start:col_stop]
# Stop positions are excluded, so this is the first 5 rows of the first 2 columns.
df.iloc[:5, :2]

Unnamed: 0,A,B
2023-07-01,-2.121743,-1.739813
2023-07-02,-0.558504,0.628472
2023-07-03,1.125481,2.158547
2023-07-04,-0.304673,-2.274526
2023-07-05,-0.000126,0.191834


In [110]:
# List the column labels of df (these are the labels that .loc accepts).
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [111]:
# .loc locates data by label: ":" keeps every row, the list picks columns A and B.
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2023-07-01,-2.121743,-1.739813
2023-07-02,-0.558504,0.628472
2023-07-03,1.125481,2.158547
2023-07-04,-0.304673,-2.274526
2023-07-05,-0.000126,0.191834
2023-07-06,0.24355,1.748937
2023-07-07,0.18905,0.332699
2023-07-08,-0.289029,-0.931905
2023-07-09,0.799072,-1.534048
2023-07-10,1.279878,-1.681069


In [112]:
# iloc with a single integer returns the row at position 3 (the fourth row)
# as a Series whose index is the column labels.
df.iloc[3]

A   -0.304673
B   -2.274526
C    1.303609
D    0.180047
Name: 2023-07-04 00:00:00, dtype: float64

#### Data wrangling on the Titanic dataset

In [114]:
# Load seaborn's "titanic" example dataset into a DataFrame.
# `sns` (seaborn) is imported in the import cell at the top of the notebook,
# so that all dependencies are visible in one place.
# NOTE(review): per the seaborn docs, load_dataset reads from seaborn's online
# data repository (with a local cache), so the first run needs network access.
kashti = sns.load_dataset("titanic")
kashti

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [115]:
# (number of rows, number of columns) of the Titanic frame.
kashti.shape

(891, 15)

In [116]:
# Draw 100 rows at random. random_state fixes the selection so that the same
# rows are shown every time the notebook is re-run.
kashti.sample(100, random_state=42)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
338,1,3,male,45.0,0,0,8.0500,S,Third,man,True,,Southampton,yes,True
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
350,0,3,male,23.0,0,0,9.2250,S,Third,man,True,,Southampton,no,True
577,1,1,female,39.0,1,0,55.9000,S,First,woman,False,E,Southampton,yes,False
731,0,3,male,11.0,0,0,18.7875,C,Third,child,False,,Cherbourg,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,0,3,male,70.5,0,0,7.7500,Q,Third,man,True,,Queenstown,no,True
54,0,1,male,65.0,0,1,61.9792,C,First,man,True,B,Cherbourg,no,False
430,1,1,male,28.0,0,0,26.5500,S,First,man,True,C,Southampton,yes,True
609,1,1,female,40.0,0,0,153.4625,S,First,woman,False,C,Southampton,yes,True


In [122]:
# Keep only the passengers whose age is below 5. Rows with a missing age are
# left out as well, because a comparison against NaN is False.
kashti[kashti["age"].lt(5)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
16,0,3,male,2.0,4,1,29.125,Q,Third,child,False,,Queenstown,no,False
43,1,2,female,3.0,1,2,41.5792,C,Second,child,False,,Cherbourg,yes,False
63,0,3,male,4.0,3,2,27.9,S,Third,child,False,,Southampton,no,False
78,1,2,male,0.83,0,2,29.0,S,Second,child,False,,Southampton,yes,False
119,0,3,female,2.0,4,2,31.275,S,Third,child,False,,Southampton,no,False
164,0,3,male,1.0,4,1,39.6875,S,Third,child,False,,Southampton,no,False
171,0,3,male,4.0,4,1,29.125,Q,Third,child,False,,Queenstown,no,False
172,1,3,female,1.0,1,1,11.1333,S,Third,child,False,,Southampton,yes,False


In [123]:
# Shape of the same age filter: how many passengers are younger than 5, and
# how many columns each row has.
kashti[kashti["age"] < 5 ].shape

(40, 15)

In [124]:
# The same boolean-mask filter applied to another column: passengers whose
# fare is below 5.
kashti[kashti["fare"] < 5 ]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
179,0,3,male,36.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
263,0,1,male,40.0,0,0,0.0,S,First,man,True,B,Southampton,no,True
271,1,3,male,25.0,0,0,0.0,S,Third,man,True,,Southampton,yes,True
277,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
302,0,3,male,19.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
378,0,3,male,20.0,0,0,4.0125,C,Third,man,True,,Cherbourg,no,True
413,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
466,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
481,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
597,0,3,male,49.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
