# Data Wrangling
## Sohail Ahmed 3/6/2023
### email ahmedsohailkhan14@gmail.com

In [1]:
# Import libraries
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# A Series with one missing value; the NaN forces the integers to float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Six consecutive calendar days starting 2013-01-01, used as a row index.
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Build a 6x4 frame of standard-normal values indexed by `dates`, columns A-D.
# The values come from a locally seeded generator so that Restart & Run All
# reproduces the same frame; the unseeded np.random.randn call produced
# different numbers on every run.
# NOTE(review): outputs rendered before this change came from an unseeded run
# and will not match until the notebook is re-executed.
df = pd.DataFrame(
    np.random.default_rng(seed=42).standard_normal((6, 4)),
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,0.006931,0.045917,-0.436885,0.304655
2013-01-02,0.534468,0.219083,2.085644,0.018998
2013-01-03,0.334302,-0.16257,0.421786,-1.349789
2013-01-04,-0.603029,-0.122976,-0.112188,-1.352666
2013-01-05,-0.047621,0.989726,1.039723,-1.432109
2013-01-06,0.354557,0.269105,0.143727,0.54264


In [5]:
# A small frame in which every column has a different dtype. Scalar entries
# (A, B, F) are broadcast to the length of the array-like columns (4 rows).
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([3] * 4, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
# Show the dtype of each column of df2.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:
# Print a concise summary of df2: index range, per-column non-null counts and
# dtypes, and memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   A       4 non-null      float64       
 1   B       4 non-null      datetime64[ns]
 2   C       4 non-null      float32       
 3   D       4 non-null      int32         
 4   E       4 non-null      category      
 5   F       4 non-null      object        
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int32(1), object(1)
memory usage: 288.0+ bytes


In [8]:
# Convert the frame to a NumPy array. Because the columns have mixed dtypes,
# the resulting array has dtype=object (see the output below).
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [9]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.096601,0.206381,0.523634,-0.544712
std,0.40819,0.421444,0.915182,0.928433
min,-0.603029,-0.16257,-0.436885,-1.432109
25%,-0.033983,-0.080753,-0.048209,-1.351947
50%,0.170616,0.1325,0.282756,-0.665396
75%,0.349494,0.256599,0.885239,0.233241
max,0.534468,0.989726,2.085644,0.54264


In [10]:
# Same summary statistics, transposed so each original column becomes a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A,6.0,0.096601,0.40819,-0.603029,-0.033983,0.170616,0.349494,0.534468
B,6.0,0.206381,0.421444,-0.16257,-0.080753,0.1325,0.256599,0.989726
C,6.0,0.523634,0.915182,-0.436885,-0.048209,0.282756,0.885239,2.085644
D,6.0,-0.544712,0.928433,-1.432109,-1.351947,-0.665396,0.233241,0.54264


In [11]:
# Sort by column labels in descending order (D, C, B, A).
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.304655,-0.436885,0.045917,0.006931
2013-01-02,0.018998,2.085644,0.219083,0.534468
2013-01-03,-1.349789,0.421786,-0.16257,0.334302
2013-01-04,-1.352666,-0.112188,-0.122976,-0.603029
2013-01-05,-1.432109,1.039723,0.989726,-0.047621
2013-01-06,0.54264,0.143727,0.269105,0.354557


In [12]:
# Sort by row labels (the date index) in descending order, newest first.
df.sort_index(axis="index", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,0.354557,0.269105,0.143727,0.54264
2013-01-05,-0.047621,0.989726,1.039723,-1.432109
2013-01-04,-0.603029,-0.122976,-0.112188,-1.352666
2013-01-03,0.334302,-0.16257,0.421786,-1.349789
2013-01-02,0.534468,0.219083,2.085644,0.018998
2013-01-01,0.006931,0.045917,-0.436885,0.304655


In [13]:
# Sort the rows by the values in column B (ascending by default).
df.sort_values("B")

Unnamed: 0,A,B,C,D
2013-01-03,0.334302,-0.16257,0.421786,-1.349789
2013-01-04,-0.603029,-0.122976,-0.112188,-1.352666
2013-01-01,0.006931,0.045917,-0.436885,0.304655
2013-01-02,0.534468,0.219083,2.085644,0.018998
2013-01-06,0.354557,0.269105,0.143727,0.54264
2013-01-05,-0.047621,0.989726,1.039723,-1.432109


In [14]:
# Select a subset of columns by passing a list of labels.
df[["A", "B"]]

Unnamed: 0,A,B
2013-01-01,0.006931,0.045917
2013-01-02,0.534468,0.219083
2013-01-03,0.334302,-0.16257
2013-01-04,-0.603029,-0.122976
2013-01-05,-0.047621,0.989726
2013-01-06,0.354557,0.269105


In [15]:
# Select the first three rows by position (explicit positional indexer).
df.iloc[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.006931,0.045917,-0.436885,0.304655
2013-01-02,0.534468,0.219083,2.085644,0.018998
2013-01-03,0.334302,-0.16257,0.421786,-1.349789


In [16]:
# Positional indexing on both axes: first two rows, first three columns.
df.iloc[:2, :3]

Unnamed: 0,A,B,C
2013-01-01,0.006931,0.045917,-0.436885
2013-01-02,0.534468,0.219083,2.085644


In [17]:
# Label-based indexing: all rows, columns A and B only.
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,0.006931,0.045917
2013-01-02,0.534468,0.219083
2013-01-03,0.334302,-0.16257
2013-01-04,-0.603029,-0.122976
2013-01-05,-0.047621,0.989726
2013-01-06,0.354557,0.269105


In [18]:
# Load seaborn's "titanic" example dataset into a DataFrame and display it.
titanic = sns.load_dataset(name="titanic")
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [19]:
# Draw 100 random rows. A fixed random_state makes the sample reproducible on
# re-run; without it a different subset is drawn each time.
# NOTE(review): the output rendered before this change came from an unseeded
# draw and will not match until the cell is re-executed.
titanic.sample(n=100, random_state=42)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
586,0,2,male,47.0,0,0,15.0000,S,Second,man,True,,Southampton,no,True
67,0,3,male,19.0,0,0,8.1583,S,Third,man,True,,Southampton,no,True
667,0,3,male,,0,0,7.7750,S,Third,man,True,,Southampton,no,True
482,0,3,male,50.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
396,0,3,female,31.0,0,0,7.8542,S,Third,woman,False,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,0,3,male,21.0,0,0,7.9250,S,Third,man,True,,Southampton,no,True
341,1,1,female,24.0,3,2,263.0000,S,First,woman,False,C,Southampton,yes,False
261,1,3,male,3.0,4,2,31.3875,S,Third,child,False,,Southampton,yes,False
483,1,3,female,63.0,0,0,9.5875,S,Third,woman,False,,Southampton,yes,True


In [25]:
# Boolean indexing: keep only the passengers whose fare is below 5.
titanic.loc[titanic["fare"] < 5]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
179,0,3,male,36.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
263,0,1,male,40.0,0,0,0.0,S,First,man,True,B,Southampton,no,True
271,1,3,male,25.0,0,0,0.0,S,Third,man,True,,Southampton,yes,True
277,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
302,0,3,male,19.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
378,0,3,male,20.0,0,0,4.0125,C,Third,man,True,,Cherbourg,no,True
413,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
466,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
481,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
597,0,3,male,49.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
