# Data wrangling and data visualization using pandas and numpy

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Create a Series by passing a list of values; pandas adds a default integer index.
# np.nan marks a missing entry. NaN is a float, so the whole Series is stored as float64.

s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Build the row labels for the DataFrame created in the next cell:
# date_range() returns a DatetimeIndex of 6 consecutive days starting on 2013-01-01.
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Build a 6x4 DataFrame of standard-normal random numbers, indexed by `dates`
# (the DatetimeIndex created in the previous cell) with columns labelled A-D.
# A seeded generator makes the values identical on every Restart & Run All.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.267878,-1.16264,-0.726115,-1.591991
2013-01-02,0.902103,1.078976,-0.885143,1.569185
2013-01-03,-0.562004,0.538388,1.859656,-1.273774
2013-01-04,-0.373705,-1.516164,-0.995393,-0.418315
2013-01-05,-0.49932,-0.026354,0.23565,0.410337
2013-01-06,0.897114,0.967683,0.103096,0.216516


In [8]:
# Dictionary: maps each key (here a string) to its value, written inside curly brackets.
# A colon separates a key from its value; commas separate the key/value pairs.
# The variable is deliberately not named `dict`: that would shadow Python's
# built-in dict type for every later cell in the notebook.

example_dict = {"Usman": 5.8, "Akif": 5.9}
example_dict

{'Usman': 5.8, 'Akif': 5.9}

In [6]:
# Build a DataFrame from a dict: each key becomes a column name and each value
# supplies that column's data. Scalars are repeated for every row; the
# length-4 entries fix the number of rows at 4.
df2 = pd.DataFrame(
    {
        "A": 1.0,  # scalar float, repeated in every row (float64 column)
        "B": pd.Timestamp("20130102"),  # single timestamp, repeated in every row
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),  # float32 Series indexed 0-3
        "D": np.array([3] * 4, dtype="int32"),  # int32 array holding four 3s
        "E": pd.Categorical(["test", "train", "test", "train"]),  # categorical column
        "F": "foo",  # scalar string, repeated in every row (object column)
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
# The columns of a DataFrame can each have a different dtype:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

# Viewing data

In [10]:
# Use DataFrame.head() and DataFrame.tail() to view the top and bottom rows of the frame respectively.
# head() returns the first 5 rows by default; df2 has only 4 rows, so the whole frame is shown.
df2.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [11]:
# tail() returns the last 5 rows by default; df2 has only 4 rows, so the output matches head().
df2.tail()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [12]:
# Display DataFrame.index: the row labels, here the DatetimeIndex passed as index=dates.

df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [13]:
# Display DataFrame.columns: the column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [16]:
# DataFrame.to_numpy() returns a NumPy representation of the underlying data;
# the index and column labels are not included.

df.to_numpy()


array([[-0.26787771, -1.16263992, -0.72611471, -1.59199102],
       [ 0.90210266,  1.07897631, -0.88514298,  1.56918471],
       [-0.56200401,  0.53838778,  1.85965612, -1.27377369],
       [-0.37370489, -1.51616361, -0.99539326, -0.41831507],
       [-0.49931981, -0.02635368,  0.23564982,  0.41033742],
       [ 0.89711363,  0.96768279,  0.10309572,  0.21651589]])

In [17]:
# describe() shows a quick statistical summary of your data, per column:
# count, mean, std, min, the 25%/50%/75% quartiles, and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.016052,-0.020018,-0.068042,-0.18134
std,0.691902,1.099021,1.076869,1.167237
min,-0.562004,-1.516164,-0.995393,-1.591991
25%,-0.467916,-0.878568,-0.845386,-1.059909
50%,-0.320791,0.256017,-0.311509,-0.1009
75%,0.605866,0.860359,0.202511,0.361882
max,0.902103,1.078976,1.859656,1.569185


In [18]:
# Transposing your data: rows become columns and columns become rows.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.267878,0.902103,-0.562004,-0.373705,-0.49932,0.897114
B,-1.16264,1.078976,0.538388,-1.516164,-0.026354,0.967683
C,-0.726115,-0.885143,1.859656,-0.995393,0.23565,0.103096
D,-1.591991,1.569185,-1.273774,-0.418315,0.410337,0.216516


In [21]:
# DataFrame.sort_index() sorts by the labels along an axis and returns a new frame:
# axis=1 sorts the column labels, axis=0 (the default) sorts the row labels.
# ascending=False means descending order.

df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.591991,-0.726115,-1.16264,-0.267878
2013-01-02,1.569185,-0.885143,1.078976,0.902103
2013-01-03,-1.273774,1.859656,0.538388,-0.562004
2013-01-04,-0.418315,-0.995393,-1.516164,-0.373705
2013-01-05,0.410337,0.23565,-0.026354,-0.49932
2013-01-06,0.216516,0.103096,0.967683,0.897114


In [22]:
# DataFrame.sort_values() sorts the rows by the values of a column (here "B"), ascending by default:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-04,-0.373705,-1.516164,-0.995393,-0.418315
2013-01-01,-0.267878,-1.16264,-0.726115,-1.591991
2013-01-05,-0.49932,-0.026354,0.23565,0.410337
2013-01-03,-0.562004,0.538388,1.859656,-1.273774
2013-01-06,0.897114,0.967683,0.103096,0.216516
2013-01-02,0.902103,1.078976,-0.885143,1.569185


# Getitem ([ ])

In [23]:
# For a DataFrame, passing a single label selects a column and yields a Series, equivalent to df.A:
df["A"]

2013-01-01   -0.267878
2013-01-02    0.902103
2013-01-03   -0.562004
2013-01-04   -0.373705
2013-01-05   -0.499320
2013-01-06    0.897114
Freq: D, Name: A, dtype: float64

In [24]:
# Passing a list of labels selects several columns at once and yields a DataFrame:
df[["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-0.267878,-1.16264
2013-01-02,0.902103,1.078976
2013-01-03,-0.562004,0.538388
2013-01-04,-0.373705,-1.516164
2013-01-05,-0.49932,-0.026354
2013-01-06,0.897114,0.967683


In [28]:
# For a DataFrame, passing a slice (start:stop) selects rows; 0:3 gives the first three rows (stop excluded):
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.267878,-1.16264,-0.726115,-1.591991
2013-01-02,0.902103,1.078976,-0.885143,1.569185
2013-01-03,-0.562004,0.538388,1.859656,-1.273774


In [29]:
# To select rows and columns at the same time by integer position, use .iloc[rows, columns].
# Integer slices act like NumPy/Python slices: rows 0-2 and columns 0-1 (stop positions excluded).
df.iloc[0:3, 0:2]

Unnamed: 0,A,B
2013-01-01,-0.267878,-1.16264
2013-01-02,0.902103,1.078976
2013-01-03,-0.562004,0.538388


# Selection by label

In [32]:
# See more in "Selection by Label": label-based access uses DataFrame.loc or DataFrame.at.
# Selecting a row matching a label: dates[0] is the first index label (2013-01-01),
# and the row comes back as a Series indexed by the column names.

df.loc[dates[0]]

A   -0.267878
B   -1.162640
C   -0.726115
D   -1.591991
Name: 2013-01-01 00:00:00, dtype: float64

In [33]:
# Selecting all rows (:) together with a chosen list of column labels:
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-0.267878,-1.16264
2013-01-02,0.902103,1.078976
2013-01-03,-0.562004,0.538388
2013-01-04,-0.373705,-1.516164
2013-01-05,-0.49932,-0.026354
2013-01-06,0.897114,0.967683


# Selection by position

In [34]:
# Select via the position of the passed integer: position 3 is the fourth row (2013-01-04),
# returned as a Series indexed by the column names.
df.iloc[3]

A   -0.373705
B   -1.516164
C   -0.995393
D   -0.418315
Name: 2013-01-04 00:00:00, dtype: float64

In [36]:
# Integer slices act like NumPy/Python slices: rows 0-2 and columns 0-1 (stop positions excluded).
df.iloc[0:3, 0:2]

Unnamed: 0,A,B
2013-01-01,-0.267878,-1.16264
2013-01-02,0.902103,1.078976
2013-01-03,-0.562004,0.538388


In [38]:
# Exploratory data analysis (EDA) of the Titanic dataset.
# seaborn (imported as `sns` in the imports cell at the top of the notebook)
# provides load_dataset() for its example datasets; the result is a pandas DataFrame.
kashti = sns.load_dataset('titanic')
kashti

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [40]:
# sample(n) returns n randomly chosen rows (instances) of the dataset; the original index labels are kept.
# random_state fixes the draw so the same 100 rows are selected on every run.
kashti.sample(100, random_state=42)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
246,0,3,female,25.0,0,0,7.7750,S,Third,woman,False,,Southampton,no,True
177,0,1,female,50.0,0,0,28.7125,C,First,woman,False,C,Cherbourg,no,True
758,0,3,male,34.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
315,1,3,female,26.0,0,0,7.8542,S,Third,woman,False,,Southampton,yes,True
641,1,1,female,24.0,0,0,69.3000,C,First,woman,False,B,Cherbourg,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414,1,3,male,44.0,0,0,7.9250,S,Third,man,True,,Southampton,yes,True
682,0,3,male,20.0,0,0,9.2250,S,Third,man,True,,Southampton,no,True
175,0,3,male,18.0,1,1,7.8542,S,Third,man,True,,Southampton,no,False
191,0,2,male,19.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True


# Boolean indexing

In [42]:
# Select the rows where the passenger's age is less than 1.
# kashti["age"] < 1 builds a boolean Series; only rows where it is True are kept,
# so rows with a missing age (NaN compares as False) are excluded.
kashti[kashti["age"] < 1]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
78,1,2,male,0.83,0,2,29.0,S,Second,child,False,,Southampton,yes,False
305,1,1,male,0.92,1,2,151.55,S,First,child,False,C,Southampton,yes,False
469,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
644,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
755,1,2,male,0.67,1,1,14.5,S,Second,child,False,,Southampton,yes,False
803,1,3,male,0.42,0,1,8.5167,C,Third,child,False,,Cherbourg,yes,False
831,1,2,male,0.83,1,1,18.75,S,Second,child,False,,Southampton,yes,False
