In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plot
import matplotlib.pyplot as plt
%matplotlib inline

In [26]:
# Three bears, four measurements each (labelled further down as
# leg, hair, tail, belly for mom, baby and dad respectively).
bear_family = [
    [100, 5, 20, 80],
    [50, 2.5, 10, 40],
    [110, 6, 22, 80]
]

In [27]:
# Convert the nested list into a 2-D NumPy array; because of the 2.5 entry
# every value is stored as a float.
bear_family_numpy = np.array(bear_family)
bear_family_numpy

array([[100. ,   5. ,  20. ,  80. ],
       [ 50. ,   2.5,  10. ,  40. ],
       [110. ,   6. ,  22. ,  80. ]])

In [28]:
# [row, column] indexing: row 2 (the third row), column 0 (the first column).
bear_family_numpy[2,0]

110.0

In [29]:
# Label the rows with the bear names and the columns with the measured body parts.
bear_family_df = pd.DataFrame(
    data=bear_family,
    index=['mom', 'baby', 'dad'],
    columns=['leg', 'hair', 'tail', 'belly'],
)
bear_family_df

Unnamed: 0,leg,hair,tail,belly
mom,100,5.0,20,80
baby,50,2.5,10,40
dad,110,6.0,22,80


In [30]:
# Attribute-style access to the 'belly' column (returned as a Series).
bear_family_df.belly

mom     80
baby    40
dad     80
Name: belly, dtype: int64

In [31]:
# Label-based row lookup: the row whose index label is 'dad'.
bear_family_df.loc['dad']

leg      110.0
hair       6.0
tail      22.0
belly     80.0
Name: dad, dtype: float64

In [32]:
# Position-based row lookup: the row at position 2 (the third row, 'dad').
bear_family_df.iloc[2]

leg      110.0
hair       6.0
tail      22.0
belly     80.0
Name: dad, dtype: float64

In [33]:
# Boolean Series: True for the rows whose belly equals 80.
mask = bear_family_df['belly'] == 80
# Indexing the frame with the mask keeps only the True rows.
bear_family_df[mask]

Unnamed: 0,leg,hair,tail,belly
mom,100,5.0,20,80
dad,110,6.0,22,80


In [34]:
# Two extra bears sharing the family's columns; no index is passed, so the
# rows get the default labels 0 and 1.
some_bears = pd.DataFrame(
    data=[
        [105, 4, 19, 80],
        [100, 5, 20, 80],
    ],
    columns=bear_family_df.columns,
)
some_bears

Unnamed: 0,leg,hair,tail,belly
0,105,4,19,80
1,100,5,20,80


In [35]:
# Stack the two frames row-wise, keeping each frame's own index labels.
# DataFrame.append is deprecated (it emits a FutureWarning) and was removed in
# pandas 2.0; pd.concat is the supported equivalent.
all_bears = pd.concat([bear_family_df, some_bears])
all_bears

  all_bears = bear_family_df.append(some_bears)


Unnamed: 0,leg,hair,tail,belly
mom,100,5.0,20,80
baby,50,2.5,10,40
dad,110,6.0,22,80
0,105,4.0,19,80
1,100,5.0,20,80


In [36]:
all_bears.drop_duplicates() # Returns the unique rows but does not modify all_bears
# Rebind the name to keep the de-duplicated frame (row 1 repeats mom's values and is dropped).
all_bears = all_bears.drop_duplicates()
all_bears

Unnamed: 0,leg,hair,tail,belly
mom,100,5.0,20,80
baby,50,2.5,10,40
dad,110,6.0,22,80
0,105,4.0,19,80


In [37]:
# Add a string column in place: one value per row, in index order (mom, baby, dad).
bear_family_df['sex'] = ['f','f','m']
bear_family_df

Unnamed: 0,leg,hair,tail,belly,sex
mom,100,5.0,20,80,f
baby,50,2.5,10,40,f
dad,110,6.0,22,80,m


Some Detailed Pandas samples

In [38]:
import numpy as np
# A single bear's four measurements as a 1-D NumPy array.
a_bear_numpy = np.array([100,5,20,80])
a_bear_numpy
# expected display: array([100,   5,  20,  80])

array([100,   5,  20,  80])

In [39]:
# One NumPy array per bear, in the order mom, baby, dad.
bear_family = [
    np.array(measurements)
    for measurements in (
        [100, 5, 20, 80],    # Bear mom
        [50, 2.5, 10, 40],   # Bear baby
        [110, 6, 22, 80],    # Bear dad
    )
]

In [40]:
# The same measurements as plain nested lists, one row per bear.
bear_family = [
    [100, 5, 20, 80],    # Bear mom
    [50, 2.5, 10, 40],   # Bear baby
    [110, 6, 22, 80],    # Bear dad
]

# A single 2-D array built from the nested list (3 bears x 4 measurements).
bear_family_numpy = np.array(bear_family)
bear_family_numpy

array([[100. ,   5. ,  20. ,  80. ],
       [ 50. ,   2.5,  10. ,  40. ],
       [110. ,   6. ,  22. ,  80. ]])

⚠️ What if I want to know the leg sizes of my entire bear family?
Easy-peasy! In `bear_family_numpy[2, 0]`, just replace the row index 2 (which corresponded to the dad bear) with the character `:`, meaning that I want ALL bears!

In [41]:
# ':' selects every row, 0 selects the first column.
bear_family_numpy[:, 0]

array([100.,  50., 110.])

In [42]:
# Build the frame from the NumPy array this time; since the array is all-float,
# every column is displayed as float (100.0 instead of 100).
bear_family_df = pd.DataFrame(bear_family_numpy, 
                index = ['mom', 'baby', 'dad'],
                columns = ['leg', 'hair', 'tail', 'belly']
            )

bear_family_df

Unnamed: 0,leg,hair,tail,belly
mom,100.0,5.0,20.0,80.0
baby,50.0,2.5,10.0,40.0
dad,110.0,6.0,22.0,80.0


In [43]:
# Attribute access and bracket access select the same column;
# only the last expression of a cell is displayed.
bear_family_df.belly
bear_family_df["belly"]

mom     80.0
baby    40.0
dad     80.0
Name: belly, dtype: float64

In [44]:
# iterrows() yields one (index label, row as a Series) pair per bear.
for ind_row, content_row in bear_family_df.iterrows():
    print("Here is %s bear:" % ind_row)
    print(content_row)
    print("--------------------")

Here is mom bear:
leg      100.0
hair       5.0
tail      20.0
belly     80.0
Name: mom, dtype: float64
--------------------
Here is baby bear:
leg      50.0
hair      2.5
tail     10.0
belly    40.0
Name: baby, dtype: float64
--------------------
Here is dad bear:
leg      110.0
hair       6.0
tail      22.0
belly     80.0
Name: dad, dtype: float64
--------------------


In [45]:
# Two ways to fetch the same row; only the last expression of a cell is displayed.
bear_family_df.iloc[2] 
# iloc is the positional index: 2 is the third row
bear_family_df.loc["dad"] 
# loc is the label-based index: "dad" is the row label

leg      110.0
hair       6.0
tail      22.0
belly     80.0
Name: dad, dtype: float64

In [46]:
# Boolean Series: True for the rows whose belly equals 80.
mask = bear_family_df["belly"] == 80
bears_80 = bear_family_df[mask]
# Or more commonly, build the mask inline (same result as the line above):
bears_80 = bear_family_df[bear_family_df["belly"] == 80]

bears_80

Unnamed: 0,leg,hair,tail,belly
mom,100.0,5.0,20.0,80.0
dad,110.0,6.0,22.0,80.0


In [47]:
# '~' inverts the boolean mask, keeping the rows whose belly is NOT 80.
bear_family_df[~mask]

Unnamed: 0,leg,hair,tail,belly
baby,50.0,2.5,10.0,40.0


In [48]:
# get names of columns (not displayed: only the last expression of a cell is shown)
bear_family_df.columns

# create a new column, containing strings (one value per row, in index order)
bear_family_df["sex"] = ["f", "f", "m"]
# mom and baby are female, dad is male

# get the number of rows:
len(bear_family_df)

# get the distinct values of a column (use .nunique() to get how many there are)
bear_family_df.belly.unique()
bear_family_df

Unnamed: 0,leg,hair,tail,belly,sex
mom,100.0,5.0,20.0,80.0,f
baby,50.0,2.5,10.0,40.0,f
dad,110.0,6.0,22.0,80.0,m


In [49]:
import os
# Show the current working directory of the kernel.
os.getcwd()

'c:\\Users\\vtest\\OneDrive\\Desktop\\Machine Learning\\BootCamp Samples'

Manipulate data contained in Data Frames

In [50]:
#import numpy as np
#import pandas as pd
#import seaborn as sns

Titanic Exercise

In [51]:
# Load the 'titanic' example dataset through seaborn.
titanic = sns.load_dataset('titanic')

In [52]:
# Preview the first rows of the dataset.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [53]:
# Distinct values of the age column; the printed array includes nan as one of them.
a = titanic.age.unique()
print(a)
# Number of distinct values, nan included.
len(a)

[22.   38.   26.   35.     nan 54.    2.   27.   14.    4.   58.   20.
 39.   55.   31.   34.   15.   28.    8.   19.   40.   66.   42.   21.
 18.    3.    7.   49.   29.   65.   28.5   5.   11.   45.   17.   32.
 16.   25.    0.83 30.   33.   23.   24.   46.   59.   71.   37.   47.
 14.5  70.5  32.5  12.    9.   36.5  51.   55.5  40.5  44.    1.   61.
 56.   50.   36.   45.5  20.5  62.   41.   52.   63.   23.5   0.92 43.
 60.   10.   64.   13.   48.    0.75 53.   57.   80.   70.   24.5   6.
  0.67 30.5   0.42 34.5  74.  ]


89

In [54]:
# Summary statistics of the numeric columns; a count below 891 (see age) reveals missing values.
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [55]:
# dropna() returns a new DataFrame without the rows that contain a NaN.
# Rebinding `titanic` means the unfiltered data is no longer available to the cells below.
titanic = titanic.dropna() # This returns the modified version of the DataFrame
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [56]:
# Same summary after dropna(): every column now has the same count (182 rows remain).
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,182.0,182.0,182.0,182.0,182.0,182.0
mean,0.675824,1.192308,35.623187,0.467033,0.478022,78.919735
std,0.469357,0.516411,15.671615,0.645007,0.755869,76.490774
min,0.0,1.0,0.92,0.0,0.0,0.0
25%,0.0,1.0,24.0,0.0,0.0,29.7
50%,1.0,1.0,36.0,0.0,0.0,57.0
75%,1.0,1.0,47.75,1.0,1.0,90.0
max,1.0,3.0,80.0,3.0,4.0,512.3292


In [57]:
# Sum the 0/1 `survived` flag, i.e. count the survivors, for each sex (rows)
# and passenger class (columns).
titanic.pivot_table(
    values='survived',
    index='sex',
    columns='class',
    aggfunc='sum',
)

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,71,8,3
male,35,4,2


In [58]:
# Repeat of the post-dropna summary: pivot_table did not modify `titanic`.
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,182.0,182.0,182.0,182.0,182.0,182.0
mean,0.675824,1.192308,35.623187,0.467033,0.478022,78.919735
std,0.469357,0.516411,15.671615,0.645007,0.755869,76.490774
min,0.0,1.0,0.92,0.0,0.0,0.0
25%,0.0,1.0,24.0,0.0,0.0,29.7
50%,1.0,1.0,36.0,0.0,0.0,57.0
75%,1.0,1.0,47.75,1.0,1.0,90.0
max,1.0,3.0,80.0,3.0,4.0,512.3292


The first way to handle missing values is to replace NaN with other values. This operation is performed using the fillna function. Let's look at its application on the age column:

In [59]:
# First ten ages; the index has gaps because rows were removed by the earlier dropna().
titanic.age.head(10)

1     38.0
3     35.0
6     54.0
10     4.0
11    58.0
21    34.0
23    28.0
27    19.0
52    49.0
54    65.0
Name: age, dtype: float64

The following call returns a DataFrame where all NaN in the age column have been replaced by 0 (only the age column is displayed here):

In [60]:
# Replace NaN in the age column with 0. `titanic` was already passed through
# dropna() above, so no NaN is left and the displayed ages are unchanged.
titanic.fillna(value={"age": 0}).age.head(10)

1     38.0
3     35.0
6     54.0
10     4.0
11    58.0
21    34.0
23    28.0
27    19.0
52    49.0
54    65.0
Name: age, dtype: float64

We could also have filled the NaN with the previous values:

In [61]:
# Forward-fill: each NaN takes the previous valid value in its column.
# ffill() replaces fillna(method="pad"), whose `method` argument is deprecated.
# `titanic` was already passed through dropna() above, so the ages are unchanged here.
titanic.ffill().age.head(10)

1     38.0
3     35.0
6     54.0
10     4.0
11    58.0
21    34.0
23    28.0
27    19.0
52    49.0
54    65.0
Name: age, dtype: float64

Secondly, the dropna function lets you delete axes (columns or rows) that contain NaN. By default, it deletes the relevant rows:

In [62]:
# Default dropna() removes the rows that contain a NaN; `titanic` has none left
# after the earlier dropna(), so the same rows are shown.
titanic.dropna().head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True
21,1,2,male,34.0,0,0,13.0,S,Second,man,True,D,Southampton,yes,True
23,1,1,male,28.0,0,0,35.5,S,First,man,True,A,Southampton,yes,True
27,0,1,male,19.0,3,2,263.0,S,First,man,True,C,Southampton,no,False
52,1,1,female,49.0,1,0,76.7292,C,First,woman,False,D,Cherbourg,yes,False
54,0,1,male,65.0,0,1,61.9792,C,First,man,True,B,Cherbourg,no,False


But we can also delete the columns that contain NaN instead of the rows!

In [63]:
# axis="columns" drops the columns that contain a NaN instead of the rows;
# every column survives here because `titanic` no longer holds any NaN.
titanic.dropna(axis="columns").head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [64]:
# Count of survivors (sum of the 0/1 `survived` flag) per sex and class.
titanic.pivot_table('survived', index='sex', columns='class', aggfunc="sum")

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,71,8,3
male,35,4,2


In [65]:
# Make sure no NaN rows remain. Rebinding the result (rather than using
# inplace=True) matches how dropna is used earlier in the notebook and keeps
# the cell safe to re-run.
titanic = titanic.dropna()

# Bin the ages into two intervals: (0, 18] and (18, 80].
age = pd.cut(titanic['age'], bins=[0, 18, 80])

# Mean of `survived` (pivot_table's default aggregation), i.e. the survival
# rate, per sex and age bin (rows) and passenger class (columns).
titanic.pivot_table(values='survived', index=['sex', age], columns='class')

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.5
female,"(18, 80]",0.968254,0.875,0.666667
male,"(0, 18]",0.8,1.0,1.0
male,"(18, 80]",0.397436,0.333333,0.25


In [66]:
# A small Series with explicit string labels instead of the default 0..n-1 index.
ser = pd.Series(
    data=[1, 2, 3],
    index=['a', 'b', 'c'],
)
ser

a    1
b    2
c    3
dtype: int64

In [67]:
# Three records, each with key 'a' holding i and key 'b' holding twice that value.
data = [dict(a=i, b=2 * i) for i in range(3)]
data

[{'a': 0, 'b': 0}, {'a': 1, 'b': 2}, {'a': 2, 'b': 4}]

In [68]:
# Each dict becomes a row; the dict keys become the column names.
df = pd.DataFrame(data)
df

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [69]:
# Only the last expression of a cell is displayed, so just the iloc result shows.
ser.loc['b'] # label-based lookup: the element whose index label is 'b'
ser.iloc[-1] # position-based lookup: -1 selects the last element

3