In [1]:
import numpy as np
import pandas as pd

In [2]:
# Difference between Dataframes and Series


'''
Dataframe: Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns).
Arithmetic operations align on both row and column labels.
Can be thought of as a dict-like container for Series objects. The primary pandas data structure.

Series is the data structure for a single column of a DataFrame, not only conceptually, but literally, 
i.e. the data in a DataFrame is actually stored in memory as a collection of Series.

Analogously: We need both lists and matrices, because matrices are built with lists. Single-row matrices,
while equivalent to lists in functionality, still cannot exist without the list(s) they're composed of.
'''

In [11]:
# Build a DataFrame from a dict of Series. The two Series have different
# lengths; pandas aligns them on the union of their index labels, so row
# 'd' has no value in column 'one' and shows up as NaN.
d = {
    'one': pd.Series([1., 2., 3.], index=list('abc')),
    'two': pd.Series([1., 2., 3., 4.], index=list('abcd')),
}
df = pd.DataFrame(data=d)
df.head()

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [9]:
# A Series is a one-dimensional labelled array; here the labels are 'a'..'c'.
s = pd.Series(data=[1.0, 2.0, 3.0], index=list('abc'))
s.head()

a    1.0
b    2.0
c    3.0
dtype: float64

In [77]:
# Load the training data from the working directory and preview the first
# five rows. Note that this rebinds the name `df`.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [38]:
# `df.columns` is a pandas Index holding the column labels (not a plain list).
cols = df.columns
# Convert the Index into an ordinary Python list so it prints as one.
train_cols = cols.tolist()
print(train_cols)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [18]:
# Iterating over the column Index yields the column labels one at a time.
for col in cols:
    print(col)

PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked


In [21]:
# Get a row: convert the frame to a NumPy array and take the row at position 1.
df.to_numpy()[1]

array([2, 1, 1, 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
       'female', 38.0, 1, 0, 'PC 17599', 71.2833, 'C85', 'C'],
      dtype=object)

In [134]:
# Get values from a specific column.
# Bracket access works for every column label; attribute access (df.Cabin)
# only works when the label is a valid identifier that does not clash with
# an existing DataFrame attribute.
df['Cabin'].head(5)

0     NaN
1     C85
2     NaN
3    C123
4     NaN
Name: Cabin, dtype: object

In [32]:
# A column is selected by its label, the same way a value is looked up by
# key in a dictionary; the resulting Series is then converted to a list.
Cabin_list = df['Cabin'].to_list()

print(Cabin_list[0:5])

[nan, 'C85', nan, 'C123', nan]


In [35]:
# Both lists and arrays are used to store data in Python.
# Moreover, both data structures allow indexing, slicing, and iterating.
#
# to_numpy() converts the column to a NumPy array.
Cabin_array = df['Cabin'].to_numpy()
print(Cabin_array[:5])

[nan 'C85' nan 'C123' nan]


In [68]:
# Load the second CSV and list its column labels.
df_t = pd.read_csv(filepath_or_buffer='test_1.csv')
test_cols = df_t.columns.tolist()
print(test_cols)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'a']


In [69]:
# drop() returns a new DataFrame; df_t itself keeps column 'a'.
# axis=1 targets columns (axis=0 would target rows).
#
# To drop by column position instead of by column label, index into
# df.columns, e.g. to delete the last column:
#
#     df = df.drop(df.columns[[-1]], axis=1)
df_1 = df_t.drop('a', axis=1)

'Finally, to drop by column number instead of by column label, try this to delete,\ne.g. the 1st, 2nd and 4th columns:\n\ndf = df.drop(df.columns[[-1]], axis=1)'

In [70]:
# Show the column labels of the new frame as a plain list.
print(df_1.columns.tolist())

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [71]:
# Another method: `del` removes column 'a' from df_t itself, in place,
# so running this cell a second time raises KeyError.
del df_t['a']

In [72]:
# Show the column labels of df_t as a plain list.
print(df_t.columns.tolist())

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [78]:
# Drop several columns at once. Play with the last parameter (True/False):
# with inplace=False the result is returned as a new frame.
df.drop(columns=['PassengerId', 'Name', 'Age', 'Ticket'], inplace=False)

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,1,0,7.2500,,S
1,1,1,female,1,0,71.2833,C85,C
2,1,3,female,0,0,7.9250,,S
3,1,1,female,1,0,53.1000,C123,S
4,0,3,male,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,,S
887,1,1,female,0,0,30.0000,B42,S
888,0,3,female,1,2,23.4500,,S
889,1,1,male,0,0,30.0000,C148,C


In [79]:
# inplace=True overwrites our original data df, while inplace=False doesn't.
print(df.columns.tolist())

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [80]:
# Select items based on column values.
#
# The loc property is used to access a group of rows and columns by label(s)
# or a boolean array.
#
# Rows of a DataFrame can be selected on a particular column value using the
# '>', '==', '<=', '!=' operators.
#
# AND operator:
#     df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]
#
# OR operator:
#     df.loc[(df['column_name'] >= A) | (df['column_name'] <= B)]
#
# The selection is kept as the last expression of the cell so that the
# notebook displays it.
df.loc[df['Age'] < 10]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.00,3,1,349909,21.0750,,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.00,1,1,PP 9549,16.7000,G6,S
16,17,0,3,"Rice, Master. Eugene",male,2.00,4,1,382652,29.1250,,Q
24,25,0,3,"Palsson, Miss. Torborg Danira",female,8.00,3,1,349909,21.0750,,S
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.00,1,2,SC/Paris 2123,41.5792,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
827,828,1,2,"Mallet, Master. Andre",male,1.00,0,2,S.C./PARIS 2079,37.0042,,C
831,832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.7500,,S
850,851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4.00,4,2,347082,31.2750,,S
852,853,0,3,"Boulos, Miss. Nourelain",female,9.00,1,1,2678,15.2458,,C


In [120]:
# Keep rows whose Age is strictly greater than 10 and strictly less than 50.
midage = df.loc[df['Age'].gt(10) & df['Age'].lt(50)]
midage.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [121]:
# Narrow to two columns, then read the row at position 99 as a NumPy array.
selected = midage.loc[:, ['Pclass', 'Sex']]
selected.to_numpy()[99]

array([2, 'male'], dtype=object)

In [136]:
# Preparation for selecting the 100th, 105th, and 108th row of the DataFrame:
# renumber the rows from 0 (discarding the old labels) so that row labels
# and row positions coincide.
midage = midage.reset_index(drop=True)
midage.iloc[:3]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [137]:
# Label-based selection: rows first, then columns.
midage.loc[
    [99, 104, 107],              # row labels
    ['Pclass', 'Name', 'Sex'],   # column labels
]

Unnamed: 0,Pclass,Name,Sex
99,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male
104,3,"Corn, Mr. Harry",male
107,3,"Bengtsson, Mr. John Viktor",male


In [138]:
# Position-based selection: the same three rows, and the columns at
# positions 2 and 3 (the slice end, 4, is excluded).
midage.iloc[
    [99, 104, 107],
    slice(2, 4),
]

Unnamed: 0,Pclass,Name
99,2,"Navratil, Mr. Michel (""Louis M Hoffman"")"
104,3,"Corn, Mr. Harry"
107,3,"Bengtsson, Mr. John Viktor"


In [None]:
'''
.loc selects rows and columns by label (the values of the index), not by position.


The iloc indexer for Pandas Dataframe is used for integer-location based indexing / selection by position.
.iloc in pandas is used to select rows and columns by number, in the order that they appear in the data frame.
'''