In [1]:
import pandas as pd

In [2]:
# Three player records; every dictionary uses the same keys, which later
# become the column names of the DataFrame.
cnt1 = {
    'name': 'Jack',
    'points': 200,
    'age': 12,
}
cnt2 = {
    'name': 'Peter',
    'points': 157,
    'age': 15,
}
cnt3 = {
    'name': 'sam',
    'points': 435,
    'age': 16,
}

In [3]:
df=pd.DataFrame([cnt1,cnt2,cnt3],index=['palyer1','player2','player3'])

In [4]:
df

Unnamed: 0,name,points,age
palyer1,Jack,200,12
player2,Peter,157,15
player3,sam,435,16


In [6]:
pd.Series([cnt1,cnt2])

0     {'name': 'Jack', 'points': 200, 'age': 12}
1    {'name': 'Peter', 'points': 157, 'age': 15}
dtype: object

In [20]:
cnt1=pd.Series(cnt1)
cnt2=pd.Series(cnt2)
cnt3=pd.Series(cnt3)

In [8]:
df=pd.DataFrame([cnt1,cnt2,cnt3])

In [9]:
df

Unnamed: 0,name,points,age
0,Jack,200,12
1,Peter,157,15
2,sam,435,16


In [10]:
# Like a Series, the DataFrame object is indexed. Here I'll use a group of Series, where each Series
# represents a row of data. Just like the Series function, we can pass in our individual items
# in an array, and we can pass in our index values as a second argument
df=pd.DataFrame([cnt1,cnt2,cnt3],index=['player1','player2','player3'])

In [11]:
df

Unnamed: 0,name,points,age
player1,Jack,200,12
player2,Peter,157,15
player3,sam,435,16


In [12]:
# You'll notice here that Jupyter creates a nice bit of HTML to render the results of the
# DataFrame. So we have the index, which is the leftmost column and holds the player labels, and
# then we have the rows of data, where each column has a header taken from the keys of our initial
# cnt dictionaries

In [15]:
# An alternative method is that you could use a list of dictionaries, where each dictionary 
# represents a row of data.

# Each dictionary is one row; its keys become the column names.
cnt = [
    {'name': 'Jack', 'points': 200, 'age': 12},
    {'name': 'Peter', 'points': 157, 'age': 15},
    {'name': 'sam', 'points': 435, 'age': 16},
]
# The index labels are matched to the rows in order.
df = pd.DataFrame(
    data=cnt,
    index=['player1', 'player2', 'player3'],
)

In [16]:
df

Unnamed: 0,name,points,age
player1,Jack,200,12
player2,Peter,157,15
player3,sam,435,16


In [17]:
# Similar to the Series, we can extract data using the .iloc and .loc attributes. Because the
# DataFrame is two-dimensional, passing a single value to the .loc indexing operator will return
# a Series if there's only one row to return.

df.loc['player1']

name      Jack
points     200
age         12
Name: player1, dtype: object

In [18]:
type(df.loc['player1'])

pandas.core.series.Series

In [25]:
# NOTE(review): unlike the cnt1-cnt3 records, 'points' is a string ('212') here, and cnt4 is
# not used by any later cell visible in this notebook -- confirm whether an int was intended.
cnt4={'name':'jack','points':'212','age':12}


In [29]:
df=pd.DataFrame([cnt1,cnt2,cnt3],index=['player1','player2','player2'])

In [30]:
df

Unnamed: 0,name,points,age
player1,Jack,200,12
player2,Peter,157,15
player2,sam,435,16


In [31]:
# It's important to remember that the indices and column names along either axis, horizontal or
# vertical, could be non-unique. In this example, we see two records for player2 as different rows.
# If we use a single value with the DataFrame .loc attribute, multiple rows of the DataFrame will
# be returned, not as a new Series, but as a new DataFrame.

df.loc['player2']

Unnamed: 0,name,points,age
player2,Peter,157,15
player2,sam,435,16


In [32]:
type(df.loc['player2'])

pandas.core.frame.DataFrame

In [33]:
# One of the powers of the pandas DataFrame is that you can quickly select data based on multiple axes.
# For instance, if you wanted to just list the names for player2, you would supply two
# parameters to .loc, one being the row index and the other being the column name.

# For instance, if we are only interested in player2's names

df.loc['player2','name']

player2    Peter
player2      sam
Name: name, dtype: object

In [34]:
# Remember, just like the Series, the pandas developers have implemented this using the indexing
# operator and not as parameters to a function.

# What would we do if we just wanted to select a single column though? Well, there are a few
# mechanisms. Firstly, we could transpose the matrix. This pivots all of the rows into columns
# and all of the columns into rows, and is done with the T attribute

df.T

Unnamed: 0,player1,player2,player2
name,Jack,Peter,sam
points,200,157,435
age,12,15,16


In [35]:
df

Unnamed: 0,name,points,age
player1,Jack,200,12
player2,Peter,157,15
player2,sam,435,16


In [36]:
df.T.loc['name']

player1     Jack
player2    Peter
player2      sam
Name: name, dtype: object

In [37]:
# pandas reserves the indexing operator
# directly on the DataFrame for column selection. In a pandas DataFrame, columns always have a name.
# So this selection is always label based, and is not as confusing as it was when using the square
# bracket operator on the Series objects. For those familiar with relational databases, this operator
# is analogous to column projection.

df['name']

player1     Jack
player2    Peter
player2      sam
Name: name, dtype: object

In [38]:
# You get a KeyError if you try to use .loc with a column name as its only argument,
# because a single value passed to .loc is looked up among the row labels
df.loc['name']

KeyError: 'name'

In [40]:
# Note too that the result of a single column projection is a Series object

type(df['name'])

pandas.core.series.Series

In [42]:
df.loc['player2'] #DataFrame object


Unnamed: 0,name,points,age
player2,Peter,157,15
player2,sam,435,16


In [43]:
df.loc['player2']['name'] #Series object

player2    Peter
player2      sam
Name: name, dtype: object

In [44]:
# Chaining, by indexing on the return type of another index, can come with some costs and is
# best avoided if you can use another approach. In particular, chaining tends to cause Pandas 
# to return a copy of the DataFrame instead of a view on the DataFrame. 
# For selecting data, this is not a big deal, though it might be slower than necessary. 
# If you are changing data though this is an important distinction and can be a source of error.

In [47]:
# Here's another approach. As we saw, .loc does row selection, and it can take two parameters,
# the row index and the list of column names. The .loc attribute also supports slicing.

# If we wanted to select all rows, we can use a colon to indicate a full slice from beginning to end.
# This is just like slicing a list in Python. Then we can add the column name as the
# second parameter as a string. If we wanted to include multiple columns, we could do so in a list,
# and pandas will bring back only the columns we have asked for.

# Here's an example, where we ask for all the names and points for all players using the .loc operator.
df.loc[:,['name','points']]

Unnamed: 0,name,points
player1,Jack,200
player2,Peter,157
player2,sam,435


In [48]:
# It's easy to delete data in Series and DataFrames, and we can use the drop function to do so. 
# This function takes a single parameter, which is the index or row label, to drop. This is another 
# tricky place for new users -- the drop function doesn't change the DataFrame by default! Instead,
# the drop function returns to you a copy of the DataFrame with the given rows removed.
df.drop('player1')

Unnamed: 0,name,points,age
player2,Peter,157,15
player2,sam,435,16


In [49]:
df

Unnamed: 0,name,points,age
player1,Jack,200,12
player2,Peter,157,15
player2,sam,435,16


In [50]:
# Drop has two interesting optional parameters. The first is called inplace, and if it's
# set to True, the DataFrame will be updated in place, instead of a copy being returned.
# The second parameter is axis, which selects the axis to drop along. By default, this value is 0,
# indicating the row axis. But you could change it to 1 if you want to drop a column.

df_copy=df.copy()

df_copy.drop('name',inplace=True,axis=1)

In [51]:
df_copy


Unnamed: 0,points,age
player1,200,12
player2,157,15
player2,435,16


In [52]:
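# There is a second way to drop a column, and that's directly through the use of the indexing
# operator, using the del keyword. This way of dropping data, however, takes effect immediately:
# it modifies the DataFrame in place and does not return a copy.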

del df_copy['points']

In [53]:
df_copy

Unnamed: 0,age
player1,12
player2,15
player2,16


In [54]:
#adding a new column to the DataFrame is as easy as assigning it to some value using
# the indexing operator. 
df['rankings']=None

In [55]:
df

Unnamed: 0,name,points,age,rankings
player1,Jack,200,12,
player2,Peter,157,15,
player2,sam,435,16,
