In [1]:
#first thing is first, Imports
import pandas as pd
import numpy as np

In [2]:
#We shall create a small dictionary filled with different types of data, so we can manipulate it later
test_data = dict( 
    A = np.random.rand(3),
    B = 1,
    C = 'foo',
    D = pd.Timestamp('20010102'),
    E = pd.Series([1.0]*3).astype('float32'),
    F = False,
    G = pd.Series([1]*3,dtype='int8')
)

In [3]:
#Using the data above, we will create a Dataframe, which will hold all the data
my_data = pd.DataFrame(test_data)
my_data

Unnamed: 0,A,B,C,D,E,F,G
0,0.881441,1,foo,2001-01-02,1.0,False,1
1,0.836522,1,foo,2001-01-02,1.0,False,1
2,0.198255,1,foo,2001-01-02,1.0,False,1


In [4]:
my_data.head()

Unnamed: 0,A,B,C,D,E,F,G
0,0.881441,1,foo,2001-01-02,1.0,False,1
1,0.836522,1,foo,2001-01-02,1.0,False,1
2,0.198255,1,foo,2001-01-02,1.0,False,1


In [5]:
#Now we shall examine the datatypes in the columns we created. To do so, we use info()
my_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
A    3 non-null float64
B    3 non-null int64
C    3 non-null object
D    3 non-null datetime64[ns]
E    3 non-null float32
F    3 non-null bool
G    3 non-null int8
dtypes: bool(1), datetime64[ns](1), float32(1), float64(1), int64(1), int8(1), object(1)
memory usage: 242.0+ bytes


In [6]:
#To determine how many columns contain which data types, we use value counts
my_data.get_dtype_counts()
#This will show that each column has a different datatype or a different memory form of a datatype

  


float64           1
float32           1
int64             1
int8              1
datetime64[ns]    1
bool              1
object            1
dtype: int64

In [7]:
#use .astype() to change the datatype of a column.
my_data["B"] = my_data["B"].astype("object")

In [8]:
my_data["B"]

0    1
1    1
2    1
Name: B, dtype: object

In [9]:
#if you create a series, Usually panda tries not to lose any data. So it has a certain heirarchy
#Having any float on your data will make the whole series a float type
#Having any string on your data will make the whole series an Object type

#Creating a series with values 0-4
ser1 = pd.Series(list(range(0,5)))

In [10]:
#To add a column to a series, choose the column index, and assign a number to it
ser1["im here"] = "700"
ser1

0            0
1            1
2            2
3            3
4            4
im here    700
dtype: object

In [11]:
#To assign a datatype, use assignment operator and astype
ser1 = pd.Series(ser1).astype("int")

# Applying Functions to DataFrames
* `df.apply()`
* `df.applymap()`

In [12]:
my_data.dtypes

A           float64
B            object
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [13]:
#Lets first grab all the int columns from our original data
my_data_int = my_data.select_dtypes(exclude=["object","datetime","bool"])
my_data_int

Unnamed: 0,A,E,G
0,0.881441,1.0,1
1,0.836522,1.0,1
2,0.198255,1.0,1


In [14]:
#Now we can use apply to find the mean of all the columns provided.
my_data_int.apply(np.mean)

A    0.63874
E    1.00000
G    1.00000
dtype: float64

In [15]:
#You can also use applymap to round the numbers, using a lambda expression
my_data_int.applymap(lambda x : round(x,2))

Unnamed: 0,A,E,G
0,0.88,1.0,1
1,0.84,1.0,1
2,0.2,1.0,1


# Manipulating Data 

In [16]:
#You can also add a whole row to a data. Lets try it
my_data_int.loc["3"] = [3,2,1]
my_data_int

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,A,E,G
0,0.881441,1.0,1
1,0.836522,1.0,1
2,0.198255,1.0,1
3,3.0,2.0,1


In [17]:
#We can add a column as well
my_data_int["F"] = [3,-1,-1,-1]
my_data_int

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,A,E,G,F
0,0.881441,1.0,1,3
1,0.836522,1.0,1,-1
2,0.198255,1.0,1,-1
3,3.0,2.0,1,-1


In [18]:
#Filling missing values is also duable. Kets create a table with missing values
missing = my_data_int.apply(np.sqrt)
missing

Unnamed: 0,A,E,G,F
0,0.938851,1.0,1.0,1.732051
1,0.914616,1.0,1.0,
2,0.445259,1.0,1.0,
3,1.732051,1.414214,1.0,


In [19]:
#First, lets find out how many nulls do we have
missing.isnull().sum()
#If we want to ignore columns that dont have anyting null

A    0
E    0
G    0
F    3
dtype: int64

In [20]:
#You can use .fillna to fill in missing data
missing.fillna("neg")

Unnamed: 0,A,E,G,F
0,0.938851,1.0,1.0,1.73205
1,0.914616,1.0,1.0,neg
2,0.445259,1.0,1.0,neg
3,1.732051,1.414214,1.0,neg


In [21]:
#Instead of filling it with custom information, we can fill it based on alrready given values.
missing.fillna(method="ffill")
#ffill fills it forward.

Unnamed: 0,A,E,G,F
0,0.938851,1.0,1.0,1.732051
1,0.914616,1.0,1.0,1.732051
2,0.445259,1.0,1.0,1.732051
3,1.732051,1.414214,1.0,1.732051


In [22]:
#if we wanna find the amount of unique values in each column, we can use value_counts()
missing["E"].value_counts()

1.000000    3
1.414214    1
Name: E, dtype: int64