In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw sales CSV with read_csv's default parsing and preview the first rows.
# The currency and percent columns still carry '$', ',' and '%' characters at this point.
df = pd.read_csv('sales_data_types.csv')
df.head()

Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
0,10002.0,Quest Industries,"$125,000.00",$162500.00,30.00%,500,1,10,2015,Y
1,552278.0,Smith Plumbing,"$920,000.00","$101,2000.00",10.00%,700,6,15,2014,Y
2,23477.0,ACME Industrial,"$50,000.00",$62500.00,25.00%,125,3,29,2016,Y
3,24900.0,Brekke LTD,"$350,000.00",$490000.00,4.00%,75,10,27,2015,Y
4,651029.0,Harbor Co,"$15,000.00",$12750.00,-15.00%,Closed,2,2,2014,N


In [3]:
# Both year columns hold strings (dtype object), so `+` joins the text of each
# row instead of adding the amounts.
df['2016'] + df['2017']

0      $125,000.00$162500.00
1    $920,000.00$101,2000.00
2        $50,000.00$62500.00
3      $350,000.00$490000.00
4        $15,000.00$12750.00
dtype: object

In [4]:
df.dtypes

Customer Number    float64
Customer Name       object
2016                object
2017                object
Percent Growth      object
Jan Units           object
Month                int64
Day                  int64
Year                 int64
Active              object
dtype: object

In [5]:
# astype() returns a converted copy. Only the last expression of a cell is
# displayed, so just the int64 result appears below; the first two conversions
# are computed and then discarded.
df['Customer Number'].astype('int')
df['Customer Number'].astype('int32')
df['Customer Number'].astype('int64')

0     10002
1    552278
2     23477
3     24900
4    651029
Name: Customer Number, dtype: int64

In [6]:
# In order to actually change the customer number in the original dataframe, make sure to assign
# it back, since the astype() function returns a copy.
# NOTE(review): the recorded output shows plain 'int' resolving to int32 here,
# whereas 'int64' gave int64 in the previous cell — pass an explicit width if
# the integer size matters.

df["Customer Number"] = df['Customer Number'].astype('int')
df.dtypes

Customer Number     int32
Customer Name      object
2016               object
2017               object
Percent Growth     object
Jan Units          object
Month               int64
Day                 int64
Year                int64
Active             object
dtype: object

In [9]:
# Pitfall: astype('bool') does not interpret 'Y'/'N'. The recorded output is
# True for every row, including the 'N' row at index 4.
df['Active'].astype('bool')

0    True
1    True
2    True
3    True
4    True
Name: Active, dtype: bool

In [10]:
def convert_currency(val):
    """
    Convert a currency value to a float.

    Strings have every '$' and ',' removed before being parsed. Values that
    are not strings (for example a float produced by an earlier run of this
    conversion) are handed directly to float(), so applying the function to
    an already-converted column does not fail.

    Parameters
    ----------
    val : str or number
        Currency text such as "$125,000.00", or an already-numeric value.

    Returns
    -------
    float
        The numeric amount.

    Raises
    ------
    ValueError
        If the cleaned text cannot be parsed as a number.
    """
    if not isinstance(val, str):
        # Already numeric: nothing to strip, and numbers have no .replace().
        return float(val)
    new_val = val.replace(',', '').replace('$', '')
    return float(new_val)

In [11]:
# Preview only: the converted Series is displayed but not assigned back to df.
df['2016'].apply(convert_currency)

0    125000.0
1    920000.0
2     50000.0
3    350000.0
4     15000.0
Name: 2016, dtype: float64

In [12]:
# Store the parsed floats back into both yearly sales columns.
for year_col in ('2016', '2017'):
    df[year_col] = df[year_col].apply(convert_currency)

In [13]:
def convert_percent(val):
    """
    Turn percentage text such as "30.00%" into its fractional value (0.30).

    Every '%' character is dropped, the remainder is parsed as a float, and
    that number is divided by 100.
    """
    return float(val.replace('%', '')) / 100

In [14]:
# Preview only: this cell does not assign the result, so the column in df
# keeps its original strings.
df['Percent Growth'].apply(convert_percent)

0    0.30
1    0.10
2    0.25
3    0.04
4   -0.15
Name: Percent Growth, dtype: float64

In [15]:
# Only 'Y' counts as active. The comparison itself already yields the boolean
# column, so it is assigned directly.
df["Active"] = df["Active"] == "Y"

## Series

In [16]:
list('abcd')

['a', 'b', 'c', 'd']

In [17]:
# list('abcd') splits the string into its characters, so the Series gets one
# element per letter.
series1 = pd.Series(list('abcd'))
series1

0    a
1    b
2    c
3    d
dtype: object

In [18]:
# Build a Series straight from the list of city names; with no index given,
# the labels default to 0, 1, 2.
city = pd.Series(data=['Blore', 'delhi', 'chennai'])
city

0      Blore
1      delhi
2    chennai
dtype: object

In [19]:
# No index is supplied, so the labels default to 0..3; all-integer data gives int64.
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [20]:
# Index labels may repeat ('a' appears twice here). Mixing the ints with 7.0
# makes the whole Series float64.
obj2 = pd.Series(index=['d', 'b', 'a', 'a'], data=[4, 7.0, -5, 3] )
obj2

d    4.0
b    7.0
a   -5.0
a    3.0
dtype: float64

In [21]:
obj2.index

Index(['d', 'b', 'a', 'a'], dtype='object')

In [22]:
# Looking up a duplicated label returns every matching row as a Series rather
# than a single scalar.
obj2['a']

a   -5.0
a    3.0
dtype: float64

In [23]:
# Assign by label. The Series is float64, so the int 600000 is shown as 600000.0.
obj2['d'] = 600000
obj2

d    600000.0
b         7.0
a        -5.0
a         3.0
dtype: float64

In [31]:
# Show, in order: the boolean mask of positive entries, the rows that mask
# selects, and the element-wise doubling — separated by a dashed rule.
divider = '-----------------------'
positive_mask = obj2 > 0
print(positive_mask)
print(divider)
print(obj2[positive_mask])
print(divider)
print(obj2 * 2)

d     True
b     True
a    False
a     True
dtype: bool
-----------------------
d    600000.0
b         7.0
a         3.0
dtype: float64
-----------------------
d    1200000.0
b         14.0
a        -10.0
a          6.0
dtype: float64


In [32]:
# Label order wanted for the Series built from this data.
citynames = [
    'Bangalore',
    'Delhi',
    'Hybd',
    'Chennai',
]
# City -> value mapping, listed in a different order than citynames.
citydata = dict([
    ('Hybd', 35000),
    ('Bangalore', 71000),
    ('Delhi', 16000),
    ('Chennai', 5000),
])

In [33]:
# With index= given, the dict entries are looked up by label, so the Series
# follows the order of citynames rather than the dict's own order.
obj4 = pd.Series(citydata, index=citynames)
obj4

Bangalore    71000
Delhi        16000
Hybd         35000
Chennai       5000
dtype: int64

# DataFrame:

In [34]:
# Each dict key becomes a column; the equal-length lists supply the rows.
data_dict = {'Country':    ['Belgium', 'India', 'Brazil'],
             'Capital':    ['Brussels', 'New Delhi', 'Brasilia'],
             'Population': [122020, 7774744, 3664838]}

# Note: this rebinds `df`, replacing the sales data loaded earlier.
df = pd.DataFrame(data_dict)
df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,122020
1,India,New Delhi,7774744
2,Brazil,Brasilia,3664838


In [35]:
# Constructing a DataFrame from a dictionary of lists.
# Example - 1
# Columns may hold different kinds of values (integers and strings here).
data_dict = {'xcol1': [1, 2], 'ycol2': [3, 4], 'col3': ['a','b']}
df = pd.DataFrame(data_dict)
df

Unnamed: 0,xcol1,ycol2,col3
0,1,3,a
1,2,4,b


In [36]:
df.shape

(2, 3)

In [37]:
# Example - 2
# Dict of Series whose indexes differ: 'one' has no entry for label 'd'.
data_dict = {'one' : pd.Series([1., 2., 3.],     index=['a', 'b', 'c']),
             'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}

In [38]:
# index= chooses which row labels to keep and in what order. 'one' has no 'd'
# entry, so that cell is NaN; label 'c' is left out entirely.
df = pd.DataFrame(data_dict, index=['d', 'b', 'a'])
df

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [39]:
# columns= selects dict entries by key; it does not rename. Neither 'PYTHON'
# nor 'SPARK' is a key of data_dict, so both columns come back entirely NaN.
df= pd.DataFrame(data_dict, index=['d', 'b', 'a'], columns=['PYTHON', 'SPARK'])
df

Unnamed: 0,PYTHON,SPARK
d,,
b,,
a,,


In [40]:
# Column selection, addition, deletion
# Without index=, the row labels are the union of both Series' indexes and
# values are matched by label, not position ('two' gives a -> 4.0, d -> 1.0).
# 'one' has no 'd' entry, so it is NaN in that row.
data_dict = {'one' : pd.Series([1., 2., 3.],     index=['a', 'b', 'c']),
             'two' : pd.Series([1., 2., 3., 4.], index=['d', 'b', 'c', 'a'])}
df= pd.DataFrame(data_dict)
df

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,2.0
c,3.0,3.0
d,,1.0


In [41]:
# Element-wise product stored as a new column; row 'd' ends up NaN because
# 'one' is NaN there.
df['three'] = df['one'] * df['two']

In [42]:
df.isnull()

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,True,False,True


In [43]:
# Summing the boolean mask column-wise counts the missing values per column.
df.isnull().sum()

one      1
two      0
three    1
dtype: int64

In [45]:
# Shows the frame without rows that contain a NaN (row 'd' here). The result
# is not assigned, and the next cell's output still includes 'd'.
df.dropna()

Unnamed: 0,one,two,three
a,1.0,4.0,4.0
b,2.0,2.0,4.0
c,3.0,3.0,9.0


In [46]:
df.T

Unnamed: 0,a,b,c,d
one,1.0,2.0,3.0,
two,4.0,2.0,3.0,1.0
three,4.0,4.0,9.0,


In [49]:
# Read the second example CSV with default options (this rebinds `df` again)
# and display the whole frame.
df =pd.read_csv("pandas_ex2.csv") 
df

Unnamed: 0,something,a,b,c,d,message
0,one,1.0,2.0,3.0,4,
1,two,5.0,6.0,,8,world
2,three,9.0,10.0,11.0,12,foo
3,four,,,4.0,Globe,
4,five,,,,,Earth


In [50]:
# NOTE(review): `names` is not passed to read_csv below and is not used by any
# later cell visible here — confirm whether it was meant as names=... or can
# be removed.
names = ['Col1', 'Col2', 'Col3', 'Col4', 'message']
# Re-read the file using the 'message' column as the row index; rows with no
# message get a NaN label.
df =pd.read_csv("pandas_ex2.csv",index_col= 'message') 
df

Unnamed: 0_level_0,something,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,one,1.0,2.0,3.0,4
world,two,5.0,6.0,,8
foo,three,9.0,10.0,11.0,12
,four,,,4.0,Globe
Earth,five,,,,


In [51]:
pd.isnull(df)

Unnamed: 0_level_0,something,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,False,False,False,False,False
world,False,False,False,True,False
foo,False,False,False,False,False
,False,True,True,False,False
Earth,False,True,True,True,True


In [52]:
# Null check on a single column; the result keeps the 'message' index,
# including its NaN labels.
pd.isnull(df['a'])

message
NaN      False
world    False
foo      False
NaN       True
Earth     True
Name: a, dtype: bool