In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.__version__

# Create a DataFrame

## Create a dataframe from a list

In [None]:
# A flat list becomes a single-column DataFrame.
pd.DataFrame(list(range(5)))

In [None]:
# Each inner list becomes one row: 0-4, 5-9 and 10-14.
pd.DataFrame([list(range(start, start + 5)) for start in (0, 5, 10)])

In [None]:
# Rows given as lists, with explicit column names and row labels 15-17.
pd.DataFrame(
    data=[
        ['Alex', 10, 'male'],
        ['Bob', 12, 'male'],
        ['Clarke', 13, 'male'],
    ],
    index=list(range(15, 18)),
    columns=['name', 'age', 'sex'],
)

## Create a dataframe from a dictionary

In [None]:
# Column-oriented data: each key maps to the list of values for one column.
dic = dict(
    name=['alex', 'bob', 'clarke', 'tania', 'shomaila'],
    age=[12, 13, 19, 24, 29],
    sex=['male', 'male', 'male', 'female', 'female'],
)

In [None]:
# Label the five rows 'rank1' .. 'rank5'.
pd.DataFrame(dic, index=['rank' + str(n) for n in range(1, 6)])

The keys of the dictionary become the column names.

## Create a dataframe from a list of dictionaries

In this case the keys shared by the inner dictionaries become the index labels.

In [None]:
# Two dictionaries keyed by state name; the state names become the row index
# and the outer keys ('population', 'area') become the columns.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
pd.DataFrame(dict(population=population_dict, area=area_dict))

In [None]:
# Two column-oriented dictionaries with the same keys, nested under the outer
# keys 'dic1' and 'dic2' before being handed to DataFrame.
dic1 = dict(
    name=['alex', 'bob', 'clarke', 'tania', 'shomaila'],
    age=[12, 13, 19, 24, 29],
    sex=['male', 'male', 'male', 'female', 'female'],
)
dic2 = dict(
    name=['alex', 'tom', 'micky', 'cersie', 'shae'],
    age=[12, 13, 19, 24, 29],
    sex=['male', 'male', 'male', 'female', 'female'],
)
data = dict(dic1=dic1, dic2=dic2)
pd.DataFrame(data)

In [None]:
# One dictionary per row: columns 'a'..'d' hold i to the powers 1..4.
data = [
    {col: i ** power for power, col in enumerate('abcd', start=1)}
    for i in range(10)
]
pd.DataFrame(data)

In [None]:
# (state, year) pairs, ordered state by state, and the matching populations.
index = [
    (state, year)
    for state in ('California', 'New York', 'Texas')
    for year in (2000, 2010)
]
populations = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]

In [None]:
index

In [None]:
populations

In [None]:
# Series of populations labelled by the (state, year) tuples.
pop = pd.Series(data=populations, index=index)
pop

In [None]:
# Build a MultiIndex from the list of (state, year) tuples.
new = pd.MultiIndex.from_tuples(index)
new

In [None]:
pop.reindex(new)

# Combining data sets

In [None]:
def make_df(cols, ind):
    """Build a demo DataFrame whose cells spell out their own position.

    Every cell is the string form of its column label followed by the string
    form of its row label, e.g. column 'a' and row 0 give 'a0'.

    Parameters
    ----------
    cols : iterable
        Column labels; one column is created per label.
    ind : iterable
        Row labels. Iterated once per column and also passed to
        ``pd.DataFrame`` as the index, so it must be re-iterable.

    Returns
    -------
    pd.DataFrame
    """
    columns = {}
    for col in cols:
        columns[col] = [str(col) + str(row) for row in ind]
    return pd.DataFrame(columns, ind)

In [None]:
# Columns a, b, c with rows 0-2.
df1 = make_df(['a', 'b', 'c'], range(3))
df1

In [None]:
# Columns d, b, c with rows 0-2 (shares only b and c with df1).
df2 = make_df(['d', 'b', 'c'], range(3))
df2

## Concatenating

In [None]:
pd.concat([df1,df2])

In [None]:
# Same columns as df1, but rows labelled 4-6.
df2 = make_df(['a', 'b', 'c'], range(4, 7))
df2

In [None]:
pd.concat([df1,df2])

In [None]:
pd.concat([df1,df2], axis=1)

In [None]:
df2 = make_df(['a', 'b', 'c'], range(4, 7))
# Relabel the rows by assigning to the `index` attribute. Assigning a list to
# `df2.set_index` would only overwrite that method on this object and leave
# the row labels at 4, 5, 6.
df2.index = [100, 200, 300]
df2

In [None]:
# pd.concat stacks frames row-wise (axis=0) by default; axis=1 instead places
# them side by side, aligning rows on their index values
pd.concat([df1,df2], axis=1)

In [None]:
pd.concat([df1,df2], axis=1)

In [None]:
# Two frames with the same columns but different row labels.
x = make_df(['a', 'b'], range(1, 4))
y = make_df(['a', 'b'], range(7, 10))
print(x)
print('\n', y)
# ignore_index=True discards the original row labels in the result.
pd.concat([x, y], ignore_index=True)

In [None]:
# Two frames with no column in common.
x = make_df(['a', 'b'], range(1, 4))
y = make_df(['c', 'd'], range(7, 10))
print(x)
print('\n', y)
pd.concat([x, y], ignore_index=True)

In [None]:
# The frames share columns a and b; c and d each exist in only one of them.
x = make_df(['a', 'b', 'c'], range(1, 4))
y = make_df(['a', 'b', 'd'], range(7, 10))
print(x)
print('\n', y)
# join='inner' keeps only the columns present in both frames.
pd.concat([x, y], ignore_index=True, join='inner')

## Merging data frames

In [None]:
# x and y share the column 'a' and the same row labels 1-3.
x = make_df(['a', 'b', 'c'], range(1, 4))
y = make_df(['a', 'd'], range(1, 4))
print(x)
print('\n', y)
print(pd.merge(x, y))
pd.concat([x, y])

In [None]:
# NOTE: `z = y` does not copy -- z and y are two names for the same DataFrame,
# so the new 'b' column is added to y as well. The merges of x and y in the
# following cells see this 'b' column too.
z=y
z['b']=['b3','b4','b5']

In [None]:
pd.merge(x,z)

In [None]:
pd.merge(x,y,how='outer')

In [None]:
pd.merge(x,y,on='a')

In [None]:
pd.merge(x,y,on='a',suffixes=('_L','_R'))

In [None]:
# Move column 'a' into the index of both frames.
xa, ya = (frame.set_index('a') for frame in (x, y))
print(xa, '\n', ya)

In [None]:
# With no `on` or index arguments, merge joins on the column names that xa and
# ya have in common, not on their 'a' index.
pd.merge(xa,ya)

In [None]:
pd.merge(xa,ya,left_index=True,right_index=True)

# Grouping & Aggregation

In [None]:
import seaborn as sns

# Load seaborn's "planets" example dataset as a DataFrame.
data = pd.DataFrame(sns.load_dataset("planets"))
data

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
group = data.groupby('method')

In [None]:
group.describe()['number']
# group.describe()['mass']

In [None]:
group.sum()
# years have also been added up, which does not make any sense

In [None]:
group['mass'].sum()

In [None]:
group['mass'].mean()

In [None]:
group.aggregate('max')
# gives the maximum of each column within every group

In [None]:
group.aggregate(np.max)

In [None]:
group.aggregate(['min',np.max,'mean'])

In [None]:
# Keep only the groups whose average orbital period is greater than 1000.
group.mean().loc[lambda means: means['orbital_period'] > 1000]

In [None]:
# numeric_only=True makes the mean skip non-numeric columns: the frame handed
# to the lambda can still carry the text 'method' column, and averaging it
# raises TypeError on pandas 2.x.
group.apply(lambda x: x.mean(numeric_only=True))
# similar to group.mean()

In [None]:
# Floor every year to the first year of its decade (e.g. 2008 -> 2000).
data['decade'] = (data['year'] // 10) * 10

In [None]:
decade_grp = data.groupby('decade')
# `data` still contains the text column 'method'; numeric_only=True averages
# only the numeric columns instead of raising TypeError on pandas 2.x.
decade_grp.mean(numeric_only=True)

In [None]:
data.groupby(['decade','method']).sum()

In [None]:
# Sum per (decade, method), take the 'number' totals, spread the methods out
# into columns and read off the 'Astrometry' column.
data.groupby(['decade', 'method']).sum()['number'].unstack()['Astrometry']

# Pivot tables

In [None]:
data.pivot_table('decade', index='number',columns='method')
# The first positional argument is `values`: the 'decade' column is aggregated
# (pivot_table's default aggregation is the mean) for every combination of
# 'number' and 'method'.
# 'number' values become the row index and 'method' values become the columns.

In [None]:
def compute(x):
    """Bucket a number into one of five coarse bands.

    The bands are half-open, so every number falls into exactly one of them:

    * ``x < 500``            -> 1
    * ``500 <= x < 1000``    -> 2
    * ``1000 <= x < 2000``   -> 3
    * ``2000 <= x < 3000``   -> 4
    * ``x >= 3000``          -> 5

    Parameters
    ----------
    x : int or float
        Value to classify.

    Returns
    -------
    int or None
        The band number, or None when no comparison holds (e.g. NaN).
    """
    if x < 500:
        return 1
    if x < 1000:
        return 2
    if x < 2000:
        return 3
    if x < 3000:
        return 4
    if x >= 3000:
        return 5
    # Only reached when every comparison above is False, i.e. for NaN input.
    return None

# Classify every distance into its band and store it as a new column.
data['gen_dis'] = data['distance'].apply(compute)

In [None]:
data.head()

In [None]:
data['gen_dis'].value_counts()

In [None]:
data.pivot_table(values='mass', index='decade',columns='number')

In [None]:
data.pivot_table(values='mass', index='decade', columns='number', dropna=False, fill_value='n/a')
# dropna=False keeps columns whose entries are all NaN (by default they are dropped)
# fill_value replaces the remaining NaN cells with the given placeholder

In [None]:
data.pivot_table(values='mass', index='decade',columns='number',margins=True)

In [None]:
# Total mass per decade/number. Empty combinations are filled with 0 so the
# table stays numeric; the aggregation is named by string ('sum') rather than
# by passing the NumPy function.
data.pivot_table(values='mass', index='decade', columns='number', aggfunc='sum', fill_value=0, dropna=False)

In [None]:
# Total mass per decade, split by (number, gen_dis), with an extra 'Sum'
# row/column of totals. Empty combinations are filled with 0 so the table
# stays numeric.
data.pivot_table(
    values='mass',
    index='decade',
    columns=['number', 'gen_dis'],
    aggfunc='sum',
    fill_value=0,
    margins=True,
    margins_name='Sum',
    observed=True,
)

In [None]:
data.pivot_table(values='mass', index='decade',columns='number').plot()
# DataFrame.plot() draws a line chart of the pivot table: 'decade' (the index)
# runs along the x-axis and each 'number' column becomes its own line of
# aggregated 'mass' values.

# String functions

In [None]:
# Names in mixed case with one missing entry, for trying out the .str methods.
names = pd.Series(
    data=['peter', 'Paul', None, 'MARY', 'gUIDO'],
)

In [None]:
names

In [None]:
names.str.capitalize()

#### list of all available str methods
len() lower() translate() islower() ljust() upper() startswith() isupper() rjust() find() endswith() isnumeric()
center() rfind() isalnum() isdecimal() zfill() index() isalpha() split() strip() rindex() isdigit() rsplit() rstrip()
capitalize() isspace() partition() lstrip() swapcase() istitle() rpartition()

# eval & get_dummies

In [None]:
# 1000 x 1000 array of random integers drawn from [0, 10000).
# NOTE(review): `arr` is not used by any of the cells visible here.
arr = np.random.randint(10000,size=(1000,1000))

In [None]:
# Three independent 1000 x 1000 frames of random integers in [0, 10000),
# used as operands for the timing comparison.
df1, df2, df3 = [
    pd.DataFrame(np.random.randint(10000, size=(1000, 1000)))
    for _ in range(3)
]

In [None]:
%timeit df1+df2+df3+df1+df2+df3+df1+df2+df3+df1+df2+df3

In [None]:
%timeit pd.eval('df1+df2+df3+df1+df2+df3+df1+df2+df3+df1+df2+df3')

In [None]:
# dtype=int makes the indicator columns hold 0/1 on every pandas version
# (pandas 2.x would otherwise return True/False).
pd.get_dummies(data, dtype=int)
# 0 means the row does not belong to that category, 1 means it does

In [None]:
pd.get_dummies(data['method'])

# Null Values

In [None]:
import seaborn as sns

# Load a fresh copy of seaborn's "planets" example dataset as a DataFrame.
data2 = pd.DataFrame(sns.load_dataset("planets"))
data2

In [None]:
data2.isnull().sum()

More to come!