# import pandas 


In [1]:
import pandas as pd
import numpy as np

# Object Creation


In [2]:
# Creating a Series by passing a list of values, letting pandas create a default integer index

data = pd.Series([0.25, 0.5, 0.75, 1.0, np.nan])


#Like a dictionary, the Series object provides a mapping from a collection of keys to a collection of values:

data1 = pd.Series([0.35, 0.4, 0.7, 0.5, np.nan,], index = ['a', 'b', 'c', 'd', 'e',])
data1

# Creating a DataFrame by passing a dict of objects that can be converted to series-like.

area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,'Illinois': 149995})
popu = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,'Illinois': 12882135})

data2 = pd.DataFrame({'area':area, 'popu':popu})
data2

# Checking data types

data.dtypes


dtype('float64')

# Getting Data In / Out

In [3]:
# Reading from a csv file.

data3 = pd.read_csv(r"C:\Users\Scorpio\Data Analysis/train.csv")

In [4]:
# Writing to a csv file.

data3.to_csv('modified.csv')

# Viewing Data

In [5]:
# Here is how to view the top and bottom rows of the frame:

data3.head()
data3.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [6]:
# Display the index, columns:

data3.index
data3.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [8]:
# describe() shows a quick statistic summary of your data:

data3.describe()
data3.values

array([[1, 0, 3, ..., 7.25, nan, 'S'],
       [2, 1, 1, ..., 71.2833, 'C85', 'C'],
       [3, 1, 3, ..., 7.925, nan, 'S'],
       ...,
       [889, 0, 3, ..., 23.45, nan, 'S'],
       [890, 1, 1, ..., 30.0, 'C148', 'C'],
       [891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)

# Sorting 

In [None]:
# Sorting by an axis

data3.sort_index(axis=1, ascending=False)

# Sorting by values
data3.sort_values(by='Name')

# Data Selection / Indexing

In [24]:
#data3.keys()
#list(data3.items())


data3['Name']
data3[2:9]
data3.loc[0:7,'Name']
data3.iloc[2:8]
data3[['Name','Ticket']]
data3.loc[0:10, ['Name', 'Parch']]
data3['Total'] = data3['Fare'] / data3['Age']



In [None]:
# masking

data3[(data3['Age'] > 0.3) & (data3['Age'] < 0.8)]

# Handling Missing Data

In [None]:
data3.dropna(how='any')
data3.fillna(value = 0)
data3.isna()

# Operations

In [None]:

data3.mean()
data3.mean(0)
data3['Fare'] + data3['Age']
data3['Fare'] / data3['Age']
data3['Fare'] - data3['Age']
data3['Fare'] * data3['Age']
data3['Fare'].add(data3['Age'])

# Merging Data

In [39]:
df = pd.DataFrame(np.random.randn(10,4))

In [40]:
df2 = [df[:3], df[3:7], df[7:]]

In [41]:
pd.concat(df2)

Unnamed: 0,0,1,2,3
0,-1.54138,1.719404,-0.49534,0.464978
1,0.666104,-0.118602,-0.133261,-1.310078
2,-0.228911,-1.604255,0.311906,-0.034831
3,-0.480585,-0.220734,-0.004626,-1.150494
4,-1.383676,0.363159,-1.809212,-0.065175
5,0.358792,-0.85314,-1.671779,2.119022
6,0.983306,0.639719,0.586196,-0.434034
7,0.556042,1.094694,0.353639,-1.931519
8,-0.770325,1.052969,0.076859,0.277511
9,-0.761501,-0.791653,-1.38,0.133487


In [51]:
left = pd.DataFrame({'Names': ['Bob','John','Tobias','Kingsley'],
                     'Office': ['HR','ICT','AUDIT','FINANCE'],
                     'ID': [1,4,6,7]})

left

Unnamed: 0,Names,Office,ID
0,Bob,HR,1
1,John,ICT,4
2,Tobias,AUDIT,6
3,Kingsley,FINANCE,7


In [45]:
right = pd.DataFrame({'Names':['Collins','Chisom','Cute_Boy','Kenneth'],
                      'ID': [1,4,6,7]})

In [52]:
pd.merge(left,right, on = 'ID')


Unnamed: 0,Names_x,Office,ID,Names_y
0,Bob,HR,1,Collins
1,John,ICT,4,Chisom
2,Tobias,AUDIT,6,Cute_Boy
3,Kingsley,FINANCE,7,Kenneth


# Grouping Data

In [61]:
df.groupby(2).sum()


Unnamed: 0_level_0,0,1,3
2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.809212,-1.383676,0.363159,-0.065175
-1.671779,0.358792,-0.85314,2.119022
-1.38,-0.761501,-0.791653,0.133487
-0.49534,-1.54138,1.719404,0.464978
-0.133261,0.666104,-0.118602,-1.310078
-0.004626,-0.480585,-0.220734,-1.150494
0.076859,-0.770325,1.052969,0.277511
0.311906,-0.228911,-1.604255,-0.034831
0.353639,0.556042,1.094694,-1.931519
0.586196,0.983306,0.639719,-0.434034


# Reshaping using Stack and Pivot_Table

In [66]:
my_tuple = list(zip(*[[1,2,4,5,7,88,9,21,44],[3,54,78,90,23,6,61,43,67]]))


In [67]:
index= pd.MultiIndex.from_tuples(my_tuple, names = ['First', 'Second'] )


In [69]:
df = pd.DataFrame(np.random.randn(9,2), index = index , columns = ['A','B'])

In [70]:
df3 = df[:4]
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
First,Second,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3,0.116813,-0.602682
2,54,0.710928,1.917324
4,78,-0.010763,0.22353
5,90,0.9779,-0.14674


In [72]:
df3.unstack()

Unnamed: 0_level_0,A,A,A,A,B,B,B,B
Second,3,54,78,90,3,54,78,90
First,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,0.116813,,,,-0.602682,,,
2,,0.710928,,,,1.917324,,
4,,,-0.010763,,,,0.22353,
5,,,,0.9779,,,,-0.14674


In [76]:
df = pd.DataFrame({'A': ['a','b','c','d']*3,
                   'B': ['foo','bar','baz']*4,
                   'C': ['P','P','P','Q','Q','Q']* 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})

In [79]:
pd.pivot_table(df, index = ['A','B'], values ='D', columns = ['C'])

Unnamed: 0_level_0,C,P,Q
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
a,bar,,0.047677
a,baz,0.000945,
a,foo,-0.520727,
b,bar,-2.316681,
b,baz,,0.565845
b,foo,,1.19864
c,bar,,1.05047
c,baz,-1.523912,
c,foo,0.910133,
d,bar,-0.683312,


# TimeSeries

In [84]:
dates = pd.date_range('3/3/2020', periods = 100, freq = 'S')


In [86]:
ts = pd.Series(np.random.randint(0,500,len(dates)),dates)

In [88]:
ts.resample('5min').sum()

2020-03-03    26505
Freq: 5T, dtype: int32

In [90]:
dates = pd.date_range(20200503, periods = 5, freq ='D')
                      

In [91]:
ts = pd.Series(np.random.randn(len(dates)),dates)

In [92]:
ts

1970-01-01 00:00:00.020200503   -0.366265
1970-01-02 00:00:00.020200503   -0.098422
1970-01-03 00:00:00.020200503   -0.083247
1970-01-04 00:00:00.020200503   -1.512313
1970-01-05 00:00:00.020200503    0.859285
Freq: D, dtype: float64

In [95]:
ts_utc = ts.tz_localize('UTC')

ts_utc

1970-01-01 00:00:00.020200503+00:00   -0.366265
1970-01-02 00:00:00.020200503+00:00   -0.098422
1970-01-03 00:00:00.020200503+00:00   -0.083247
1970-01-04 00:00:00.020200503+00:00   -1.512313
1970-01-05 00:00:00.020200503+00:00    0.859285
Freq: D, dtype: float64

In [96]:
ts_utc.tz_convert('US/EASTERN')

1969-12-31 19:00:00.020200503-05:00   -0.366265
1970-01-01 19:00:00.020200503-05:00   -0.098422
1970-01-02 19:00:00.020200503-05:00   -0.083247
1970-01-03 19:00:00.020200503-05:00   -1.512313
1970-01-04 19:00:00.020200503-05:00    0.859285
Freq: D, dtype: float64

In [None]:
dates = pd.date_range(20200503, periods = 5, freq ='M')

In [98]:
dates = pd.Series(np.random.randn(len(dates)),dates)

In [102]:
ts

1970-01-01 00:00:00.020200503   -0.366265
1970-01-02 00:00:00.020200503   -0.098422
1970-01-03 00:00:00.020200503   -0.083247
1970-01-04 00:00:00.020200503   -1.512313
1970-01-05 00:00:00.020200503    0.859285
Freq: D, dtype: float64

In [99]:
ps = ts.to_period()

In [100]:
ps

1970-01-01   -0.366265
1970-01-02   -0.098422
1970-01-03   -0.083247
1970-01-04   -1.512313
1970-01-05    0.859285
Freq: D, dtype: float64

In [101]:
ps.to_timestamp()

1970-01-01   -0.366265
1970-01-02   -0.098422
1970-01-03   -0.083247
1970-01-04   -1.512313
1970-01-05    0.859285
Freq: D, dtype: float64

# Categorical


In [2]:
df = pd.DataFrame ({"id": [1,2,3,4,5,6],
                   "grade": ['a','b', 'f','c','d','b']})

In [3]:
df


Unnamed: 0,id,grade
0,1,a
1,2,b
2,3,f
3,4,c
4,5,d
5,6,b


In [9]:
df["Grade"] = df["grade"].astype("category")

In [10]:
df['Grade']

0    a
1    b
2    f
3    c
4    d
5    b
Name: Grade, dtype: category
Categories (5, object): [a, b, c, d, f]

In [18]:
df["Grade"].cat.categories = ['execellent', 'very good', 'fail','good','pass']

In [16]:
df['Grade']

0          NaN
1    very good
2         pass
3         fail
4         good
5    very good
Name: Grade, dtype: category
Categories (5, object): [bad, very good, good, pass, fail]

In [14]:
df["Grade"] = df['Grade'].cat.set_categories(['bad','very good','good','pass','fail'])

In [8]:
df['Grade']

0          NaN
1    very good
2         pass
3         fail
4         good
5    very good
Name: Grade, dtype: category
Categories (5, object): [bad, very good, good, pass, fail]