In [1]:
import numpy as np
import pandas as pd

# Pandas

Pandas primarily provides two data structures:
<br/>
Series: A one-dimensional labeled array capable of holding any data type.<br/>
DataFrame: A two-dimensional labeled data structure with columns of potentially different types.

# Series
The Series object is built on top of the NumPy array and is very similar to it but with additional capabilities like handling missing data. The indices of a pandas Series are more flexible than those in a simple NumPy array.

In [2]:
s = pd.Series([1, 3, 5, 7, 9])
print(s)

0    1
1    3
2    5
3    7
4    9
dtype: int64


In [3]:
type(s)

pandas.core.series.Series

In [4]:
print(s.min())
print(s.max())
print(s.sum())

1
9
25


In [5]:
aggregated = s.aggregate(['sum', 'mean', 'std'])
print(aggregated)

sum     25.000000
mean     5.000000
std      3.162278
dtype: float64


In [6]:
double = s.apply(lambda x: x * 2)
print(double)

sorted_s = s.sort_values()
print(sorted_s)

0     2
1     6
2    10
3    14
4    18
dtype: int64
0    1
1    3
2    5
3    7
4    9
dtype: int64


In [7]:
s.drop(0)

1    3
2    5
3    7
4    9
dtype: int64

In [8]:
series_dict=pd.Series({'a':1,'b':2,'c':3})
print(series_dict)

a    1
b    2
c    3
dtype: int64


In [9]:
data=[10,20,30]
index=['a','b','c']
pd.Series(data,index=index)

a    10
b    20
c    30
dtype: int64

Dataframe

In [10]:
data={
    'Name':['Raghu','Akash','Hemanth'],
    'Age':[18,20,19],
    'City':['Vijayawada','Banglore','Hyderabad']
}
df=pd.DataFrame(data)

df

Unnamed: 0,Name,Age,City
0,Raghu,18,Vijayawada
1,Akash,20,Banglore
2,Hemanth,19,Hyderabad


In [11]:
type(df)

pandas.core.frame.DataFrame

In [12]:
data=[
    {'Name':'A','Age':20,'City':'Bangalore'},
    {'Name':'B','Age':17,'City':'Bangalore'},
    {'Name':'C','Age':21,'City':'Bangalore'},
    {'Name':'D','Age':35,'City':'Bangalore'}
    
]
df=pd.DataFrame(data)

df

Unnamed: 0,Name,Age,City
0,A,20,Bangalore
1,B,17,Bangalore
2,C,21,Bangalore
3,D,35,Bangalore


In [13]:
df=pd.read_csv('tips.csv')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [14]:
df.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [15]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [16]:
df.shape

(244, 7)

In [17]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [18]:
print(df.isnull())

     total_bill    tip    sex  smoker    day   time   size
0         False  False  False   False  False  False  False
1         False  False  False   False  False  False  False
2         False  False  False   False  False  False  False
3         False  False  False   False  False  False  False
4         False  False  False   False  False  False  False
..          ...    ...    ...     ...    ...    ...    ...
239       False  False  False   False  False  False  False
240       False  False  False   False  False  False  False
241       False  False  False   False  False  False  False
242       False  False  False   False  False  False  False
243       False  False  False   False  False  False  False

[244 rows x 7 columns]


In [19]:
print(df['tip'])

0      1.01
1      1.66
2      3.50
3      3.31
4      3.61
       ... 
239    5.92
240    2.00
241    2.00
242    1.75
243    3.00
Name: tip, Length: 244, dtype: float64


In [20]:
df.loc[0]

total_bill     16.99
tip             1.01
sex           Female
smoker            No
day              Sun
time          Dinner
size               2
Name: 0, dtype: object

In [21]:
df.iloc[1]

total_bill     10.34
tip             1.66
sex             Male
smoker            No
day              Sun
time          Dinner
size               3
Name: 1, dtype: object

In [22]:
# Selecting a specific row and columns by integer indices
specific_data = df.iloc[3, [1, 2, 3]]  # row at index 10 and columns at indices 1, 2, and 3
specific_data

tip       3.31
sex       Male
smoker      No
Name: 3, dtype: object

In [23]:
multi_slice = df.iloc[10:15, 0:4]  # Rows 10 to 14 and columns 0 to 3
multi_slice

Unnamed: 0,total_bill,tip,sex,smoker
10,10.27,1.71,Male,No
11,35.26,5.0,Female,No
12,15.42,1.57,Male,No
13,18.43,3.0,Male,No
14,14.83,3.02,Female,No


In [24]:
bf=df.head(5)

In [25]:
bf

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [26]:
bf.at[2,'tip']

3.5

In [27]:
bf.iat[2,5]

'Dinner'

In [28]:
bf['Alpha']=['A','B','C','D','E']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bf['Alpha']=['A','B','C','D','E']


In [29]:
bf

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,Alpha
0,16.99,1.01,Female,No,Sun,Dinner,2,A
1,10.34,1.66,Male,No,Sun,Dinner,3,B
2,21.01,3.5,Male,No,Sun,Dinner,3,C
3,23.68,3.31,Male,No,Sun,Dinner,2,D
4,24.59,3.61,Female,No,Sun,Dinner,4,E


In [30]:
bf.drop('Alpha',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bf.drop('Alpha',axis=1,inplace=True)


In [31]:
bf

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [32]:
bf['smoker']=bf['smoker']+' never'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bf['smoker']=bf['smoker']+' never'


In [33]:
bf

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No never,Sun,Dinner,2
1,10.34,1.66,Male,No never,Sun,Dinner,3
2,21.01,3.5,Male,No never,Sun,Dinner,3
3,23.68,3.31,Male,No never,Sun,Dinner,2
4,24.59,3.61,Female,No never,Sun,Dinner,4


In [34]:
bf.drop(1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bf.drop(1,inplace=True)


In [35]:
bf

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No never,Sun,Dinner,2
2,21.01,3.5,Male,No never,Sun,Dinner,3
3,23.68,3.31,Male,No never,Sun,Dinner,2
4,24.59,3.61,Female,No never,Sun,Dinner,4


In [36]:
bf.dtypes

total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

In [37]:
bf.describe()

Unnamed: 0,total_bill,tip,size
count,4.0,4.0,4.0
mean,21.5675,2.8575,2.75
std,3.408933,1.237885,0.957427
min,16.99,1.01,2.0
25%,20.005,2.735,2.0
50%,22.345,3.405,2.5
75%,23.9075,3.5275,3.25
max,24.59,3.61,4.0


In [38]:

s = pd.Series([1, 2, np.nan, 4, np.nan])

print(s)

0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
dtype: float64


In [39]:
filled = s.fillna(9)
print(filled)

0    1.0
1    2.0
2    9.0
3    4.0
4    9.0
dtype: float64


In [40]:
s

0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
dtype: float64

In [41]:
s.dropna()

0    1.0
1    2.0
3    4.0
dtype: float64