In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r'data/abalone.csv')
df.head(3)

Unnamed: 0,Type,LongestShell,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9


In [3]:
# apply a function to each row in a column
df['Type'].apply(lambda x: 'Female' if x=='F' else 'Male').value_counts()

Type
Male      2870
Female    1307
Name: count, dtype: int64

In [4]:
# apply a function to each row 
def process_row(row):
    column = 'Height'
    row[column] *= 2
    return row

df.apply(process_row,axis=1).head(3) # here the axis=1 indicates apply to row 

Unnamed: 0,Type,LongestShell,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.19,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.18,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.27,0.677,0.2565,0.1415,0.21,9


In [5]:
# split the dataframe to smaller sub dataframes
condition = df['Height'] < 0.1
splited_df = []
for name, group in df.groupby(condition):
    splited_df.append(group)
print('group0: ', splited_df[0].Height.min()) # the first sub dataframe
print('group1: ', splited_df[1].Height.max()) # the second sub dataframe

group0:  0.1
group1:  0.095


In [6]:
# check if the values in the column are all false
print(df.all()) # just 'Height' column includes False value
print(df['Height'].apply(lambda x:True if x >0 else False).value_counts()) # check the false height value --> 0.0
print(df.loc[df['Height'].apply(lambda x:True if x >0 else False) == False,['Type','Height']].value_counts()) # check the Type of false Height

Type              True
LongestShell      True
Diameter          True
Height           False
WholeWeight       True
ShuckedWeight     True
VisceraWeight     True
ShellWeight       True
Rings             True
dtype: bool
Height
True     4175
False       2
Name: count, dtype: int64
Type  Height
I     0.0       2
Name: count, dtype: int64


In [7]:
# pd.melt is the implementation of unpivoting in DataFrame

df_sample = df[df['Rings']>25][['Type','Rings']].drop_duplicates()
print(df_sample)

pd.melt(df_sample,id_vars='Type',var_name='Melted_Variable',value_name='Melted_Value')

     Type  Rings
294     M     26
480     F     29
2108    M     27
2209    F     27


Unnamed: 0,Type,Melted_Variable,Melted_Value
0,M,Rings,26
1,F,Rings,29
2,M,Rings,27
3,F,Rings,27


In [8]:
# pd.pivot() is the implementation of pivoting in DataFrame
df_sample = df[df['Rings']>25][['Type','Rings','Height','Diameter']].drop_duplicates()
print(df_sample)

print(pd.pivot(df_sample,index='Rings',columns='Type')) # If not specified "value=", all remaining columns will be used with hierarchically indexed columns. 
pd.pivot(df_sample,index='Rings',columns='Type',values='Height') # only "Height" is used as pivoted value


     Type  Rings  Height  Diameter
294     M     26   0.195     0.495
480     F     29   0.185     0.585
2108    M     27   0.225     0.535
2209    F     27   0.180     0.465
      Height        Diameter       
Type       F      M        F      M
Rings                              
26       NaN  0.195      NaN  0.495
27     0.180  0.225    0.465  0.535
29     0.185    NaN    0.585    NaN


Type,F,M
Rings,Unnamed: 1_level_1,Unnamed: 2_level_1
26,,0.195
27,0.18,0.225
29,0.185,


In [9]:
# extract rows based on a list of indices
indices = [294,480,2108,2209]
df.iloc[indices][['Type','Rings','Height']]

Unnamed: 0,Type,Rings,Height
294,M,26,0.195
480,F,29,0.185
2108,M,27,0.225
2209,F,27,0.18


In [10]:
# merge two dataframes on specific colums

# columns to merge on
merge_cols = ['Type','Rings']
df1 = df[df['Rings']>25][['Type','Rings','Height']].drop_duplicates()
# if you want to specify which columns of the right dataframe that you want to return, you can select them before merging
pd.merge(df1,df[['Type','Rings','Height','ShellWeight']],how='left',on=merge_cols,suffixes=('_left','_right')) 

Unnamed: 0,Type,Rings,Height_left,Height_right,ShellWeight
0,M,26,0.195,0.195,0.375
1,F,29,0.185,0.185,0.475
2,M,27,0.225,0.225,0.885
3,F,27,0.18,0.18,0.525


In [11]:
# the value that appears most often. There can be multiple modes.

print(df['Type'].mode())
df.mode()

0    M
Name: Type, dtype: object


Unnamed: 0,Type,LongestShell,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.55,0.45,0.15,0.2225,0.175,0.1715,0.275,9.0
1,,0.625,,,,,,,


In [12]:
s = pd.Series([2, 4, 2, 2, 4,4, None])
s.mode()

0    2.0
1    4.0
dtype: float64

In [13]:
# cumulative calculation
print(df_sample)
df_sample.cumsum()

     Type  Rings  Height  Diameter
294     M     26   0.195     0.495
480     F     29   0.185     0.585
2108    M     27   0.225     0.535
2209    F     27   0.180     0.465


Unnamed: 0,Type,Rings,Height,Diameter
294,M,26,0.195,0.495
480,MF,55,0.38,1.08
2108,MFM,82,0.605,1.615
2209,MFMF,109,0.785,2.08


In [14]:
# rolling calculation
print(df_sample)
df_sample.loc[:,['Height','Diameter']].rolling(2).sum()

     Type  Rings  Height  Diameter
294     M     26   0.195     0.495
480     F     29   0.185     0.585
2108    M     27   0.225     0.535
2209    F     27   0.180     0.465


Unnamed: 0,Height,Diameter
294,,
480,0.38,1.08
2108,0.41,1.12
2209,0.405,1.0


In [15]:
# convert a df to dict
df_sample.to_dict() # column name as key, index as inner key

{'Type': {294: 'M', 480: 'F', 2108: 'M', 2209: 'F'},
 'Rings': {294: 26, 480: 29, 2108: 27, 2209: 27},
 'Height': {294: 0.195, 480: 0.185, 2108: 0.225, 2209: 0.18},
 'Diameter': {294: 0.495, 480: 0.585, 2108: 0.535, 2209: 0.465}}

In [16]:
df_sample.to_dict(orient='index') # index as key, column name as inner key

{294: {'Type': 'M', 'Rings': 26, 'Height': 0.195, 'Diameter': 0.495},
 480: {'Type': 'F', 'Rings': 29, 'Height': 0.185, 'Diameter': 0.585},
 2108: {'Type': 'M', 'Rings': 27, 'Height': 0.225, 'Diameter': 0.535},
 2209: {'Type': 'F', 'Rings': 27, 'Height': 0.18, 'Diameter': 0.465}}

In [17]:
# convert a df to json 
print(type(df_sample.to_json()))
df_sample.to_json()

#In Python, for example, JSON data is usually treated as a serialized string until it's deserialized (parsed) into a Python object like a dictionary or list.

<class 'str'>


'{"Type":{"294":"M","480":"F","2108":"M","2209":"F"},"Rings":{"294":26,"480":29,"2108":27,"2209":27},"Height":{"294":0.195,"480":0.185,"2108":0.225,"2209":0.18},"Diameter":{"294":0.495,"480":0.585,"2108":0.535,"2209":0.465}}'

Serialization is the process of converting an object (e.g., a Python dictionary, list, class instance) into a format <b>that can be easily stored or transmitted</b>. The result is usually a string or binary format.  
<b>Purpose</b>:  
    The serialized form can be saved to a file, transmitted over a network, or sent between systems (e.g., between different programming languages).  
<b>Common Formats</b>:  
- JSON (JavaScript Object Notation)  
- XML (Extensible Markup Language)  
- Binary formats (e.g., protocol buffers)    
<br>

Deserialization is the reverse process of serialization. It involves converting a serialized format (like a JSON string) back into an object <b>that the programming language can understand (e.g., a dictionary, list, or class instance)</b>.  
<b>Purpose</b>:  
    It allows you to reconstruct an object from a stored or transmitted format.

Why Use Serialization and Deserialization?  
- Data Transmission: Serialized data can be easily sent over the internet or between different systems (e.g., through APIs).
- Storage: Serialized data can be stored in databases or files for later retrieval and reuse.
- Interoperability: Serialization allows communication between different programming languages or platforms (e.g., Java and Python via JSON).
- State Saving: Serialized data can save the state of objects and later deserialize them to restore that state.


In [18]:
# drop row based on index range
df_sample.drop(df_sample.index[:1])

Unnamed: 0,Type,Rings,Height,Diameter
480,F,29,0.185,0.585
2108,M,27,0.225,0.535
2209,F,27,0.18,0.465


In [19]:
# reset index 
print(df_sample.reset_index(drop=False).columns) # the previous index becomes a new column
df_sample.reset_index(drop=False)


Index(['index', 'Type', 'Rings', 'Height', 'Diameter'], dtype='object')


Unnamed: 0,index,Type,Rings,Height,Diameter
0,294,M,26,0.195,0.495
1,480,F,29,0.185,0.585
2,2108,M,27,0.225,0.535
3,2209,F,27,0.18,0.465


In [20]:
print(df_sample.reset_index(drop=True).columns) # the previous index is dropped
df_sample.reset_index(drop=True)

Index(['Type', 'Rings', 'Height', 'Diameter'], dtype='object')


Unnamed: 0,Type,Rings,Height,Diameter
0,M,26,0.195,0.495
1,F,29,0.185,0.585
2,M,27,0.225,0.535
3,F,27,0.18,0.465
