**Presented by: Reza Saadatyar (2024-2025)**<br/>
**E-mail: Reza.Saadatyar@outlook.com**

- Series
- DataFrame
- Missing data
- Get_dummies: Convert categorical variable into dummy/indicator variables
- Groupby

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

**Series**

In [3]:
series = pd.Series([7, 15, 4, 4, 7, 4], index=[
                   'r1', 'r2', 'r3', 'r4', 'r5', 'r6'])
series

r1     7
r2    15
r3     4
r4     4
r5     7
r6     4
dtype: int64

In [4]:
print(f"{series.r2=}")
print(f"\n {series[1]=}")
print(f"\n {series.values=}")
print(f"\n {series.index=}")
print(f"\n {series.value_counts=}")

series.r2=15

 series[1]=15

 series.values=array([ 7, 15,  4,  4,  7,  4], dtype=int64)

 series.index=Index(['r1', 'r2', 'r3', 'r4', 'r5', 'r6'], dtype='object')

 series.value_counts=<bound method IndexOpsMixin.value_counts of r1     7
r2    15
r3     4
r4     4
r5     7
r6     4
dtype: int64>


  print(f"\n {series[1]=}")


**DataFrame**

In [5]:
df = pd.DataFrame([[1, 8], [3, 4], [5, 9], [7, 2]], index=[
                  'A1', 'A2', 'A3', 'A4'], columns=['B1', 'B2'])
df

Unnamed: 0,B1,B2
A1,1,8
A2,3,4
A3,5,9
A4,7,2


In [6]:
pd.crosstab(df['B1'], df['B2'])

B2,2,4,8,9
B1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0,1,0
3,0,1,0,0
5,0,0,0,1
7,1,0,0,0


In [7]:
print(f"df[:3]:\n {df[:3]}")
print(f"\n df['B2']:\n{df['B2']}")

df[:3]:
     B1  B2
A1   1   8
A2   3   4
A3   5   9

 df['B2']:
A1    8
A2    4
A3    9
A4    2
Name: B2, dtype: int64


In [8]:
df.values

array([[1, 8],
       [3, 4],
       [5, 9],
       [7, 2]], dtype=int64)

In [9]:
df[df < 5]

Unnamed: 0,B1,B2
A1,1.0,
A2,3.0,4.0
A3,,
A4,,2.0


In [10]:
df.head()

Unnamed: 0,B1,B2
A1,1,8
A2,3,4
A3,5,9
A4,7,2


In [11]:
df.tail(2)

Unnamed: 0,B1,B2
A3,5,9
A4,7,2


In [12]:
df.index

Index(['A1', 'A2', 'A3', 'A4'], dtype='object')

In [13]:
df.columns

Index(['B1', 'B2'], dtype='object')

In [14]:
df.T

Unnamed: 0,A1,A2,A3,A4
B1,1,3,5,7
B2,8,4,9,2


In [15]:
df.drop('A3', axis=0)

Unnamed: 0,B1,B2
A1,1,8
A2,3,4
A4,7,2


In [16]:
df.drop('B2', axis=1)

Unnamed: 0,B1
A1,1
A2,3
A3,5
A4,7


In [17]:
df.describe()

Unnamed: 0,B1,B2
count,4.0,4.0
mean,4.0,5.75
std,2.581989,3.304038
min,1.0,2.0
25%,2.5,3.5
50%,4.0,6.0
75%,5.5,8.25
max,7.0,9.0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A1 to A4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   B1      4 non-null      int64
 1   B2      4 non-null      int64
dtypes: int64(2)
memory usage: 268.0+ bytes


In [19]:
df = pd.DataFrame({'x': [1, 5],  'b': [-1, 4], 'c': [5, 6]})
df

Unnamed: 0,x,b,c
0,1,-1,5
1,5,4,6


In [34]:
df.sort_index(axis=1)

Unnamed: 0,b,c,x
0,-1,5,1
1,4,6,5


In [20]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,x,c,b
0,1,5,-1
1,5,6,4


In [21]:
df.sort_values(by='x',ascending=False)

Unnamed: 0,x,b,c
1,5,4,6
0,1,-1,5


In [38]:
df[df.x < 5]

Unnamed: 0,x,b,c
0,1,-1,5


In [41]:
df.iloc[1][1], df.iloc[1]['b']

(4, 4)

In [55]:
np.random.seed(1)
pd.DataFrame(np.random.randn(6,4))

Unnamed: 0,0,1,2,3
0,1.624345,-0.611756,-0.528172,-1.072969
1,0.865408,-2.301539,1.744812,-0.761207
2,0.319039,-0.24937,1.462108,-2.060141
3,-0.322417,-0.384054,1.133769,-1.099891
4,-0.172428,-0.877858,0.042214,0.582815
5,-1.100619,1.144724,0.901591,0.502494


**Missing data**

In [6]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0], 
                   [3, 4, np.nan, 1], 
                   [np.nan, np.nan, np.nan, 5], 
                   [np.nan, 3, np.nan, 4],
                   [np.nan, np.nan, np.nan, np.nan]],
                    columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,5.0
3,,3.0,,4.0
4,,,,


In [44]:
df.isnull().sum()

A    4
B    2
C    5
D    1
dtype: int64

In [45]:
df.isna()

Unnamed: 0,A,B,C,D
0,True,False,True,False
1,False,False,True,False
2,True,True,True,False
3,True,False,True,False
4,True,True,True,True


In [46]:
df.notna()

Unnamed: 0,A,B,C,D
0,False,True,False,True
1,True,True,False,True
2,False,False,False,True
3,False,True,False,True
4,False,False,False,False


In [47]:
pd.isna(df['A'])

0     True
1    False
2     True
3     True
4     True
Name: A, dtype: bool

In [48]:
df.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0.0
1,3.0,4.0,0.0,1.0
2,0.0,0.0,0.0,5.0
3,0.0,3.0,0.0,4.0
4,0.0,0.0,0.0,0.0


In [49]:
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,3.0,2.0,,0.0
1,3.0,4.0,,1.0
2,3.0,3.0,,5.0
3,3.0,3.0,,4.0
4,3.0,3.0,,2.5


In [50]:
df.fillna(df.mean()['A':'B'])

Unnamed: 0,A,B,C,D
0,3.0,2.0,,0.0
1,3.0,4.0,,1.0
2,3.0,3.0,,5.0
3,3.0,3.0,,4.0
4,3.0,3.0,,


In [51]:
df.fillna(method='ffill')

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,3.0,4.0,,5.0
3,3.0,3.0,,4.0
4,3.0,3.0,,4.0


In [53]:
df.fillna(value={'A': 0, 'B': 1, 'C': 2, 'D': 3}, limit=2)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0.0
1,3.0,4.0,2.0,1.0
2,0.0,1.0,,5.0
3,,3.0,,4.0
4,,1.0,,3.0


In [54]:
df.fillna(value={'A': 0, 'B': 1, 'C': 2, 'D': 3})

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0.0
1,3.0,4.0,2.0,1.0
2,0.0,1.0,2.0,5.0
3,0.0,3.0,2.0,4.0
4,0.0,1.0,2.0,3.0


In [59]:
df.dropna(axis=0)

Unnamed: 0,A,B,C,D


In [60]:
df['A'].dropna()

1    3.0
Name: A, dtype: float64

In [62]:
print(f"{df = }")
df.dropna(thresh=2)  # drop rows that have not at least 2 non-NaN values

df =      A    B   C    D
0  NaN  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
2  NaN  NaN NaN  5.0
3  NaN  3.0 NaN  4.0
4  NaN  NaN NaN  NaN


Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
3,,3.0,,4.0


In [63]:
print(f"{df = }")
df.dropna(how='all')  #only drop rows where all columns are NaN

df =      A    B   C    D
0  NaN  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
2  NaN  NaN NaN  5.0
3  NaN  3.0 NaN  4.0
4  NaN  NaN NaN  NaN


Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,5.0
3,,3.0,,4.0


In [64]:
df.dropna(subset=['B'])  # only drop rows where NaN appear in specific columns B

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
3,,3.0,,4.0


In [76]:
imputer = SimpleImputer(strategy='mean')     # Create a SimpleImputer with strategy='mean'
imputed_data = imputer.fit_transform(df)     # Fit the imputer on the data and transform it
df_imputed = pd.DataFrame(imputed_data, columns=df.columns) # Create a new DataFrame with imputed values
df_imputed

Unnamed: 0,A,B,C,D
0,1.0,6.0,11.0,16.0
1,2.0,8.25,12.0,17.0
2,3.0,8.0,13.0,18.0
3,4.0,9.0,14.0,17.75
4,5.0,10.0,15.0,20.0


**Get_dummies**

In [34]:
df = pd.read_csv("C:/Users/Reza/Documents/Pandas/Code/train_Titanic.csv")
df.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [26]:
pd.crosstab(df['Sex'], df['Survived'])

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,81,233
male,468,109


In [35]:
df = df.drop(['PassengerId','Name','Ticket'], axis=1)        # Drop unnecessary columns
df.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,3,male,22.0,1,0,7.25,,S,0
1,1,female,38.0,1,0,71.2833,C85,C,1
2,3,female,26.0,0,0,7.925,,S,1
3,1,female,35.0,1,0,53.1,C123,S,1
4,3,male,35.0,0,0,8.05,,S,0


In [36]:
edt  = pd.get_dummies(df['Embarked'], dtype=int)      # Convert categorical variable into dummy/indicator variables
edt

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [37]:
df = df.join(edt)
df.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived,C,Q,S
0,3,male,22.0,1,0,7.25,,S,0,0,0,1
1,1,female,38.0,1,0,71.2833,C85,C,1,1,0,0
2,3,female,26.0,0,0,7.925,,S,1,0,0,1
3,1,female,35.0,1,0,53.1,C123,S,1,0,0,1
4,3,male,35.0,0,0,8.05,,S,0,0,0,1


In [38]:
df.drop(['Embarked'], axis=1,inplace=True)
df.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Survived,C,Q,S
0,3,male,22.0,1,0,7.25,,0,0,0,1
1,1,female,38.0,1,0,71.2833,C85,1,1,0,0
2,3,female,26.0,0,0,7.925,,1,0,0,1
3,1,female,35.0,1,0,53.1,C123,1,0,0,1
4,3,male,35.0,0,0,8.05,,0,0,0,1


**Groupby**

In [39]:
df = pd.DataFrame({'Bird' : ['A', 'A', 'B', 'B', 'B'],'Speed' : [380, 370, 24, 26,np.nan]})
df

Unnamed: 0,Bird,Speed
0,A,380.0
1,A,370.0
2,B,24.0
3,B,26.0
4,B,


In [40]:
df.groupby(['Bird']).mean()

Unnamed: 0_level_0,Speed
Bird,Unnamed: 1_level_1
A,375.0
B,25.0


In [42]:
df['Speed'] = df.groupby(['Bird'])['Speed'].transform(lambda x: x.fillna(x.mean()))
df    # 4 --> B ---> df.groupby(['Bird']).mean()

Unnamed: 0,Bird,Speed
0,A,380.0
1,A,370.0
2,B,24.0
3,B,26.0
4,B,25.0


In [12]:
a = {"a":1, "b":2}
print(f"{a.keys() = }")
print(f"{a.values() = }")
print(f"{a.get("a") = }, {a.get("c") = }")
a["c"] = 5
print(f"{a = }")
print(f"{a.items() = }")

a.keys() = dict_keys(['a', 'b'])
a.values() = dict_values([1, 2])
a.get("a") = 1, a.get("c") = None
a = {'a': 1, 'b': 2, 'c': 5}
a.items() = dict_items([('a', 1), ('b', 2), ('c', 5)])
