# Overloaded Operators

In [2]:
import pandas as pd

In [3]:
# Source data: a Google Sheet published as CSV, read straight from its URL.
titanic_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vReZBM5OC6GLYbacisp_ToNiu3CLWxqPXw7mWBsdRjnYOFLWNufdQ4qd8u5qTzUF2_sBUAMEi5cgy1U/pub?gid=1040198428&single=true&output=csv'
df = pd.read_csv(titanic_url)
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Merge the two family-count columns ('SibSp' and 'Parch') into one feature,
# 'TotalFamily', by element-wise addition, then discard the two originals.
df = (
    df.assign(TotalFamily=df['SibSp'].add(df['Parch']))
      .drop(columns=['SibSp', 'Parch'])
)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,TotalFamily
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,,S,0


In [5]:
# Bucket 'Age' into decades: decimals=-1 rounds to the nearest multiple of ten.
# NOTE(review): exact ties appear to round half-to-even (35 -> 40 in the
# preview; 25 would presumably give 20) -- confirm that is the intended bucketing.
df = df.assign(Age=df['Age'].round(decimals=-1))
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,TotalFamily
0,1,0,3,"Braund, Mr. Owen Harris",male,20.0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,40.0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,30.0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,40.0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,40.0,373450,8.05,,S,0


In [6]:
# The + operator concatenates text only when both operands are strings, so the
# numeric 'Age' values are cast to the 'string' dtype inside the expression
# (astype returns a converted copy; the stored 'Age' column is not modified).
# The two source columns are dropped once the combined feature exists.
df = (
    df.assign(GenderAge=df['Sex'] + df['Age'].astype('string'))
      .drop(columns=['Sex', 'Age'])
)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Ticket,Fare,Cabin,Embarked,TotalFamily,GenderAge
0,1,0,3,"Braund, Mr. Owen Harris",A/5 21171,7.25,,S,1,male20.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",PC 17599,71.2833,C85,C,1,female40.0
2,3,1,3,"Heikkinen, Miss. Laina",STON/O2. 3101282,7.925,,S,0,female30.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803,53.1,C123,S,1,female40.0
4,5,0,3,"Allen, Mr. William Henry",373450,8.05,,S,0,male40.0


In [7]:
# Arithmetic operators work element-wise across columns: scale every fare by
# the square of its passenger class, then drop the raw 'Fare' column.
df = (
    df.assign(NormedFare=df['Fare'].mul(df['Pclass'].pow(2)))
      .drop(columns='Fare')
)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Ticket,Cabin,Embarked,TotalFamily,GenderAge,NormedFare
0,1,0,3,"Braund, Mr. Owen Harris",A/5 21171,,S,1,male20.0,65.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",PC 17599,C85,C,1,female40.0,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",STON/O2. 3101282,,S,0,female30.0,71.325
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803,C123,S,1,female40.0,53.1
4,5,0,3,"Allen, Mr. William Henry",373450,,S,0,male40.0,72.45


# Strings

In [8]:
# Create 'LastName' and 'FirstName' by splitting 'Name' ("Last, First ...").
# n=1 splits at the FIRST comma only, so the result always has at most two
# pieces; without it, a single name containing a second comma would produce a
# third column and make the two-column assignment fail.
# The split is on ',' alone, so the second piece keeps its leading space.
df[['LastName', 'FirstName']] = df['Name'].str.split(',', n=1, expand=True)
# The original 'Name' column is now redundant.
df = df.drop(columns='Name')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Ticket,Cabin,Embarked,TotalFamily,GenderAge,NormedFare,LastName,FirstName
0,1,0,3,A/5 21171,,S,1,male20.0,65.25,Braund,Mr. Owen Harris
1,2,1,1,PC 17599,C85,C,1,female40.0,71.2833,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,3,1,3,STON/O2. 3101282,,S,0,female30.0,71.325,Heikkinen,Miss. Laina
3,4,1,1,113803,C123,S,1,female40.0,53.1,Futrelle,Mrs. Jacques Heath (Lily May Peel)
4,5,0,3,373450,,S,0,male40.0,72.45,Allen,Mr. William Henry


In [9]:
# Look at one raw value: because the split was on ',' and not ', ', the space
# that followed the comma is still at the front of the string.
df.at[0, 'FirstName']

' Mr. Owen Harris'

In [10]:
# pandas Series.str.strip() removes whitespace from both ends of each string.
df = df.assign(FirstName=df['FirstName'].str.strip())
# Same cell as before, now without the leading space.
df.at[0, 'FirstName']

'Mr. Owen Harris'

In [11]:
# Rebuild a presentable full name with +: first name, a literal single space,
# then last name. The two helper columns are dropped afterwards.
df = (
    df.assign(Name=df['FirstName'] + ' ' + df['LastName'])
      .drop(columns=['LastName', 'FirstName'])
)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Ticket,Cabin,Embarked,TotalFamily,GenderAge,NormedFare,Name
0,1,0,3,A/5 21171,,S,1,male20.0,65.25,Mr. Owen Harris Braund
1,2,1,1,PC 17599,C85,C,1,female40.0,71.2833,Mrs. John Bradley (Florence Briggs Thayer) Cum...
2,3,1,3,STON/O2. 3101282,,S,0,female30.0,71.325,Miss. Laina Heikkinen
3,4,1,1,113803,C123,S,1,female40.0,53.1,Mrs. Jacques Heath (Lily May Peel) Futrelle
4,5,0,3,373450,,S,0,male40.0,72.45,Mr. William Henry Allen


# Datetime

In [12]:
import pandas as pd
# Second dataset: another Google Sheet published as CSV. Only the four columns
# used in the datetime examples are read.
df2_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSrgrUnz8mdosU-_k0aECouymqwds_mlaHpYlXzRtf7MBJ4N1r1inCfSDebaXwTVfLtH133EhwKf3mi/pub?gid=394699239&single=true&output=csv'
df2 = pd.read_csv(df2_url, usecols=['date', 'price', 'bedrooms', 'bathrooms'])
df2.head()

Unnamed: 0,date,price,bedrooms,bathrooms
0,20141013T000000,221900.0,3,1.0
1,20141209T000000,538000.0,3,2.25
2,20150225T000000,180000.0,2,1.0
3,20141209T000000,604000.0,4,3.0
4,20150218T000000,510000.0,3,2.0


In [13]:
# Convert 'date' from text to datetime. The raw values look like
# '20141013T000000' (YYYYMMDD, a literal 'T', then HHMMSS), so the layout is
# stated explicitly instead of being inferred: parsing is deterministic, and a
# value that does not match the layout raises instead of being guessed at.
df2['date'] = pd.to_datetime(df2['date'], format='%Y%m%dT%H%M%S')
# Verify the resulting dtype of each column.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       21613 non-null  datetime64[ns]
 1   price      21613 non-null  float64       
 2   bedrooms   21613 non-null  int64         
 3   bathrooms  21613 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1)
memory usage: 675.5 KB


In [14]:
# The .dt accessor exposes the components of a datetime column; each one
# becomes its own feature. A dict is unpacked into assign() because several
# of the new column names contain spaces and parentheses.
df2 = df2.assign(**{
    'year': df2['date'].dt.year,
    'month (numeric)': df2['date'].dt.month,
    'month (name)': df2['date'].dt.month_name(),
    'day of month': df2['date'].dt.day,
    # weekday is numeric with Monday reported as 0 (see the preview below).
    'day of week (numeric)': df2['date'].dt.weekday,
    'day of week (name)': df2['date'].dt.day_name(),
})
df2.head()

Unnamed: 0,date,price,bedrooms,bathrooms,year,month (numeric),month (name),day of month,day of week (numeric),day of week (name)
0,2014-10-13,221900.0,3,1.0,2014,10,October,13,0,Monday
1,2014-12-09,538000.0,3,2.25,2014,12,December,9,1,Tuesday
2,2015-02-25,180000.0,2,1.0,2015,2,February,25,2,Wednesday
3,2014-12-09,604000.0,4,3.0,2014,12,December,9,1,Tuesday
4,2015-02-18,510000.0,3,2.0,2015,2,February,18,2,Wednesday


# Functions

In [15]:
# Back to the Titanic data: reload a fresh, unmodified copy from the same URL
# so the earlier column changes do not carry over into this section.
titanic_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vReZBM5OC6GLYbacisp_ToNiu3CLWxqPXw7mWBsdRjnYOFLWNufdQ4qd8u5qTzUF2_sBUAMEi5cgy1U/pub?gid=1040198428&single=true&output=csv'
df = pd.read_csv(titanic_url)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
# Median of the 'Fare' column; bin_fare() uses it as the cheap/expensive cutoff.
# skipna=True is the default and is written out only to make it visible that
# missing fares are ignored when the median is computed.
median_fare = df['Fare'].median(skipna=True)
# define a function that returns 'Expensive' or 'Cheap'
def bin_fare(fare, threshold=None):
    """Label a single fare as 'Expensive' or 'Cheap'.

    Parameters
    ----------
    fare : float
        One value taken from the 'Fare' column.
    threshold : float, optional
        Cutoff to compare against. A fare strictly greater than the cutoff is
        'Expensive'; anything else is 'Cheap'. When omitted, the notebook-level
        ``median_fare`` is used, so ``Series.apply(bin_fare)`` keeps working.

    Returns
    -------
    str or missing value
        'Expensive' or 'Cheap'; a missing fare is returned unchanged.
    """
    # Any comparison with NaN is False, so without this guard a missing fare
    # would fall through to the else-branch and be labelled 'Cheap'.
    if pd.isna(fare):
        return fare
    if threshold is None:
        threshold = median_fare
    return 'Expensive' if fare > threshold else 'Cheap'

In [17]:
# Run bin_fare() on every value of 'Fare', replacing each number with its label.
df = df.assign(Fare=df['Fare'].apply(bin_fare))
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,Cheap,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,Expensive,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,Cheap,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,Expensive,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,Cheap,,S


In [18]:
# Replace each age with a label: strictly greater than 30 -> 'elderly',
# otherwise 'young'.
# A missing age fails the `> 30` comparison, so a plain if/else would label it
# 'young'. The final .where() blanks the label wherever the age was missing, so
# unknown ages stay missing instead of being counted as 'young'.
df['Age'] = (
    df['Age'].gt(30)
             .map({True: 'elderly', False: 'young'})
             .where(df['Age'].notna())
)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,young,1,0,A/5 21171,Cheap,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,elderly,1,0,PC 17599,Expensive,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,young,0,0,STON/O2. 3101282,Cheap,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,elderly,1,0,113803,Expensive,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,elderly,0,0,373450,Cheap,,S
