In [2]:
import pandas as pd
import numpy as np

## Duplicates

Another important argument for drop_duplicates() is keep, which has three possible options:   

first: (default) Drop duplicates except for the first occurrence.   
last: Drop duplicates except for the last occurrence.   
False: Drop all duplicates.   
      
The two main methods that we will use are duplicated() and drop_duplicates().  The former returns a boolean series and the latter can be used to directly delete duplicate rows from a dataframe. For duplicated() method the inputs are:   
  
- keep   
    - "first": Mark duplicates as True except for the first occurrence.   
    - "last": Mark duplicates as True except for the last occurrence.   
    - False: Mark all duplicates as True   
      
For the drop_duplicates() method the keep arguments does the following.   
  
- keep  
    - "first": Drop duplicates except for the first occurrence.  
    - "last": Drop duplicates except for the last occurrence.  
    - False: Drop all duplicates   
      
The second arguments for both is:   

- subset: Only consider certain columns for identifying duplicates. If subset is not specific,  by default all of the columns will be used.

In [3]:
# Creating dataframe using dictionary
dict1 = {'Gender': ["Male", "Female", "Male", "Female","Male"], 
        'Married':["Yes", "No","No", "No","Yes"],
       'Loan_Status':["Yes", "No", "No", "No","Yes"]} 
  
df = pd.DataFrame(dict1)
df

Unnamed: 0,Gender,Married,Loan_Status
0,Male,Yes,Yes
1,Female,No,No
2,Male,No,No
3,Female,No,No
4,Male,Yes,Yes


In [5]:
# with keep = "first" - mark duplicates as True except for the first one

duplicates_first=df.duplicated(keep = "first")
duplicates_first

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [12]:


duplicates_last=df.duplicated(keep = "last")
duplicates_last

0     True
1     True
2    False
3    False
4    False
dtype: bool

In [17]:

duplicates_false=df.duplicated(keep = False)
duplicates_false

0     True
1     True
2    False
3     True
4     True
dtype: bool

In [19]:
df.loc[duplicates_first]

Unnamed: 0,Gender,Married,Loan_Status
3,Female,No,No
4,Male,Yes,Yes


In [10]:
# with keep = "first" - keep duplicates which occured first time and drop others

df1 = df.drop_duplicates(keep = "first")
df1

Unnamed: 0,Gender,Married,Loan_Status
0,Male,Yes,Yes
1,Female,No,No
2,Male,No,No


In [25]:
dup_last = df.duplicated(keep="first")
dup_last

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [26]:
~dup_last

0     True
1     True
2     True
3    False
4    False
dtype: bool

In [27]:
df.loc[~dup_last]

Unnamed: 0,Gender,Married,Loan_Status
0,Male,Yes,Yes
1,Female,No,No
2,Male,No,No


In [28]:
df1 = df.drop_duplicates(keep = "last")
df1

Unnamed: 0,Gender,Married,Loan_Status
2,Male,No,No
3,Female,No,No
4,Male,Yes,Yes


In [29]:
df1 = df.drop_duplicates(keep = False)
df1

Unnamed: 0,Gender,Married,Loan_Status
2,Male,No,No


In [30]:
df5 = df.drop_duplicates(keep = "last")
df5

Unnamed: 0,Gender,Married,Loan_Status
2,Male,No,No
3,Female,No,No
4,Male,Yes,Yes


In [32]:
df5.reset_index()

Unnamed: 0,index,Gender,Married,Loan_Status
0,2,Male,No,No
1,3,Female,No,No
2,4,Male,Yes,Yes


In [33]:
df5.reset_index(drop=True)

Unnamed: 0,Gender,Married,Loan_Status
0,Male,No,No
1,Female,No,No
2,Male,Yes,Yes


In [35]:
df5.reset_index(drop=True,inplace = True)

In [36]:
df5

Unnamed: 0,Gender,Married,Loan_Status
0,Male,No,No
1,Female,No,No
2,Male,Yes,Yes


In [37]:
df

Unnamed: 0,Gender,Married,Loan_Status
0,Male,Yes,Yes
1,Female,No,No
2,Male,No,No
3,Female,No,No
4,Male,Yes,Yes


In [16]:
#Using drop_duplicates() method

df.drop_duplicates(keep = "first", subset=["Gender"])

Unnamed: 0,Gender,Married,Loan_Status
0,Male,Yes,Yes
1,Female,No,No


# Pandas TIME

## date_range()
- ##### pandas.date_range() is one of the general functions in Pandas which is used to return a fixed frequency DatetimeIndex.

We see that it is a timestamp. Timestamps have lots of nice attributes that we can extract.
#Reset the column Issue Date to be a datetime
df_parking["Issue_Date"] = pd.to_datetime(df_parking["Issue_Date"]) 
I want Issue_Date to be a datetime and not a string! Let's convert it.


In [38]:
# Day wise
print(pd.date_range('2000/1/11', periods=5))

DatetimeIndex(['2000-01-11', '2000-01-12', '2000-01-13', '2000-01-14',
               '2000-01-15'],
              dtype='datetime64[ns]', freq='D')


In [43]:

print(pd.date_range('2000/5/10', periods=5,freq='Y'))

DatetimeIndex(['2000-12-31', '2001-12-31', '2002-12-31', '2003-12-31',
               '2004-12-31'],
              dtype='datetime64[ns]', freq='A-DEC')


In [44]:
start = pd.datetime(2011, 1, 1)
print('Start :',start)

end = pd.datetime(2011, 1, 20)
print("Stop :",end)

print(pd.date_range(start, end))

Start : 2011-01-01 00:00:00
Stop : 2011-01-20 00:00:00
DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', '2011-01-04',
               '2011-01-05', '2011-01-06', '2011-01-07', '2011-01-08',
               '2011-01-09', '2011-01-10', '2011-01-11', '2011-01-12',
               '2011-01-13', '2011-01-14', '2011-01-15', '2011-01-16',
               '2011-01-17', '2011-01-18', '2011-01-19', '2011-01-20'],
              dtype='datetime64[ns]', freq='D')


  start = pd.datetime(2011, 1, 1)
  end = pd.datetime(2011, 1, 20)


In [46]:
df = pd.DataFrame({'name': ['Tom', 'Andy', 'Lucas'],
                 'DoB': ['08-05-1997', '04-28-1996', '12/16/1995']})

In [47]:
df

Unnamed: 0,name,DoB
0,Tom,08-05-1997
1,Andy,04-28-1996
2,Lucas,12/16/1995


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      object
 1   DoB     3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [49]:
df['DoB'] = pd.to_datetime(df['DoB'])

In [50]:
df['DoB'].dt.year

0    1997
1    1996
2    1995
Name: DoB, dtype: int64

In [57]:
today = pd.to_datetime('today')


In [58]:
today

Timestamp('2022-05-24 11:10:10.755215')

In [59]:
today.year

2022

In [60]:
today.month

5

In [62]:
today.day

24

In [37]:
today.year - df['DoB'].dt.year

0    25
1    26
2    27
Name: DoB, dtype: int64

In [38]:
df["Age"] = today.year - df['DoB'].dt.year
df

Unnamed: 0,name,DoB,Age
0,Tom,1997-08-05,25
1,Andy,1996-04-28,26
2,Lucas,1995-12-16,27


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   name    3 non-null      object        
 1   DoB     3 non-null      datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 176.0+ bytes


In [25]:
help(df["DoB"].dt)

Help on DatetimeProperties in module pandas.core.indexes.accessors object:

class DatetimeProperties(Properties)
 |  DatetimeProperties(data: 'Series', orig)
 |  
 |  Accessor object for datetimelike properties of the Series values.
 |  
 |  Examples
 |  --------
 |  >>> seconds_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="s"))
 |  >>> seconds_series
 |  0   2000-01-01 00:00:00
 |  1   2000-01-01 00:00:01
 |  2   2000-01-01 00:00:02
 |  dtype: datetime64[ns]
 |  >>> seconds_series.dt.second
 |  0    0
 |  1    1
 |  2    2
 |  dtype: int64
 |  
 |  >>> hours_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="h"))
 |  >>> hours_series
 |  0   2000-01-01 00:00:00
 |  1   2000-01-01 01:00:00
 |  2   2000-01-01 02:00:00
 |  dtype: datetime64[ns]
 |  >>> hours_series.dt.hour
 |  0    0
 |  1    1
 |  2    2
 |  dtype: int64
 |  
 |  >>> quarters_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="q"))
 |  >>> quarters_series
 |  0   2000-03-31
 | 

In [26]:
df['DoB'].dt.year

0    1997
1    1996
2    1995
Name: DoB, dtype: int64

In [27]:
df['DoB'].dt.day

0     5
1    28
2    16
Name: DoB, dtype: int64

In [28]:
df['DoB'].dt.month

0     8
1     4
2    12
Name: DoB, dtype: int64

In [29]:
df['DoB'].dt.week

  df['DoB'].dt.week


0    32
1    17
2    50
Name: DoB, dtype: int64

In [31]:
df['DoB'].dt.day_name()

0     Tuesday
1      Sunday
2    Saturday
Name: DoB, dtype: object

In [32]:
today = pd.to_datetime('today')
today

Timestamp('2022-02-17 10:45:39.240481')

In [33]:
today.year

2022

#### Calculate Age

In [68]:
d1 = pd.to_datetime('2021/11/04')

In [69]:
d2 = pd.to_datetime("1998/10/16")

In [70]:
d1-d2

Timedelta('8420 days 00:00:00')

In [75]:
d1.year-d2.year

23