# Working with Text Data in pandas

In [1]:
import pandas as pd

In [2]:
# Five sample sentences; each starts with a weekday name followed by a colon
# and mentions at least one clock time with an am/pm marker.
time_sentences = ["Monday: The doctor's appointment is at 2:45pm.", 
                  "Tuesday: The dentist's appointment is at 11:30 am.",
                  "Wednesday: At 7:00pm, there is a basketball game!",
                  "Thursday: Be back home by 11:15 pm at the latest.",
                  "Friday: Take the train at 08:10 am, arrive at 09:00am."]

In [21]:
# One row per sentence in a single 'Text' column; the bare name on the last
# line displays the frame as the cell output.
data = pd.DataFrame(time_sentences, columns=['Text'])
data

Unnamed: 0,Text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30...
2,"Wednesday: At 7:00pm, there is a basketball game!"
3,Thursday: Be back home by 11:15 pm at the latest.
4,"Friday: Take the train at 08:10 am, arrive at ..."


## Find all the text lines that contain an appointment

In [25]:
# Seed the new column with the raw text; a later cell overwrites it with the
# result of appointment_checker.
data['Appointments'] = data['Text']

In [26]:
def appointment_checker(x):
    """Report whether the text ``x`` contains the substring 'appointment'.

    The lookup is case-sensitive.

    Parameters
    ----------
    x : str
        Text to inspect.

    Returns
    -------
    bool
        ``True`` if 'appointment' occurs anywhere in ``x``, else ``False``.
    """
    # str.find yields -1 when the substring is absent.
    return x.find('appointment') != -1

In [27]:
data.Appointments

0       Monday: The doctor's appointment is at 2:45pm.
1    Tuesday: The dentist's appointment is at 11:30...
2    Wednesday: At 7:00pm, there is a basketball game!
3    Thursday: Be back home by 11:15 pm at the latest.
4    Friday: Take the train at 08:10 am, arrive at ...
Name: Appointments, dtype: object

In [28]:
# Replace each sentence with the True/False flag returned by appointment_checker.
data['Appointments'] = data['Appointments'].apply(appointment_checker)

In [29]:
data.head()

Unnamed: 0,Text,Appointments
0,Monday: The doctor's appointment is at 2:45pm.,True
1,Tuesday: The dentist's appointment is at 11:30...,True
2,"Wednesday: At 7:00pm, there is a basketball game!",False
3,Thursday: Be back home by 11:15 pm at the latest.,False
4,"Friday: Take the train at 08:10 am, arrive at ...",False


## Extract the weekday from the data

In [132]:
# Seed the new column with the raw text; a later cell replaces it with the
# output of week_extractor.
data['WeekDay'] = data['Text']

In [34]:
import re

In [133]:
def week_extractor(x):
    """Return the first '...day' token found in ``x``.

    The match is the first occurrence of one or more word characters
    immediately followed by the lowercase letters 'day' (for example
    'Monday' in "Monday: The doctor's appointment ...").

    Parameters
    ----------
    x : str
        Text to search.

    Returns
    -------
    str or None
        The matched text, or ``None`` when ``x`` contains no such token, so
        that ``Series.apply`` produces a missing value instead of raising.
    """
    # Raw string: '\w' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    match = re.search(r'\w+day', x)
    return match.group(0) if match is not None else None

In [134]:
# Replace each sentence with the value week_extractor returns for it.
data['WeekDay'] = data['WeekDay'].apply(week_extractor)

## Extract time from the dataset

In [121]:
# Seed the new column with the raw text; a later cell replaces it with the
# output of time_extractor.
data['Time'] = data['Text']

In [122]:
def time_extractor(x):
    """Return the first clock time in ``x``, including its am/pm marker if any.

    A time is ``<digits>:<digits>`` optionally followed by whitespace and a
    lowercase 'am' or 'pm' (e.g. '2:45pm', '11:30 am'). Only an am/pm marker
    is kept after the digits, so an unrelated following word is never
    included ("2:45 today" yields '2:45').

    Parameters
    ----------
    x : str
        Text to search.

    Returns
    -------
    str or None
        The matched time text, or ``None`` when ``x`` contains no time, so
        that ``Series.apply`` produces a missing value instead of raising.
    """
    # Raw string avoids the invalid '\s' escape of a plain literal. The
    # trailing \b stops 'am'/'pm' from matching the start of a longer word.
    match = re.search(r"[0-9]+:[0-9]+(?:\s*(?:am|pm)\b)?", x)
    return match.group(0) if match is not None else None

In [123]:
# Replace each sentence with the value time_extractor returns for it.
data['Time'] = data['Time'].apply(time_extractor)

In [135]:
data.head()

Unnamed: 0,Text,Appointments,WeekDay,Detail,Time
0,Monday: The doctor's appointment is at 2:45pm.,True,Monday,pm,2:45pm
1,Tuesday: The dentist's appointment is at 11:30...,True,Tuesday,am,11:30 am
2,"Wednesday: At 7:00pm, there is a basketball game!",False,Wednesday,pm,7:00pm
3,Thursday: Be back home by 11:15 pm at the latest.,False,Thursday,pm,11:15 pm
4,"Friday: Take the train at 08:10 am, arrive at ...",False,Friday,am,08:10 am


## Extract the time detail: either am or pm

In [125]:
# Seed from the current Time column rather than from the raw text; a later
# cell replaces it with the output of noon_extractor.
data['Detail'] = data['Time']

In [126]:
def noon_extractor(x):
    """Return the first run of lowercase ASCII letters found in ``x``.

    For a time string such as '2:45pm' or '11:30 am' this is the 'pm'/'am'
    marker.

    Parameters
    ----------
    x : str
        Text to search.

    Returns
    -------
    str
        The first match of one or more characters in the range a-z.

    Raises
    ------
    AttributeError
        If ``x`` contains no lowercase letter (the search result is ``None``).
    """
    lowercase_run = re.compile("[a-z]+").search(x)
    return lowercase_run.group()

In [127]:
# Replace each value with what noon_extractor returns for it.
data['Detail'] = data['Detail'].apply(noon_extractor)

In [136]:
data.head()

Unnamed: 0,Text,Appointments,WeekDay,Detail,Time
0,Monday: The doctor's appointment is at 2:45pm.,True,Monday,pm,2:45pm
1,Tuesday: The dentist's appointment is at 11:30...,True,Tuesday,am,11:30 am
2,"Wednesday: At 7:00pm, there is a basketball game!",False,Wednesday,pm,7:00pm
3,Thursday: Be back home by 11:15 pm at the latest.,False,Thursday,pm,11:15 pm
4,"Friday: Take the train at 08:10 am, arrive at ...",False,Friday,am,08:10 am


## Remove the am/pm marker from the Time column

In [145]:
def time_noon_extractor(x):
    """Return the first ``<digits>:<digits>`` portion of ``x``.

    For a time string such as '2:45pm' or '11:30 am' this drops the am/pm
    marker and returns '2:45' or '11:30'. Both digit groups are optional in
    the pattern, so the first colon in ``x`` anchors the match.

    Parameters
    ----------
    x : str
        Text to search.

    Returns
    -------
    str
        The matched text.

    Raises
    ------
    AttributeError
        If ``x`` contains no colon (the search result is ``None``).
    """
    pattern = "[0-9]*:[0-9]*"
    found = re.search(pattern, x)
    return found.group()

In [146]:
# Overwrite Time in place with the value time_noon_extractor returns for each entry.
data['Time'] = data['Time'].apply(time_noon_extractor)

In [147]:
data.head()

Unnamed: 0,Text,Appointments,WeekDay,Detail,Time
0,Monday: The doctor's appointment is at 2:45pm.,True,Monday,pm,2:45
1,Tuesday: The dentist's appointment is at 11:30...,True,Tuesday,am,11:30
2,"Wednesday: At 7:00pm, there is a basketball game!",False,Wednesday,pm,7:00
3,Thursday: Be back home by 11:15 pm at the latest.,False,Thursday,pm,11:15
4,"Friday: Take the train at 08:10 am, arrive at ...",False,Friday,am,08:10


## Extract important elements (words longer than three characters) from the dataset

In [162]:
# Seed the new column with the raw text; a later cell replaces it with the
# output of aspects_extractor.
data['Aspects'] = data['Text']

In [161]:
def aspects_extractor(x):
    """Return the space-separated tokens of ``x`` that exceed three characters.

    Tokens are produced by splitting on single spaces only, so punctuation
    attached to a word stays part of the token and counts toward its length
    (e.g. 'Monday:' or '2:45pm.').

    Parameters
    ----------
    x : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens with length greater than 3, in their original order.
    """
    kept = []
    for token in x.split(' '):
        if len(token) > 3:
            kept.append(token)
    return kept

In [163]:
# Replace each sentence with the list of tokens aspects_extractor returns for it.
data['Aspects'] = data['Aspects'].apply(aspects_extractor)

In [164]:
data.head()

Unnamed: 0,Text,Appointments,WeekDay,Detail,Time,Aspects
0,Monday: The doctor's appointment is at 2:45pm.,True,Monday,pm,2:45,"[Monday:, doctor's, appointment, 2:45pm.]"
1,Tuesday: The dentist's appointment is at 11:30...,True,Tuesday,am,11:30,"[Tuesday:, dentist's, appointment, 11:30]"
2,"Wednesday: At 7:00pm, there is a basketball game!",False,Wednesday,pm,7:00,"[Wednesday:, 7:00pm,, there, basketball, game!]"
3,Thursday: Be back home by 11:15 pm at the latest.,False,Thursday,pm,11:15,"[Thursday:, back, home, 11:15, latest.]"
4,"Friday: Take the train at 08:10 am, arrive at ...",False,Friday,am,08:10,"[Friday:, Take, train, 08:10, arrive, 09:00am.]"
