In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df=pd.read_csv("/content/titanic.csv")

In [4]:
df.head()

Unnamed: 0,Cabin,Ticket,number,Survived
0,,A/5 21171,5,0
1,C85,PC 17599,3,1
2,,STON/O2. 3101282,6,1
3,C123,113803,3,1
4,,373450,A,0


#Handling Mixed Data

In [5]:
df['number'].unique()

array(['5', '3', '6', 'A', '2', '1', '4'], dtype=object)

Number column contains digits as well as alphabets.  
Extracting digits and alphabets into different columns

In [6]:
df['numerical']=pd.to_numeric(df['number'],
                              errors='coerce',
                              downcast='integer')
df['categorical']=np.where(df['numerical'].isnull(),df['number'],np.nan)
df.head()

Unnamed: 0,Cabin,Ticket,number,Survived,numerical,categorical
0,,A/5 21171,5,0,5.0,
1,C85,PC 17599,3,1,3.0,
2,,STON/O2. 3101282,6,1,6.0,
3,C123,113803,3,1,3.0,
4,,373450,A,0,,A


Handling columns with alphanumeric values

In [7]:
df['cabin_num']=df['Cabin'].str.extract('(\d+)')
df['cabin_cat']=df['Cabin'].str[0]
df.head()

  df['cabin_num']=df['Cabin'].str.extract('(\d+)')


Unnamed: 0,Cabin,Ticket,number,Survived,numerical,categorical,cabin_num,cabin_cat
0,,A/5 21171,5,0,5.0,,,
1,C85,PC 17599,3,1,3.0,,85.0,C
2,,STON/O2. 3101282,6,1,6.0,,,
3,C123,113803,3,1,3.0,,123.0,C
4,,373450,A,0,,A,,


In [8]:
df['ticket_num']=df['Ticket'].apply(lambda s:s.split()[-1])
df['ticket_num']=pd.to_numeric(df['ticket_num'],
                               errors='coerce',
                               downcast='integer')
df['ticket_cat']=df['Ticket'].apply(lambda s:s.split()[0])
df['ticket_cat']=np.where(df['ticket_cat'].str.isdigit(),np.nan,df['Ticket'])
df.head()

Unnamed: 0,Cabin,Ticket,number,Survived,numerical,categorical,cabin_num,cabin_cat,ticket_num,ticket_cat
0,,A/5 21171,5,0,5.0,,,,21171.0,A/5 21171
1,C85,PC 17599,3,1,3.0,,85.0,C,17599.0,PC 17599
2,,STON/O2. 3101282,6,1,6.0,,,,3101282.0,STON/O2. 3101282
3,C123,113803,3,1,3.0,,123.0,C,113803.0,
4,,373450,A,0,,A,,,373450.0,


#Handling date and time column

In [49]:
time=pd.read_csv('/content/messages.csv')
date=pd.read_csv('/content/orders.csv')

In [50]:
date.head()

Unnamed: 0,date,product_id,city_id,orders
0,2019-12-10,5628,25,3
1,2018-08-15,3646,14,157
2,2018-10-23,1859,25,1
3,2019-08-17,7292,25,1
4,2019-01-06,4344,25,3


In [51]:
time.head()

Unnamed: 0,date,msg
0,2013-12-15 00:50:00,ищу на сегодня мужика 37
1,2014-04-29 23:40:00,ПАРЕНЬ БИ ИЩЕТ ДРУГА СЕЙЧАС!! СМС ММС 0955532826
2,2012-12-30 00:21:00,Днепр.м 43 позн.с д/ж *.о 067.16.34.576
3,2014-11-28 00:31:00,КИЕВ ИЩУ Д/Ж ДО 45 МНЕ СЕЙЧАС СКУЧНО 093 629 9...
4,2013-10-26 23:11:00,Зая я тебя никогда не обижу люблю тебя!) Даше


Handling Date column

In [52]:
date['date']=pd.to_datetime(date['date'])

In [53]:
date['year']=date['date'].dt.year

In [54]:
date['month']=date['date'].dt.month

In [55]:
date['month_name']=date['date'].dt.month_name()

In [56]:
date['day']=date['date'].dt.day

In [57]:
date.head()

Unnamed: 0,date,product_id,city_id,orders,year,month,month_name,day
0,2019-12-10,5628,25,3,2019,12,December,10
1,2018-08-15,3646,14,157,2018,8,August,15
2,2018-10-23,1859,25,1,2018,10,October,23
3,2019-08-17,7292,25,1,2019,8,August,17
4,2019-01-06,4344,25,3,2019,1,January,6


Handling time

In [58]:
time.head()

Unnamed: 0,date,msg
0,2013-12-15 00:50:00,ищу на сегодня мужика 37
1,2014-04-29 23:40:00,ПАРЕНЬ БИ ИЩЕТ ДРУГА СЕЙЧАС!! СМС ММС 0955532826
2,2012-12-30 00:21:00,Днепр.м 43 позн.с д/ж *.о 067.16.34.576
3,2014-11-28 00:31:00,КИЕВ ИЩУ Д/Ж ДО 45 МНЕ СЕЙЧАС СКУЧНО 093 629 9...
4,2013-10-26 23:11:00,Зая я тебя никогда не обижу люблю тебя!) Даше


In [61]:
time['date']=pd.to_datetime(time['date'])

In [62]:
time.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1000 non-null   datetime64[ns]
 1   msg     1000 non-null   object        
 2   time    1000 non-null   datetime64[ns]
dtypes: datetime64[ns](2), object(1)
memory usage: 23.6+ KB


In [64]:
time['hour']=time['date'].dt.hour

In [65]:
time['minutes']=time['date'].dt.minute

In [66]:
time['seconds']=time['date'].dt.second

In [67]:
time.head()

Unnamed: 0,date,msg,time,hour,minutes,seconds
0,2013-12-15 00:50:00,ищу на сегодня мужика 37,2013-12-15 00:50:00,0,50,0
1,2014-04-29 23:40:00,ПАРЕНЬ БИ ИЩЕТ ДРУГА СЕЙЧАС!! СМС ММС 0955532826,2014-04-29 23:40:00,23,40,0
2,2012-12-30 00:21:00,Днепр.м 43 позн.с д/ж *.о 067.16.34.576,2012-12-30 00:21:00,0,21,0
3,2014-11-28 00:31:00,КИЕВ ИЩУ Д/Ж ДО 45 МНЕ СЕЙЧАС СКУЧНО 093 629 9...,2014-11-28 00:31:00,0,31,0
4,2013-10-26 23:11:00,Зая я тебя никогда не обижу люблю тебя!) Даше,2013-10-26 23:11:00,23,11,0


Extracting time part only

In [68]:
time['time']=time['date'].dt.time
time.head()

Unnamed: 0,date,msg,time,hour,minutes,seconds
0,2013-12-15 00:50:00,ищу на сегодня мужика 37,00:50:00,0,50,0
1,2014-04-29 23:40:00,ПАРЕНЬ БИ ИЩЕТ ДРУГА СЕЙЧАС!! СМС ММС 0955532826,23:40:00,23,40,0
2,2012-12-30 00:21:00,Днепр.м 43 позн.с д/ж *.о 067.16.34.576,00:21:00,0,21,0
3,2014-11-28 00:31:00,КИЕВ ИЩУ Д/Ж ДО 45 МНЕ СЕЙЧАС СКУЧНО 093 629 9...,00:31:00,0,31,0
4,2013-10-26 23:11:00,Зая я тебя никогда не обижу люблю тебя!) Даше,23:11:00,23,11,0
