In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import zipfile
import datetime
from sklearn.model_selection import train_test_split

%matplotlib inline
plt.style.use('fivethirtyeight')
import seaborn as sns
sns.set()
sns.set_context("talk")
import re

## Original Dataset

In [2]:
df = pd.read_csv('raw_master.csv', names=['subject', 'from', 'date', 'to', 'label', 'thread'], encoding='iso-8859-1')
df['date'] = df['date'].apply(lambda x: pd.to_datetime(x, errors='coerce'))
df = df[df['date'].notna()]
df = df[1:]
df

Unnamed: 0,subject,from,date,to,label,thread
2,#ERROR!,DailyDropout.fyi <dailydropout@substack.com>,2021-02-09 17:41:10+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1691240491013211129
3,#ERROR!,#ERROR!,2021-02-19 20:02:40+00:00,<tanishkumar@berkeley.edu>,"Inbox,Category Promotions,Unread",1692155584056927524
4,#ERROR!,neuvoo.com<job@neuvoo.com>,2021-02-08 13:38:06+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1691134489346807599
5,#ERROR!,Ladder Newsletter <remoteworkforstudents@subst...,2021-02-11 14:52:50+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1691411213622612924
6,Complete your setup with the Slack desktop app,"""Slack"" <no-reply@email.slackhq.com>",2021-01-21 11:02:05-06:00,<tanishkumar@berkeley.edu>,"Inbox,Important,Category Promotions,Unread",1689516575768603440
...,...,...,...,...,...,...
30532,Find out why Beefy-T Tees are all the rave!,"""Hanes.com"" <hanes@mail.hanes.com>",2021-02-23 07:11:30-05:00,noorgill@berkeley.edu,"Archived,Important,Opened,Category Promotions",1.69249E+18
30533,swimwear & sandals & beach towels & sunscreen,"""Urban Outfitters"" <urbanoutfitters@e.urbanout...",2021-03-17 16:04:08+00:00,"""noorgill@berkeley.edu"" <noorgill@berkeley.edu>","Trash,Category Promotions,Unread",1.6945E+18
30534,You want WHAT?! No problem.,"""Let's Roam"" <support@letsroamscavengerhunts.com>",2021-02-18 18:35:54+00:00,noorgill@berkeley.edu,"Inbox,Important,Category Promotions",1.69206E+18
30535,=?utf-8?B?V2FudCB0byBnZXQgbW9yZSBmcm9tIFVPIFJl...,"""UO Rewards"" <urbanoutfitters@e.urbanoutfitter...",2021-02-09 22:25:17+00:00,"""noorgill@berkeley.edu"" <noorgill@berkeley.edu>","Inbox,Important,Category Promotions",1.69126E+18


## Remove Duplicates

In [3]:
len(df.subject.unique())

18367

We will compare the length of the dataset with the number of unique elements. Both are 14752.

In [4]:
df = df.drop_duplicates(subset=['subject'])
df

Unnamed: 0,subject,from,date,to,label,thread
2,#ERROR!,DailyDropout.fyi <dailydropout@substack.com>,2021-02-09 17:41:10+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1691240491013211129
6,Complete your setup with the Slack desktop app,"""Slack"" <no-reply@email.slackhq.com>",2021-01-21 11:02:05-06:00,<tanishkumar@berkeley.edu>,"Inbox,Important,Category Promotions,Unread",1689516575768603440
13,Are you ready to play?,#ERROR!,2020-12-15 12:06:10-08:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1686176030552159651
14,"Gadabout - plus, Go behind the scenes of our n...",#ERROR!,2021-02-01 04:14:58-06:00,<tanishkumar@berkeley.edu>,"Inbox,Category Promotions,Unread",1690487487729257421
17,General Registration is now open for Winter/Sp...,Mission College <nadler@wvm.edu>,2020-12-07 13:45:42-05:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1685446190686767938
...,...,...,...,...,...,...
30531,How to pretend to be a morning person,"""Taco from Trello"" <taco@trello.com>",2020-10-01 15:46:29+00:00,"""noorgill"" <noorgill@berkeley.edu>","Inbox,Category Promotions",1.67936E+18
30532,Find out why Beefy-T Tees are all the rave!,"""Hanes.com"" <hanes@mail.hanes.com>",2021-02-23 07:11:30-05:00,noorgill@berkeley.edu,"Archived,Important,Opened,Category Promotions",1.69249E+18
30533,swimwear & sandals & beach towels & sunscreen,"""Urban Outfitters"" <urbanoutfitters@e.urbanout...",2021-03-17 16:04:08+00:00,"""noorgill@berkeley.edu"" <noorgill@berkeley.edu>","Trash,Category Promotions,Unread",1.6945E+18
30534,You want WHAT?! No problem.,"""Let's Roam"" <support@letsroamscavengerhunts.com>",2021-02-18 18:35:54+00:00,noorgill@berkeley.edu,"Inbox,Important,Category Promotions",1.69206E+18


## Remove Errors and Unreadable UTFs

The first type of error happens when the entire string can not be decoded properly, which will show "#ERROR!". We will clean all rows with such errors.

In [5]:
df = df[df['subject'].str.contains("\#ERROR\!")==False]
df = df[df['from'].str.contains("\#ERROR\!")==False]
df

Unnamed: 0,subject,from,date,to,label,thread
6,Complete your setup with the Slack desktop app,"""Slack"" <no-reply@email.slackhq.com>",2021-01-21 11:02:05-06:00,<tanishkumar@berkeley.edu>,"Inbox,Important,Category Promotions,Unread",1689516575768603440
17,General Registration is now open for Winter/Sp...,Mission College <nadler@wvm.edu>,2020-12-07 13:45:42-05:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1685446190686767938
25,A global bank that doubles as a cybersecurity ...,"""Tricia at WayUp"" <info@bb3.wayup.com>",2020-12-08 02:19:45+00:00,"""Tanish Kumar"" <tanishkumar@berkeley.edu>","Inbox,Opened,Category Promotions",1685474756691839098
27,The next biggest thing,DailyDropout.fyi <dailydropout@substack.com>,2021-01-29 23:09:33+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1690264506723567323
30,Read free for 30 days,Scribd <hello@hello.scribd.com>,2021-02-18 08:51:55+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1692022410445336472
...,...,...,...,...,...,...
30531,How to pretend to be a morning person,"""Taco from Trello"" <taco@trello.com>",2020-10-01 15:46:29+00:00,"""noorgill"" <noorgill@berkeley.edu>","Inbox,Category Promotions",1.67936E+18
30532,Find out why Beefy-T Tees are all the rave!,"""Hanes.com"" <hanes@mail.hanes.com>",2021-02-23 07:11:30-05:00,noorgill@berkeley.edu,"Archived,Important,Opened,Category Promotions",1.69249E+18
30533,swimwear & sandals & beach towels & sunscreen,"""Urban Outfitters"" <urbanoutfitters@e.urbanout...",2021-03-17 16:04:08+00:00,"""noorgill@berkeley.edu"" <noorgill@berkeley.edu>","Trash,Category Promotions,Unread",1.6945E+18
30534,You want WHAT?! No problem.,"""Let's Roam"" <support@letsroamscavengerhunts.com>",2021-02-18 18:35:54+00:00,noorgill@berkeley.edu,"Inbox,Important,Category Promotions",1.69206E+18


Another type of error happens on strings that contain emoticons, and they are usually decoded into a series of codes starting with either "UTF" or "utf". We will also clean all rows with this kind of codes.

In [6]:
df = df[df['subject'].str.contains('.*(UTF).*')==False]
df = df[df['subject'].str.contains('.*(utf).*')==False]
df

  return func(self, *args, **kwargs)
  return func(self, *args, **kwargs)


Unnamed: 0,subject,from,date,to,label,thread
6,Complete your setup with the Slack desktop app,"""Slack"" <no-reply@email.slackhq.com>",2021-01-21 11:02:05-06:00,<tanishkumar@berkeley.edu>,"Inbox,Important,Category Promotions,Unread",1689516575768603440
17,General Registration is now open for Winter/Sp...,Mission College <nadler@wvm.edu>,2020-12-07 13:45:42-05:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1685446190686767938
25,A global bank that doubles as a cybersecurity ...,"""Tricia at WayUp"" <info@bb3.wayup.com>",2020-12-08 02:19:45+00:00,"""Tanish Kumar"" <tanishkumar@berkeley.edu>","Inbox,Opened,Category Promotions",1685474756691839098
27,The next biggest thing,DailyDropout.fyi <dailydropout@substack.com>,2021-01-29 23:09:33+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1690264506723567323
30,Read free for 30 days,Scribd <hello@hello.scribd.com>,2021-02-18 08:51:55+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1692022410445336472
...,...,...,...,...,...,...
30528,"Now Hiring at Gilead Sciences, Daiichi Sankyo ...",BioSpace <biospace_noreply@biospace.com>,2020-10-28 12:16:34-05:00,noorgill@berkeley.edu,"Inbox,Category Promotions",1.68182E+18
30531,How to pretend to be a morning person,"""Taco from Trello"" <taco@trello.com>",2020-10-01 15:46:29+00:00,"""noorgill"" <noorgill@berkeley.edu>","Inbox,Category Promotions",1.67936E+18
30532,Find out why Beefy-T Tees are all the rave!,"""Hanes.com"" <hanes@mail.hanes.com>",2021-02-23 07:11:30-05:00,noorgill@berkeley.edu,"Archived,Important,Opened,Category Promotions",1.69249E+18
30533,swimwear & sandals & beach towels & sunscreen,"""Urban Outfitters"" <urbanoutfitters@e.urbanout...",2021-03-17 16:04:08+00:00,"""noorgill@berkeley.edu"" <noorgill@berkeley.edu>","Trash,Category Promotions,Unread",1.6945E+18


The last type of error happens on some special characters, as they are replaced with unicode replacement characters "ï¿½" when they are extracted from our mailboxes. We will clean the rows with these characters by replacing such characters with empty strings.

In [7]:
df['subject'] = df['subject'].str.replace('ï¿½', '')

## Feature Engineering

The first feature is the labels of the email addresses that sent out these emails. As suggested by our mentor, these labels convey what kind of impressions these promotional email senders want to leave on us, which may be related to the emotions in the subject lines.

In [8]:
df['name'] = df['from'].str.split('<').str[0]
df['name'] = df['name'].str.replace('"', '')
df

Unnamed: 0,subject,from,date,to,label,thread,name
6,Complete your setup with the Slack desktop app,"""Slack"" <no-reply@email.slackhq.com>",2021-01-21 11:02:05-06:00,<tanishkumar@berkeley.edu>,"Inbox,Important,Category Promotions,Unread",1689516575768603440,Slack
17,General Registration is now open for Winter/Sp...,Mission College <nadler@wvm.edu>,2020-12-07 13:45:42-05:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1685446190686767938,Mission College
25,A global bank that doubles as a cybersecurity ...,"""Tricia at WayUp"" <info@bb3.wayup.com>",2020-12-08 02:19:45+00:00,"""Tanish Kumar"" <tanishkumar@berkeley.edu>","Inbox,Opened,Category Promotions",1685474756691839098,Tricia at WayUp
27,The next biggest thing,DailyDropout.fyi <dailydropout@substack.com>,2021-01-29 23:09:33+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1690264506723567323,DailyDropout.fyi
30,Read free for 30 days,Scribd <hello@hello.scribd.com>,2021-02-18 08:51:55+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1692022410445336472,Scribd
...,...,...,...,...,...,...,...
30528,"Now Hiring at Gilead Sciences, Daiichi Sankyo ...",BioSpace <biospace_noreply@biospace.com>,2020-10-28 12:16:34-05:00,noorgill@berkeley.edu,"Inbox,Category Promotions",1.68182E+18,BioSpace
30531,How to pretend to be a morning person,"""Taco from Trello"" <taco@trello.com>",2020-10-01 15:46:29+00:00,"""noorgill"" <noorgill@berkeley.edu>","Inbox,Category Promotions",1.67936E+18,Taco from Trello
30532,Find out why Beefy-T Tees are all the rave!,"""Hanes.com"" <hanes@mail.hanes.com>",2021-02-23 07:11:30-05:00,noorgill@berkeley.edu,"Archived,Important,Opened,Category Promotions",1.69249E+18,Hanes.com
30533,swimwear & sandals & beach towels & sunscreen,"""Urban Outfitters"" <urbanoutfitters@e.urbanout...",2021-03-17 16:04:08+00:00,"""noorgill@berkeley.edu"" <noorgill@berkeley.edu>","Trash,Category Promotions,Unread",1.6945E+18,Urban Outfitters


The second group of features is about the time that these emails are sent. We converted the dates in the "date" column to datetime objects and extracted the exact times, the hours, and the months during which the emails are sent. The exact times are kept in the time zone that the receivers are in, which better match with the purposes of these emails.

In [9]:
df['date_utc'] = pd.to_datetime(df['date'], format="%Y%m%d %H:%M:%S.%f", utc=True)
df['time'] = [x.strftime("%H:%M:%S") for x in df['date']]
df['hour'] = [re.findall('[0-9]{2}',x)[0] for x in df['time']]
df['month'] = pd.DatetimeIndex(df['date_utc']).month
df

Unnamed: 0,subject,from,date,to,label,thread,name,date_utc,time,hour,month
6,Complete your setup with the Slack desktop app,"""Slack"" <no-reply@email.slackhq.com>",2021-01-21 11:02:05-06:00,<tanishkumar@berkeley.edu>,"Inbox,Important,Category Promotions,Unread",1689516575768603440,Slack,2021-01-21 17:02:05+00:00,11:02:05,11,1
17,General Registration is now open for Winter/Sp...,Mission College <nadler@wvm.edu>,2020-12-07 13:45:42-05:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1685446190686767938,Mission College,2020-12-07 18:45:42+00:00,13:45:42,13,12
25,A global bank that doubles as a cybersecurity ...,"""Tricia at WayUp"" <info@bb3.wayup.com>",2020-12-08 02:19:45+00:00,"""Tanish Kumar"" <tanishkumar@berkeley.edu>","Inbox,Opened,Category Promotions",1685474756691839098,Tricia at WayUp,2020-12-08 02:19:45+00:00,02:19:45,02,12
27,The next biggest thing,DailyDropout.fyi <dailydropout@substack.com>,2021-01-29 23:09:33+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1690264506723567323,DailyDropout.fyi,2021-01-29 23:09:33+00:00,23:09:33,23,1
30,Read free for 30 days,Scribd <hello@hello.scribd.com>,2021-02-18 08:51:55+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1692022410445336472,Scribd,2021-02-18 08:51:55+00:00,08:51:55,08,2
...,...,...,...,...,...,...,...,...,...,...,...
30528,"Now Hiring at Gilead Sciences, Daiichi Sankyo ...",BioSpace <biospace_noreply@biospace.com>,2020-10-28 12:16:34-05:00,noorgill@berkeley.edu,"Inbox,Category Promotions",1.68182E+18,BioSpace,2020-10-28 17:16:34+00:00,12:16:34,12,10
30531,How to pretend to be a morning person,"""Taco from Trello"" <taco@trello.com>",2020-10-01 15:46:29+00:00,"""noorgill"" <noorgill@berkeley.edu>","Inbox,Category Promotions",1.67936E+18,Taco from Trello,2020-10-01 15:46:29+00:00,15:46:29,15,10
30532,Find out why Beefy-T Tees are all the rave!,"""Hanes.com"" <hanes@mail.hanes.com>",2021-02-23 07:11:30-05:00,noorgill@berkeley.edu,"Archived,Important,Opened,Category Promotions",1.69249E+18,Hanes.com,2021-02-23 12:11:30+00:00,07:11:30,07,2
30533,swimwear & sandals & beach towels & sunscreen,"""Urban Outfitters"" <urbanoutfitters@e.urbanout...",2021-03-17 16:04:08+00:00,"""noorgill@berkeley.edu"" <noorgill@berkeley.edu>","Trash,Category Promotions,Unread",1.6945E+18,Urban Outfitters,2021-03-17 16:04:08+00:00,16:04:08,16,3


We also think that the use of punctuations, such as question marks and exclaimation marks may convey strong emotions in the subject lines, so we extracted the number of these punctuations used as well.

In [10]:
df['num_qm'] = df['subject'].str.count('\\?')
df['num_em'] = df['subject'].str.count('\\!')
df

Unnamed: 0,subject,from,date,to,label,thread,name,date_utc,time,hour,month,num_qm,num_em
6,Complete your setup with the Slack desktop app,"""Slack"" <no-reply@email.slackhq.com>",2021-01-21 11:02:05-06:00,<tanishkumar@berkeley.edu>,"Inbox,Important,Category Promotions,Unread",1689516575768603440,Slack,2021-01-21 17:02:05+00:00,11:02:05,11,1,0,0
17,General Registration is now open for Winter/Sp...,Mission College <nadler@wvm.edu>,2020-12-07 13:45:42-05:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1685446190686767938,Mission College,2020-12-07 18:45:42+00:00,13:45:42,13,12,0,1
25,A global bank that doubles as a cybersecurity ...,"""Tricia at WayUp"" <info@bb3.wayup.com>",2020-12-08 02:19:45+00:00,"""Tanish Kumar"" <tanishkumar@berkeley.edu>","Inbox,Opened,Category Promotions",1685474756691839098,Tricia at WayUp,2020-12-08 02:19:45+00:00,02:19:45,02,12,1,1
27,The next biggest thing,DailyDropout.fyi <dailydropout@substack.com>,2021-01-29 23:09:33+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1690264506723567323,DailyDropout.fyi,2021-01-29 23:09:33+00:00,23:09:33,23,1,0,0
30,Read free for 30 days,Scribd <hello@hello.scribd.com>,2021-02-18 08:51:55+00:00,tanishkumar@berkeley.edu,"Inbox,Category Promotions,Unread",1692022410445336472,Scribd,2021-02-18 08:51:55+00:00,08:51:55,08,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30528,"Now Hiring at Gilead Sciences, Daiichi Sankyo ...",BioSpace <biospace_noreply@biospace.com>,2020-10-28 12:16:34-05:00,noorgill@berkeley.edu,"Inbox,Category Promotions",1.68182E+18,BioSpace,2020-10-28 17:16:34+00:00,12:16:34,12,10,0,0
30531,How to pretend to be a morning person,"""Taco from Trello"" <taco@trello.com>",2020-10-01 15:46:29+00:00,"""noorgill"" <noorgill@berkeley.edu>","Inbox,Category Promotions",1.67936E+18,Taco from Trello,2020-10-01 15:46:29+00:00,15:46:29,15,10,0,0
30532,Find out why Beefy-T Tees are all the rave!,"""Hanes.com"" <hanes@mail.hanes.com>",2021-02-23 07:11:30-05:00,noorgill@berkeley.edu,"Archived,Important,Opened,Category Promotions",1.69249E+18,Hanes.com,2021-02-23 12:11:30+00:00,07:11:30,07,2,0,1
30533,swimwear & sandals & beach towels & sunscreen,"""Urban Outfitters"" <urbanoutfitters@e.urbanout...",2021-03-17 16:04:08+00:00,"""noorgill@berkeley.edu"" <noorgill@berkeley.edu>","Trash,Category Promotions,Unread",1.6945E+18,Urban Outfitters,2021-03-17 16:04:08+00:00,16:04:08,16,3,0,0


The last feature, even though it is minor compared to others, may also help with predictions. We think the use of hashtags may also expresses specific emotions, so we set whether the subject line contains a hashtag as another feature.

In [11]:
df['hashtag'] = df['subject'].str.contains(r'#[a-zA-z]').astype(int)
df[df['hashtag'] == 1]

Unnamed: 0,subject,from,date,to,label,thread,name,date_utc,time,hour,month,num_qm,num_em,hashtag
463,Let's get you #hired.,"""The WayUp Team"" <info@bb3.wayup.com>",2020-11-26 01:15:32+00:00,""""" <tanishkumar@berkeley.edu>","Inbox,Category Promotions,Unread",1684383552631345154,The WayUp Team,2020-11-26 01:15:32+00:00,01:15:32,01,11,0,0,1
1406,Now Available: #FauxFilter Luminous Matte Foun...,Huda Beauty <hello@hudabeauty.com>,2021-01-24 20:47:41+00:00,"""noorgill08@gmail.com"" <noorgill08@gmail.com>","Inbox,Category Promotions",1689802518682241553,Huda Beauty,2021-01-24 20:47:41+00:00,20:47:41,20,1,0,0,1
2264,#StyleWithBITE: Upswing Full-Volume Mascara ed...,BITE Beauty <noreply@bitebeauty.com>,2021-01-08 23:31:08+00:00,"""noorgill08@gmail.com"" <noorgill08@gmail.com>","Archived,Opened,Category Promotions",1688363255481334067,BITE Beauty,2021-01-08 23:31:08+00:00,23:31:08,23,1,0,0,1
2334,Ready for more #FauxFilter?,Huda Beauty <hello@hudabeauty.com>,2021-01-19 18:05:16+00:00,"""noorgill08@gmail.com"" <noorgill08@gmail.com>","Inbox,Category Promotions",1689339315977285552,Huda Beauty,2021-01-19 18:05:16+00:00,18:05:16,18,1,1,0,1
4072,Happy #CyberMonday! 35% off SITEWIDE,BITE Beauty <noreply@bitebeauty.com>,2020-11-30 13:46:23+00:00,"""noorgill08@gmail.com"" <noorgill08@gmail.com>","Inbox,Category Promotions",1684793183123885193,BITE Beauty,2020-11-30 13:46:23+00:00,13:46:23,13,11,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26183,"#Trending PATRICK TA, HUDA BEAUTY & ONES/SIZE ...",Sephora <shop@beauty.sephora.com>,2021-01-02 17:21:30+00:00,noorgill@berkeley.edu,"Archived,Opened,Category Promotions",1.6878E+18,Sephora,2021-01-02 17:21:30+00:00,17:21:30,17,1,0,0,1
27234,#SHEIN 1111 is on: FREE SHIPPING for Singles' Day,=?UTF-8?B?U0hFSU4=?= <shein@market.sheinmail.com>,2020-11-11 08:40:49-08:00,noorgill@berkeley.edu,"Archived,Opened,Category Promotions",1.68308E+18,=?UTF-8?B?U0hFSU4=?=,2020-11-11 16:40:49+00:00,08:40:49,08,11,0,0,1
27870,Will you answer the call #turntocold?,"""Hanes.com"" <hanes@mail.hanes.com>",2021-04-02 15:55:17-04:00,noorgill@berkeley.edu,"Trash,Category Promotions,Unread",1.69596E+18,Hanes.com,2021-04-02 19:55:17+00:00,15:55:17,15,4,1,0,1
28224,Head to toe in BDG | #BDG365,"""Urban Outfitters"" <urbanoutfitters@e.urbanout...",2020-06-09 23:11:02+00:00,"""noorgill@berkeley.edu"" <noorgill@berkeley.edu>","Inbox,Important,Opened,Category Promotions",1.66906E+18,Urban Outfitters,2020-06-09 23:11:02+00:00,23:11:02,23,6,0,0,1


## New Dataset

In [12]:
df.to_csv('Cleaned Dataset.csv', index=False)

## Train Test Split

In [13]:
df_train, df_test = train_test_split(df, test_size=0.3, random_state=0)
df_train.to_csv('Training Set.csv', index=False)
df_test.to_csv('Test Set.csv', index=False)