In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# This script takes in a single file, processes it, and splits it into a training/testing split based on the input provided by the user

## Loading the original train file into a Pandas dataframe

In [6]:
dataframe = pd.read_csv('./dataset/train.csv')

In [7]:
values = dataframe.values

## Take a Look at the dataframe 

In [9]:
dataframe.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Check for any null values

In [12]:
dataframe.isnull().any()

id               False
comment_text     False
toxic            False
severe_toxic     False
obscene          False
threat           False
insult           False
identity_hate    False
dtype: bool

### Looks like there were no null values, good to go!

## Now let's actually split the Dataframe using SKLearn!

#### Variabls to change the splitting

In [28]:
test_size = 0.2
seed = 42 # The answer to everything is 42, and so must our seed be!

In [29]:
data_train, data_test = train_test_split(dataframe, test_size=test_size, random_state=seed)

#### Verify that we split properly

In [30]:
data_train.shape

(127656, 8)

In [31]:
data_test.shape

(31915, 8)

## Save the new train test splits to csv file

In [33]:
data_train.to_csv("./dataset/train_new.csv")

In [34]:
data_test.to_csv("./dataset/test_new.csv")

### Test whether the new CSV files work fine

In [35]:
df_new_train = pd.read_csv("./dataset/train_new.csv")
df_new_test = pd.read_csv("./dataset/test_new.csv")

In [36]:
df_new_train.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,140030,ed56f082116dcbd0,Grandma Terri Should Burn in Trash \nGrandma T...,1,0,0,0,0,0
1,159124,f8e3cd98b63bf401,", 9 May 2009 (UTC)\nIt would be easiest if you...",0,0,0,0,0,0
2,60006,a09e1bcf10631f9a,"""\n\nThe Objectivity of this Discussion is dou...",0,0,0,0,0,0
3,65432,af0ee0066c607eb8,Shelly Shock\nShelly Shock is. . .( ),0,0,0,0,0,0
4,154979,b734772b1a807e09,I do not care. Refer to Ong Teng Cheong talk p...,0,0,0,0,0,0


In [37]:
df_new_test.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,119105,7ca72b5b9c688e9e,"Geez, are you forgetful! We've already discus...",0,0,0,0,0,0
1,131631,c03f72fd8f8bf54f,Carioca RFA \n\nThanks for your support on my ...,0,0,0,0,0,0
2,125326,9e5b8e8fc1ff2e84,"""\n\n Birthday \n\nNo worries, It's what I do ...",0,0,0,0,0,0
3,111256,5332799e706665a6,Pseudoscience category? \n\nI'm assuming that ...,0,0,0,0,0,0
4,83590,dfa7d8f0b4366680,"(and if such phrase exists, it would be provid...",0,0,0,0,0,0


In [38]:
df_new_test.shape

(31915, 9)

In [39]:
df_new_train.shape

(127656, 9)