In [1]:
import numpy as np
import pandas as pd

### Loading CSV file data into data variable

In [12]:
# First we are reading the Data
data=pd.read_csv("Book2.csv")
print(data.head())

                                                text      player
0      I thought the King had more affected the Duke        KENT
1                           of Albany than Cornwall.        KENT
2            It did always seem so to us, but now in  GLOUCESTER
3  the division of the kingdom, for equalities ar...  GLOUCESTER
4  weighed that curiosity in neither can make choice  GLOUCESTER


### Adding another column "len" which indicates length of each text

In [13]:
data['len']=data['text'].apply(lambda x: len(x))
print(data.head())



                                                text      player  len
0      I thought the King had more affected the Duke        KENT   45
1                           of Albany than Cornwall.        KENT   24
2            It did always seem so to us, but now in  GLOUCESTER   40
3  the division of the kingdom, for equalities ar...  GLOUCESTER   50
4  weighed that curiosity in neither can make choice  GLOUCESTER   49


In [14]:
# to verify the values are correct or not, let's try doing a manual check
print(data["text"][0],"--", len(data["text"][0]))
print(data["text"][1],"--", len(data["text"][1]))
# values are same. so let's continue cleaning the text data

I thought the King had more affected the Duke -- 45
of Albany than Cornwall. -- 24


### Cleaning the data

In [15]:
data['text']=data['text'].apply(lambda x:x.strip())
data['len']=data['text'].apply(lambda x: len(x))
print(data.head())

                                                text      player  len
0      I thought the King had more affected the Duke        KENT   45
1                           of Albany than Cornwall.        KENT   24
2            It did always seem so to us, but now in  GLOUCESTER   39
3  the division of the kingdom, for equalities ar...  GLOUCESTER   50
4  weighed that curiosity in neither can make choice  GLOUCESTER   49


#### I am assuming, tidying data means, performing data cleaning to the given text. 

### So removing punctuation marks, performing tokenization and  removing stopwords

In [16]:
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [17]:
print("puctuation marks are : \n",punctuation)
def removepunct(x):
    new=""
    for each in x:
        if each not in punctuation:
            new+= each
    return new
            
data['text']=data['text'].apply(lambda x: removepunct(x))
data['len']=data['text'].apply(lambda x: len(x))
print(data.head())

puctuation marks are : 
 !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
                                                text      player  len
0      I thought the King had more affected the Duke        KENT   45
1                            of Albany than Cornwall        KENT   23
2             It did always seem so to us but now in  GLOUCESTER   38
3  the division of the kingdom for equalities are so  GLOUCESTER   49
4  weighed that curiosity in neither can make choice  GLOUCESTER   49


### Now, let's perform tokenization. It splits text/strings into words.

In [18]:
# see that we have removed .,? in text above, so len values have changed in 2,3,4 rows.
#now,

data['text']=data['text'].apply(lambda x: word_tokenize(x))
data['len']=data['text'].apply(lambda x: len(x))
print(data.head())


                                                text      player  len
0  [I, thought, the, King, had, more, affected, t...        KENT    9
1                       [of, Albany, than, Cornwall]        KENT    4
2  [It, did, always, seem, so, to, us, but, now, in]  GLOUCESTER   10
3  [the, division, of, the, kingdom, for, equalit...  GLOUCESTER    9
4  [weighed, that, curiosity, in, neither, can, m...  GLOUCESTER    8


### Lastly, let's remove frequently occuring stopwords

In [19]:
stopw=stopwords.words('english')
def removestopword(x):
    l=[]
    for each in x:
        if each in stopw:
            pass
        else:
            l.append(each)
    return l
            

data['text']=data['text'].apply(lambda x: removestopword(x))
data['len']=data['text'].apply(lambda x: len(x))
print(data.head())

                                          text      player  len
0           [I, thought, King, affected, Duke]        KENT    5
1                           [Albany, Cornwall]        KENT    2
2                       [It, always, seem, us]  GLOUCESTER    4
3              [division, kingdom, equalities]  GLOUCESTER    3
4  [weighed, curiosity, neither, make, choice]  GLOUCESTER    5


In [20]:
### So, Finally text has been cleaned successfully. let's remove the length column
data=data.drop(['len'],axis=1)
print(data.head())

                                          text      player
0           [I, thought, King, affected, Duke]        KENT
1                           [Albany, Cornwall]        KENT
2                       [It, always, seem, us]  GLOUCESTER
3              [division, kingdom, equalities]  GLOUCESTER
4  [weighed, curiosity, neither, make, choice]  GLOUCESTER
