In [1]:
import pandas as pd
import nltk
import numpy as np
import re

In [2]:
path = 'cleaned_data.tsv'


In [3]:
df = pd.read_csv(path,delimiter = '\t', quoting = 3)
df.tail()

Unnamed: 0,Transcripts,AD
3267,the mother's standing there doing the dishes,0
3268,she's washing the dishes looking out the open ...,0
3269,and the water's runnin(g) down over the sink o...,0
3270,and <there are> [//] she's dryin(g) a dish,0
3271,summer of the year,0


In [4]:
df.head()

Unnamed: 0,Transcripts,AD
0,there's &um a young boy that's getting a cooki...,1
1,and it he's uh in bad shape because uh the thi...,1
2,and in the picture the mother is washing dishe...,1
3,and the dishes might get falled over if you don't,1
4,fell fall over there there if you don't get it,1


In [5]:
df.shape

(3272, 2)

In [6]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dellpc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
customStopwords = {'a','are','about','above','after','again','against','ain','all','am','an','any','aren',"aren't",'as',
                   'at','be','because','been','before','being','below','between','both','but','by',
                   'can','couldn',"couldn't",'d','didn',"didn't",'do','does','doesn',"doesn't",'doing','don',"don't",
                   'down','during','each','few','for','from','further','had','hadn',"hadn't",'has','hasn',"hasn't",
                   'have','haven',"haven't",'having','he','her','here','hers','herself','him','himself','his','how', 'i','if', 
                   'in', 'into','is','isn',"isn't", "it's",'its','itself','just','ll','m','ma','me','mightn',"mightn't",
                   'more','most','mustn',"mustn't",'my','myself','needn',"needn't",'no','nor','not','now', 'o','of',
                   'off','on','once','only','or','other','our','ours','ourselves','out','over','own','re','s','same',
                   'shan',"shan't",'she',"she's",'should',"should've",'shouldn',"shouldn't",'so','some','such','t','than','that',"that'll",
                   'the','there',"there's",'their','theirs','them','themselves','then','these','they','this','those','through','to','too',
                   'under','until','up','ve','very','was','wasn',"wasn't",'we','were','weren',"weren't",'what','when',
                   'where','which','while','who','whom','why','will','with','won',"won't",'wouldn', "wouldn't",'y',
                   'you',"you'd","you'll","you're","you've",'your','yours','yourself','yourselves'}

In [8]:
preprocessed_data = []
def preprocess(dataframe):
    for i in range(dataframe.shape[0]):
        df = re.sub('[^a-zA-Z]',' ',dataframe.iloc[i][0]) ##remove all the characters except alphabets (E.g. &, *, #)
        df = df.lower() ## convert all the sentences to lower case
        df = df.split() ##tokenize each word
        df = [word for word in df if word not in customStopwords] ##removing stopwords
        lemmatizer = WordNetLemmatizer()
        df = [lemmatizer.lemmatize(word) for word in df] ##performed lemmatization 
        df = ' '.join(df)
        preprocessed_data.append(df)
    return preprocessed_data

In [9]:
preprocessed_data = preprocess(df)
preprocessed_data

['um young boy getting cookie jar',
 'and it uh bad shape uh thing',
 'and picture mother washing dish and see it',
 'and dish might get falled',
 'fell fall get it',
 'and it it picture kitchen window',
 'and curtain uh distinct',
 'water flow still flowing',
 'young boy uh going cookie jar',
 'and lit girl young girl',
 'and saying boy hard it',
 'hardly hard tell anymore',
 'uh and c cookie jar',
 'and stool and it already starting fall',
 'and water sink uh ev overflowing sink',
 'hm know hickey',
 'whether said',
 'uh like it uh wife g mean uh mother near girl',
 'and uh w uh h uh',
 'oh uh think',
 'uh trying wipe uh wipe dish',
 'oh and stop water going',
 'cookie jar',
 'and lid cookie jar',
 'boy come floor',
 'and girl',
 'know much girl',
 'anyway uh housewife kitchen',
 'and sink overflowing',
 'and girl may saying say something like',
 'and uh guess must wife although it might',
 'maybe apron and um maybe um',
 'window',
 'upper one',
 'uh it um pathway',
 'uh know pathway

In [10]:
preprocessed_data = pd.DataFrame(data=preprocessed_data,columns=['Transcript']) ##create dataframe of preprocessed data
preprocessed_data.shape

(3272, 1)

In [11]:
preprocessed_data.replace('',np.nan,inplace=True)
preprocessed_data

Unnamed: 0,Transcript
0,um young boy getting cookie jar
1,and it uh bad shape uh thing
2,and picture mother washing dish and see it
3,and dish might get falled
4,fell fall get it
5,and it it picture kitchen window
6,and curtain uh distinct
7,water flow still flowing
8,young boy uh going cookie jar
9,and lit girl young girl


In [12]:
preprocessed_data['AD'] = df['AD'].values
preprocessed_data

Unnamed: 0,Transcript,AD
0,um young boy getting cookie jar,1
1,and it uh bad shape uh thing,1
2,and picture mother washing dish and see it,1
3,and dish might get falled,1
4,fell fall get it,1
5,and it it picture kitchen window,1
6,and curtain uh distinct,1
7,water flow still flowing,1
8,young boy uh going cookie jar,1
9,and lit girl young girl,1


In [13]:
preprocessed_data.isna().sum() ##check if the dataset contains NaN values

Transcript    29
AD             0
dtype: int64

In [14]:
preprocessed_data.dropna(inplace=True) ##drop NaN value
preprocessed_data.isna().sum()

Transcript    0
AD            0
dtype: int64

In [15]:
from pickle import dump
filename='preprocessed_data.pkl'
def save_preprocess_data(df,filename): ##Function to save the preprocessed data
    dump(df,open(filename,'wb'))
    print('Preprocessed data is saved :) :%s' % filename)  
save_preprocess_data(preprocessed_data,filename)

Preprocessed data is saved :) :preprocessed_data.pkl
