In [2]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
import re
from string import punctuation

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
data = pd.read_csv('All_Data.csv')

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Transcript
0,0,1. Introduction and Scope,PATRICK WINSTON: Welcome\nto 6034. I don't kno...
1,1,2. Reasoning: Goal Trees and Problem Solving,"What we're going to\ntalk about today, is goal..."
2,2,3. Reasoning: Goal Trees and Rule-Based Expert...,PROFESSOR PATRICK WINSTON:\nLadies and gentlem...
3,3,"4. Search: Depth-First, Hill Climbing, Beam",PATRICK WINSTON: Today\nwe're going to be talk...
4,4,"5. Search: Optimal, Branch and Bound, A*","PROFESSOR: It was written about\nRoute 66, whi..."


In [6]:
data.drop('Unnamed: 0',axis=1,inplace=True)

In [7]:
data.head()

Unnamed: 0,Title,Transcript
0,1. Introduction and Scope,PATRICK WINSTON: Welcome\nto 6034. I don't kno...
1,2. Reasoning: Goal Trees and Problem Solving,"What we're going to\ntalk about today, is goal..."
2,3. Reasoning: Goal Trees and Rule-Based Expert...,PROFESSOR PATRICK WINSTON:\nLadies and gentlem...
3,"4. Search: Depth-First, Hill Climbing, Beam",PATRICK WINSTON: Today\nwe're going to be talk...
4,"5. Search: Optimal, Branch and Bound, A*","PROFESSOR: It was written about\nRoute 66, whi..."


## Text Pre-Processing

#### Converting Data To LowerCase

In [8]:
data['Transcript'] = data['Transcript'].apply(lambda x: ' '.join(x.lower() for x in x.split()))

In [9]:
data.head()

Unnamed: 0,Title,Transcript
0,1. Introduction and Scope,patrick winston: welcome to 6034. i don't know...
1,2. Reasoning: Goal Trees and Problem Solving,"what we're going to talk about today, is goals..."
2,3. Reasoning: Goal Trees and Rule-Based Expert...,professor patrick winston: ladies and gentleme...
3,"4. Search: Depth-First, Hill Climbing, Beam",patrick winston: today we're going to be talki...
4,"5. Search: Optimal, Branch and Bound, A*","professor: it was written about route 66, whic..."


#### StopWord Removal

In [10]:
STOPWORDS = set(stopwords.words('english'))
punctuation = list(punctuation)
STOPWORDS.update(punctuation)
punctuation

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

In [11]:
len(data['Transcript'][0])

34890

In [12]:
data['Transcript_Without_Stopword']=data['Transcript'].apply(lambda x:' '.join([word for word in x.split() if word not in (STOPWORDS)]))

In [13]:
len(data['Transcript_Without_Stopword'][0])

22053

### Lemmatization

In [14]:
from nltk.stem import WordNetLemmatizer

In [15]:
ps = WordNetLemmatizer()

In [16]:
data['Transcript_Lemmantized'] = data['Transcript_Without_Stopword'].apply(lambda x: ' '.join([ps.lemmatize(word) for word in x.split()]))

In [17]:
data.head()

Unnamed: 0,Title,Transcript,Transcript_Without_Stopword,Transcript_Lemmantized
0,1. Introduction and Scope,patrick winston: welcome to 6034. i don't know...,patrick winston: welcome 6034. know deal micro...,patrick winston: welcome 6034. know deal micro...
1,2. Reasoning: Goal Trees and Problem Solving,"what we're going to talk about today, is goals...","we're going talk today, goals. way little warm...","we're going talk today, goals. way little warm..."
2,3. Reasoning: Goal Trees and Rule-Based Expert...,professor patrick winston: ladies and gentleme...,"professor patrick winston: ladies gentlemen, e...","professor patrick winston: lady gentlemen, eng..."
3,"4. Search: Depth-First, Hill Climbing, Beam",patrick winston: today we're going to be talki...,patrick winston: today we're going talking sea...,patrick winston: today we're going talking sea...
4,"5. Search: Optimal, Branch and Bound, A*","professor: it was written about route 66, whic...","professor: written route 66, used main highway...","professor: written route 66, used main highway..."


In [18]:
data['Transcript_Without_Stopword'][0]

'patrick winston: welcome 6034. know deal microphone. we\'ll see happens. going good year. we\'ve got [inaudible] bunch interesting people. always interesting see people named children two decades ago. find overwhelmed emilys. many peters, pauls, marys, enough call forth suitable song point. lots jesses genders. [inaudible] genders. duncan, where\'s duncan? are, duncan. changed hairstyle. want assure use thane cawdor taking course semester. i\'m going tell artificial intelligence today, subject about. there\'s 10% percent turnover roster last 24 hours. expect another 10% turnover next 24 hours, too. know many sightseers, wanting know something want do. i\'m going tell we\'re going semester, know get here. i\'m going walk outline. i\'m going start talking artificial intelligence is, it. i\'ll give little bit history artificial intelligence, conclude covenants run course. one laptops, please. i\'ll explain covenants end. it? well, must something thinking. let\'s start here, definition ar

In [19]:
data['Transcript_Lemmantized'][0]

'patrick winston: welcome 6034. know deal microphone. we\'ll see happens. going good year. we\'ve got [inaudible] bunch interesting people. always interesting see people named child two decade ago. find overwhelmed emilys. many peters, pauls, marys, enough call forth suitable song point. lot jesses genders. [inaudible] genders. duncan, where\'s duncan? are, duncan. changed hairstyle. want assure use thane cawdor taking course semester. i\'m going tell artificial intelligence today, subject about. there\'s 10% percent turnover roster last 24 hours. expect another 10% turnover next 24 hours, too. know many sightseers, wanting know something want do. i\'m going tell we\'re going semester, know get here. i\'m going walk outline. i\'m going start talking artificial intelligence is, it. i\'ll give little bit history artificial intelligence, conclude covenant run course. one laptops, please. i\'ll explain covenant end. it? well, must something thinking. let\'s start here, definition artificia

In [20]:
data.to_csv('pre-processed_data.csv')